# Phase 2: Feature Engineering Validation

This notebook validates the feature engineering pipeline implemented in `tweet_classifier`.

**Validates:**
1. Data loading and column verification
2. Reliable sample filtering
3. Hash-based splitting with leakage verification
4. Categorical encoding mappings
5. TweetDataset creation with tokenizer
6. Numerical feature scaling
7. Class weight computation
8. DataLoader batch iteration


## Cell 1: Load Data and Verify Columns


In [None]:
# Cell 1: load the enriched dataset and fail fast if any column this notebook
# relies on is absent.
import sys
# Relative entry: resolved against the kernel's current working directory.
sys.path.insert(0, "../src")

from tweet_classifier.data.loader import load_enriched_data, get_data_summary
from tweet_classifier.config import (
    TARGET_COLUMN,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    EXCLUDED_FROM_FEATURES,
    TEXT_COLUMN,
)

print("=" * 60)
print("CELL 1: Load Data and Verify Columns")
print("=" * 60)

# Load data
df = load_enriched_data()
print(f"\n✓ Loaded {len(df)} rows from enriched dataset")

# Columns checked here: target, text, the split/filter keys, and every
# configured numerical and categorical feature.
required_cols = [TARGET_COLUMN, TEXT_COLUMN, "tweet_hash", "is_reliable_label"]
required_cols.extend(NUMERICAL_FEATURES)
required_cols.extend(CATEGORICAL_FEATURES)

print("\n--- Required Columns Check ---")
missing_cols = []
for col in required_cols:
    present = col in df.columns
    status = "✓" if present else "✗ MISSING"
    print(f"  {status} {col}")
    if not present:
        missing_cols.append(col)

if missing_cols:
    print("\n✗ Some required columns missing!")
else:
    print("\n✓ All required columns present")

# Stop here instead of letting a later cell fail with an unrelated KeyError.
assert not missing_cols, f"Required columns missing from dataset: {missing_cols}"

# Show data summary
summary = get_data_summary(df)
print("\n--- Data Summary ---")
for key, value in summary.items():
    print(f"  {key}: {value}")


## Cell 2: Filter to Reliable Samples


In [None]:
# Cell 2: keep only reliable rows with a target, then assert the filter's
# two post-conditions (no missing targets, no unreliable rows).
from tweet_classifier.data.loader import filter_reliable

print("=" * 60)
print("CELL 2: Filter to Reliable Samples")
print("=" * 60)

# Filter data
df_reliable = filter_reliable(df, drop_missing_target=True)

n_before = len(df)
n_after = len(df_reliable)
n_removed = n_before - n_after
# Guard the percentage so an empty input frame reports 0.0% instead of
# raising ZeroDivisionError.
pct_removed = 100 * n_removed / n_before if n_before else 0.0

print(f"\nBefore filtering: {n_before} samples")
print(f"After filtering:  {n_after} samples")
print(f"Removed: {n_removed} samples ({pct_removed:.1f}%)")

# Verify no missing targets
missing_targets = df_reliable[TARGET_COLUMN].isna().sum()
print(f"\nMissing targets after filter: {missing_targets}")
assert missing_targets == 0, "Filter should have removed all missing targets!"
print("✓ No missing targets in filtered data")

# Verify all samples are reliable.
# Cast to bool first: on an integer or object-dtype column `~` is a bitwise
# NOT (e.g. ~1 == -2), which would make the count below nonzero even when
# every row is flagged reliable.
unreliable = (~df_reliable["is_reliable_label"].astype(bool)).sum()
print(f"Unreliable samples after filter: {unreliable}")
assert unreliable == 0, "Filter should have removed all unreliable samples!"
print("✓ All samples are reliable")


## Cell 3: Split by Hash, Verify No Leakage


In [None]:
# Cell 3: split the reliable samples into train/val/test by hash, print the
# split sizes, and abort the notebook if the splits leak into each other.
from tweet_classifier.data.splitter import split_by_hash, get_split_summary, verify_no_leakage

print("=" * 60)
print("CELL 3: Split by Hash, Verify No Leakage")
print("=" * 60)

# Split data
df_train, df_val, df_test = split_by_hash(df_reliable)

# Get summary
split_summary = get_split_summary(df_train, df_val, df_test)

print("\n--- Split Summary ---")
print(f"{'Set':<10} {'Samples':>10} {'Percentage':>12} {'Unique Hashes':>15}")
print("-" * 50)
for split_name in ["train", "val", "test"]:
    s = split_summary[split_name]
    print(f"{split_name:<10} {s['samples']:>10} {s['percentage']:>11.1f}% {s['unique_hashes']:>15}")
print("-" * 50)
print(f"{'Total':<10} {split_summary['total']:>10}")

# Verify no leakage
print("\n--- Leakage Verification ---")
try:
    verify_no_leakage(df_train, df_val, df_test)
except ValueError as e:
    print(f"✗ LEAKAGE DETECTED: {e}")
    # Re-raise so "Run All" stops here; otherwise the remaining cells would
    # run on leaky splits and the notebook would still end with a success
    # banner.
    raise
else:
    print("✓ No hash overlap between train/val/test splits")
    print("✓ No text leakage detected")


## Cell 4: Create Categorical Encodings, Verify Mappings


In [None]:
# Cell 4: build author/category index mappings from the training split and
# confirm that applying them adds the expected index columns.
from tweet_classifier.dataset import create_categorical_encodings, encode_categorical

print("=" * 60)
print("CELL 4: Create Categorical Encodings, Verify Mappings")
print("=" * 60)

# Create encodings from training data only (to avoid leakage)
encodings = create_categorical_encodings(df_train)

print("\n--- Author Encoding ---")
print(f"Number of unique authors: {encodings['num_authors']}")
print("\nTop 5 author mappings:")
# "Top 5" here means the first five entries in the mapping's iteration order.
for author, idx in list(encodings['author_to_idx'].items())[:5]:
    print(f"  {idx}: {author}")

print("\n--- Category Encoding ---")
print(f"Number of unique categories: {encodings['num_categories']}")
print("\nCategory mappings:")
for category, idx in encodings['category_to_idx'].items():
    print(f"  {idx}: {category}")

# Apply encodings to train set
df_train_encoded = encode_categorical(
    df_train,
    encodings['author_to_idx'],
    encodings['category_to_idx'],
    handle_unknown='default'
)

print("\n--- Verification ---")
# Assert before reporting: previously a missing column still printed a "✓"
# next to "False" and only failed later on the column selection below.
for idx_col in ("author_idx", "category_idx"):
    assert idx_col in df_train_encoded.columns, f"encode_categorical did not add '{idx_col}'"
print(f"✓ author_idx column added: {'author_idx' in df_train_encoded.columns}")
print(f"✓ category_idx column added: {'category_idx' in df_train_encoded.columns}")
print("\nSample encoded rows:")
print(df_train_encoded[['author', 'author_idx', 'category', 'category_idx']].head(3).to_string())


## Cell 5: Create TweetDataset with Tokenizer, Verify Shapes


In [None]:
# Cell 5: load the tokenizer, build the training dataset (fitting a new
# scaler), and assert the per-sample tensor shapes.
from transformers import BertTokenizer
from tweet_classifier.dataset import create_dataset_from_df
from tweet_classifier.config import FINBERT_MODEL_NAME, MAX_TEXT_LENGTH

print("=" * 60, "CELL 5: Create TweetDataset with Tokenizer, Verify Shapes", "=" * 60, sep="\n")

# Tokenizer
print(f"\nLoading tokenizer: {FINBERT_MODEL_NAME}")
tokenizer = BertTokenizer.from_pretrained(FINBERT_MODEL_NAME)
print(f"✓ Tokenizer loaded (vocab size: {tokenizer.vocab_size})")

# Training dataset; no scaler is passed in, so one is fitted and returned.
print(f"\nCreating TweetDataset from {len(df_train)} training samples...")
author_lookup = encodings['author_to_idx']
category_lookup = encodings['category_to_idx']
train_dataset, scaler = create_dataset_from_df(
    df_train,
    tokenizer,
    author_lookup,
    category_lookup,
    scaler=None,
    fit_scaler=True,
)

print(f"✓ Dataset created with {len(train_dataset)} samples")

# Show the shape of every tensor in the first sample.
print("\n--- Tensor Shapes (single sample) ---")
sample = train_dataset[0]
for field_name, field_tensor in sample.items():
    print(f"  {field_name}: {field_tensor.shape}")

# Check each shape against the configured sizes.
print("\n--- Shape Verification ---")
n_numerical = len(NUMERICAL_FEATURES)

assert sample['input_ids'].shape[0] == MAX_TEXT_LENGTH, f"Expected {MAX_TEXT_LENGTH} tokens"
print(f"✓ input_ids: [{MAX_TEXT_LENGTH}] (max_length={MAX_TEXT_LENGTH})")

assert sample['attention_mask'].shape[0] == MAX_TEXT_LENGTH
print(f"✓ attention_mask: [{MAX_TEXT_LENGTH}]")

assert sample['numerical'].shape[0] == n_numerical
print(f"✓ numerical: [{n_numerical}] ({NUMERICAL_FEATURES})")

# The remaining fields must be zero-dimensional tensors.
scalar_checks = (
    ('author_idx', "✓ author_idx: scalar"),
    ('category_idx', "✓ category_idx: scalar"),
    ('labels', "✓ labels: scalar (0=SELL, 1=HOLD, 2=BUY)"),
)
for scalar_field, ok_message in scalar_checks:
    assert sample[scalar_field].shape == ()
    print(ok_message)


## Cell 6: Verify Numerical Features are Scaled Correctly


In [None]:
# Cell 6: report per-feature statistics of the scaled numerical tensor and
# warn (without failing) when they drift from mean 0 / std 1.
import numpy as np

print("=" * 60, "CELL 6: Verify Numerical Features are Scaled Correctly", "=" * 60, sep="\n")

# Full numerical matrix held by the dataset, as a NumPy array.
all_numerical = train_dataset.numerical.numpy()

print("\n--- Numerical Features Statistics ---")
print(f"Shape: {all_numerical.shape} (samples x features)")

print(f"\n{'Feature':<25} {'Mean':>10} {'Std':>10} {'Min':>10} {'Max':>10}")
print("-" * 70)
for i in range(len(NUMERICAL_FEATURES)):
    feat_name = NUMERICAL_FEATURES[i]
    col = all_numerical[:, i]
    col_stats = (col.mean(), col.std(), col.min(), col.max())
    print(f"{feat_name:<25} " + " ".join(f"{stat:>10.4f}" for stat in col_stats))

# Soft check: tolerances are 0.1 on the means and 0.2 on the stds.
print("\n--- Scaling Verification ---")
feature_means = all_numerical.mean(axis=0)
feature_stds = all_numerical.std(axis=0)
mean_close_to_zero = np.allclose(feature_means, 0, atol=0.1)
std_close_to_one = np.allclose(feature_stds, 1, atol=0.2)

print(
    "✓ Feature means are close to 0 (StandardScaler applied)"
    if mean_close_to_zero
    else "⚠️  Feature means deviate from 0"
)
print(
    "✓ Feature stds are close to 1 (StandardScaler applied)"
    if std_close_to_one
    else "⚠️  Feature stds deviate from 1"
)

# Show the fitted scaler parameters.
print("\n--- Scaler Parameters ---")
print(f"Scaler mean_: {scaler.mean_}")
print(f"Scaler scale_: {scaler.scale_}")


## Cell 7: Verify Class Weights Match Expected Distribution


In [None]:
# Cell 7: print the class distribution with its weights and check that the
# rarest class receives the largest weight.
from tweet_classifier.data.weights import compute_class_weights, get_weight_summary, weights_to_tensor
from tweet_classifier.config import LABEL_MAP_INV

print("=" * 60, "CELL 7: Verify Class Weights Match Expected Distribution", "=" * 60, sep="\n")

# Per-class count/weight summary for the training targets.
train_targets = df_train[TARGET_COLUMN]
weight_summary = get_weight_summary(train_targets)

print("\n--- Class Distribution and Weights ---")
print(f"{'Class':<10} {'Count':>10} {'Percentage':>12} {'Weight':>10}")
print("-" * 45)
total = 0
for entry in weight_summary.values():
    total += entry['count']
for label, info in weight_summary.items():
    if total > 0:
        pct = 100 * info['count'] / total
    else:
        pct = 0
    print(f"{label:<10} {info['count']:>10} {pct:>11.1f}% {info['weight']:>10.4f}")

# Weights as an array and as a tensor.
weights = compute_class_weights(df_train[TARGET_COLUMN])
weights_tensor = weights_to_tensor(weights)

print("\n--- Weight Verification ---")
print(f"Computed weights (numpy): {weights}")
print(f"Computed weights (tensor): {weights_tensor}")

# Inverse relationship: the class with the fewest samples should carry the
# largest weight. Counts are looked up by class index 0..2 via LABEL_MAP_INV.
class_counts = []
for class_idx in range(3):
    class_counts.append(weight_summary[LABEL_MAP_INV[class_idx]]['count'])
min_class_idx = np.argmin(class_counts)
max_weight_idx = np.argmax(weights)

minority_has_top_weight = min_class_idx == max_weight_idx
if not minority_has_top_weight:
    print("\n⚠️  Weight assignment may be incorrect")
else:
    print(f"\n✓ Minority class ({LABEL_MAP_INV[min_class_idx]}) has highest weight ({weights[min_class_idx]:.4f})")

# Balanced weights are expected to average to roughly 1.
mean_weight = weights.mean()
print(f"\nMean weight: {mean_weight:.4f} (should be ~1.0 for balanced weights)")


## Cell 8: Test DataLoader Iteration (Batch Shapes)


In [None]:
# Cell 8: wrap the training dataset in a DataLoader, assert the shapes of the
# first batch, and iterate a few batches as a smoke test.
from itertools import islice

from torch.utils.data import DataLoader
from tweet_classifier.config import DEFAULT_BATCH_SIZE

print("=" * 60)
print("CELL 8: Test DataLoader Iteration (Batch Shapes)")
print("=" * 60)

# Create DataLoader
train_loader = DataLoader(
    train_dataset,
    batch_size=DEFAULT_BATCH_SIZE,
    shuffle=True,
    num_workers=0  # Set to 0 for notebook compatibility
)

print("\nDataLoader created:")
print(f"  Batch size: {DEFAULT_BATCH_SIZE}")
print(f"  Total batches: {len(train_loader)}")
print(f"  Total samples: {len(train_dataset)}")

# Get first batch
print("\n--- First Batch Shapes ---")
batch = next(iter(train_loader))
for key, tensor in batch.items():
    print(f"  {key}: {tensor.shape}")

# Verify batch shapes
print("\n--- Batch Shape Verification ---")
# The loader keeps a short final batch (drop_last is not set), so when the
# dataset is smaller than one batch the first batch holds every sample.
B = min(DEFAULT_BATCH_SIZE, len(train_dataset))

assert batch['input_ids'].shape == (B, MAX_TEXT_LENGTH)
print(f"✓ input_ids: [{B}, {MAX_TEXT_LENGTH}]")

assert batch['attention_mask'].shape == (B, MAX_TEXT_LENGTH)
print(f"✓ attention_mask: [{B}, {MAX_TEXT_LENGTH}]")

assert batch['numerical'].shape == (B, len(NUMERICAL_FEATURES))
print(f"✓ numerical: [{B}, {len(NUMERICAL_FEATURES)}]")

assert batch['author_idx'].shape == (B,)
print(f"✓ author_idx: [{B}]")

assert batch['category_idx'].shape == (B,)
print(f"✓ category_idx: [{B}]")

assert batch['labels'].shape == (B,)
print(f"✓ labels: [{B}]")

# Iterate through a few batches to verify no errors.
# islice stops after exactly num_test_batches batches; the previous
# enumerate/break loop loaded one extra batch before exiting.
print("\n--- Batch Iteration Test ---")
num_test_batches = min(5, len(train_loader))
batches_seen = sum(1 for _ in islice(train_loader, num_test_batches))
assert batches_seen == num_test_batches, (
    f"Expected {num_test_batches} batches, iterated {batches_seen}"
)
print(f"✓ Successfully iterated through {num_test_batches} batches without errors")

print("\n" + "=" * 60)
print("PHASE 2 FEATURE ENGINEERING VALIDATION COMPLETE")
print("=" * 60)
print("\nAll checks passed! The feature engineering pipeline is working correctly.")
print("\nNext steps:")
print("  1. Implement FinBERTMultiModal model (Phase 3)")
print("  2. Create training script (Phase 4)")
print("  3. Train and evaluate (Phase 5)")
