# 4. Feature Engineering & Model Preparation

This notebook handles feature engineering and prepares the data for model training.

## Objectives
1. Load processed data and NLP components
2. Engineer relevant features
3. Handle class imbalance
4. Split data for training
5. Prepare the feature pipeline: save the prepared splits and feature names
6. Validate the feature pipeline

In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from collections import Counter
from tqdm.notebook import tqdm

# Set random seed
# This seeds NumPy's global random state. The SMOTE and train_test_split
# calls in this notebook pass random_state=42 explicitly.
np.random.seed(42)

## 1. Load Data

In [None]:
def load_data_and_components(data_dir='../data/processed'):
    """Load the processed dataset and the NLP components stored next to it.

    Args:
        data_dir (str or Path): Directory containing ``processed_data.csv``,
            ``symptom_vocab.json`` and ``symptom_embeddings.npy``. Defaults
            to the location this notebook has always read from.

    Returns:
        tuple: ``(df, symptom_vocab, symptom_embeddings)`` - the dataset as a
        DataFrame, the decoded vocabulary JSON, and the embeddings array.
    """
    data_dir = Path(data_dir)

    # Load processed dataset
    df = pd.read_csv(data_dir / 'processed_data.csv')

    # Load symptom vocabulary. The encoding is pinned to UTF-8 so the result
    # does not depend on the platform's default text encoding.
    with open(data_dir / 'symptom_vocab.json', 'r', encoding='utf-8') as f:
        symptom_vocab = json.load(f)

    # Load symptom embeddings
    symptom_embeddings = np.load(data_dir / 'symptom_embeddings.npy')

    return df, symptom_vocab, symptom_embeddings

# Load components
# These three module-level names are passed to engineer_features later on.
df, symptom_vocab, symptom_embeddings = load_data_and_components()

# Quick summary of what was loaded.
print(f"Loaded dataset with {len(df)} samples")
print(f"Number of symptoms: {len(symptom_vocab)}")
# shape[1] is reported as the embedding dimension, so the array has to be
# at least 2-D for this line to work.
print(f"Embedding dimension: {symptom_embeddings.shape[1]}")

## 2. Feature Engineering

In [None]:
def engineer_features(df, symptom_vocab, symptom_embeddings):
    """Build the model feature matrix from the symptom columns.

    Every column of ``df`` other than ``'disease'`` is treated as a symptom
    feature. The returned matrix contains, in this order:

    1. the original symptom columns,
    2. ``symptom_count`` (row sum of the symptom columns) and
       ``weighted_symptoms`` (row sum weighted by each symptom's normalised
       vocabulary frequency),
    3. ``emb_0 .. emb_{d-1}``: the mean embedding of the symptoms that are
       non-zero in the row (all zeros when no symptom is active).

    Args:
        df (pd.DataFrame): Processed dataset with a ``'disease'`` column.
        symptom_vocab (dict): Maps each symptom column name to a dict that
            has a ``'frequency'`` entry.
        symptom_embeddings (np.array): 2-D array with one row per symptom
            column, in the same order as the symptom columns of ``df``.

    Returns:
        tuple: (feature matrix X, target vector y, feature names)

    Raises:
        ValueError: If the number of embedding rows differs from the number
            of symptom columns.
        KeyError: If a symptom column is missing from ``symptom_vocab``.
    """
    # Split features and target. The symptom columns are named explicitly
    # once, so nothing below depends on the order derived columns are added.
    y = df['disease']
    symptoms = df.drop('disease', axis=1)
    symptom_cols = symptoms.columns.tolist()

    # The embedding rows are selected with a per-row boolean mask over the
    # symptom columns, so both must have the same length.
    if symptom_embeddings.shape[0] != len(symptom_cols):
        raise ValueError(
            f"symptom_embeddings has {symptom_embeddings.shape[0]} rows but "
            f"the dataset has {len(symptom_cols)} symptom columns"
        )

    # 1. Symptom count per sample
    # 2. Weighted symptoms by frequency (weights normalised to sum to 1)
    weights = np.array([symptom_vocab[s]['frequency'] for s in symptom_cols])
    weights = weights / weights.sum()
    derived = pd.DataFrame(
        {
            'symptom_count': symptoms.sum(axis=1).to_numpy(),
            'weighted_symptoms': (symptoms * weights).sum(axis=1).to_numpy(),
        },
        index=symptoms.index,
    )

    # 3. Symptom embedding aggregation: mean embedding of the active symptoms
    active = symptoms.to_numpy().astype(bool)
    agg_embeddings = np.zeros((len(symptoms), symptom_embeddings.shape[1]))
    for i, mask in enumerate(active):
        if mask.any():
            agg_embeddings[i] = symptom_embeddings[mask].mean(axis=0)

    emb_cols = [f'emb_{i}' for i in range(symptom_embeddings.shape[1])]
    emb_features = pd.DataFrame(
        agg_embeddings, columns=emb_cols, index=symptoms.index
    )

    # Join all groups in one step. Inserting the embedding columns one at a
    # time fragments the frame and makes pandas emit a PerformanceWarning.
    X = pd.concat([symptoms, derived, emb_features], axis=1)
    feature_names = X.columns.tolist()

    return X.values, y.values, feature_names

# Engineer features
# X, y and feature_names become module-level names used by the cells below.
X, y, feature_names = engineer_features(df, symptom_vocab, symptom_embeddings)

# Summarise the feature groups that make up the matrix.
print(f"Feature matrix shape: {X.shape}")
print(f"Number of feature types:")
# NOTE(review): this reports the vocabulary size, not the number of symptom
# columns actually present in X - the two agree only if the vocabulary and
# the dataset columns match; confirm.
print(f"- Binary symptoms: {len(symptom_vocab)}")
# The two derived features are symptom_count and weighted_symptoms.
print(f"- Derived features: 2")
print(f"- Embedding features: {symptom_embeddings.shape[1]}")

## 3. Handle Class Imbalance

In [None]:
def analyze_class_distribution(y):
    """Print the number of samples per class, most frequent class first.

    Each line shows the label, its count, and its share of ``len(y)`` as a
    percentage with one decimal place. Nothing is returned.
    """
    total = len(y)
    print("Class Distribution:")
    for cls, n_samples in Counter(y).most_common():
        share = n_samples / total * 100
        print(f"- {cls}: {n_samples} ({share:.1f}%)")

# Show the class counts of the original data first.
print("Before SMOTE:")
analyze_class_distribution(y)

# Apply SMOTE to balance classes
# NOTE(review): resampling is done on the full dataset, and the resampled
# arrays are what get split into train/validation/test later in this
# notebook. The validation and test sets will therefore contain synthetic
# samples, and synthetic training points may be interpolated from rows that
# end up in those sets. Consider splitting first and resampling only the
# training portion - confirm whether the current order is intentional.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Show the class counts again on the resampled labels.
print("\nAfter SMOTE:")
analyze_class_distribution(y_resampled)

## 4. Split Data

In [None]:
def prepare_train_val_test(X, y, test_size=0.2, val_size=0.2, random_state=42):
    """Split data into stratified train, validation, and test sets.

    The test set is carved off first. The validation set is then taken from
    the remainder, with its proportion rescaled so that ``val_size`` is a
    fraction of the full dataset rather than of the remainder.

    Args:
        X (np.array): Feature matrix
        y (np.array): Target vector
        test_size (float): Proportion of all samples for the test set
        val_size (float): Proportion of all samples for the validation set
        random_state (int): Seed passed to both splits (default 42)

    Returns:
        tuple: ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``

    Raises:
        ValueError: If either proportion is outside (0, 1) or the two sum to
            1 or more, which would leave no training data.
    """
    # Validate before the rescaling below, which divides by (1 - test_size).
    if not (0 < test_size < 1 and 0 < val_size < 1
            and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must each be in (0, 1) and sum to less "
            f"than 1; got test_size={test_size}, val_size={val_size}"
        )

    # First split: separate test set
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Second split: separate train and validation. val_size is expressed
    # relative to the full dataset, so rescale it to the remaining fraction.
    val_size_adjusted = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=val_size_adjusted,
        random_state=random_state,
        stratify=y_train_val
    )

    return (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test)
    )

# Split data
# The resampled (post-SMOTE) arrays are split with the function's default
# proportions; the six resulting arrays become module-level names.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = \
    prepare_train_val_test(X_resampled, y_resampled)

# Report how many samples landed in each split.
print("Data split sizes:")
print(f"Train: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")

## 5. Save Prepared Data

In [None]:
def save_prepared_data():
    """Write the train/validation/test arrays and the feature names to disk.

    Reads the module-level ``X_train``, ``y_train``, ``X_val``, ``y_val``,
    ``X_test``, ``y_test`` and ``feature_names``. Takes no arguments and
    returns nothing.
    """
    # (destination, array) pairs, written in this order.
    # NOTE(review): if the label arrays hold Python objects (e.g. strings
    # with object dtype), np.save pickles them and np.load will need
    # allow_pickle=True - confirm the dtype of the y arrays.
    outputs = (
        ('../data/processed/X_train.npy', X_train),
        ('../data/processed/y_train.npy', y_train),
        ('../data/processed/X_val.npy', X_val),
        ('../data/processed/y_val.npy', y_val),
        ('../data/processed/X_test.npy', X_test),
        ('../data/processed/y_test.npy', y_test),
    )
    for destination, array in outputs:
        np.save(destination, array)

    # The feature names are stored alongside the arrays as indented JSON.
    with open('../data/processed/feature_names.json', 'w') as names_file:
        json.dump(feature_names, names_file, indent=2)

    summary = (
        "Saved prepared data:",
        "- Train, validation, and test splits",
        "- Feature names",
    )
    for line in summary:
        print(line)

save_prepared_data()

## 6. Validate Feature Pipeline

In [None]:
def validate_pipeline():
    """Print quick sanity checks for the prepared splits.

    Using the module-level split arrays, this reports the per-feature mean
    and standard deviation of ``X_train`` (first five features), the five
    most common labels in each split, and the correlation matrix of the
    first five ``X_train`` features. Nothing is returned.
    """
    # Per-feature statistics, truncated to the first five columns.
    print("Feature Statistics:")
    column_means = X_train.mean(axis=0)
    print(f"Mean values: {column_means[:5]}...")
    column_stds = X_train.std(axis=0)
    print(f"Std values: {column_stds[:5]}...")

    # Label counts per split, limited to the five most common labels.
    print("\nClass balance in splits:")
    splits = {'Train': y_train, 'Val': y_val, 'Test': y_test}
    for split_name, labels in splits.items():
        top_classes = Counter(labels).most_common(5)
        print(f"\n{split_name} set:")
        for cls, n_samples in top_classes:
            print(f"- {cls}: {n_samples}")

    # np.corrcoef treats rows as variables, hence the transpose.
    print("\nFeature correlations (sample):")
    first_features = X_train[:, :5]
    print(np.corrcoef(first_features.T))

validate_pipeline()