In [3]:
import numpy as np
import pandas as pd
import warnings
import joblib
import os
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import torch

# Silence all library warnings and fix NumPy's global RNG seed for the run.
warnings.filterwarnings("ignore")
np.random.seed(42)

# =============================================================================
# GPU CONFIGURATION
# =============================================================================
# Probe for a CUDA device through PyTorch and record the result in USE_GPU,
# which later sections read to pick a compute device.
print("="*80)
print("GPU CONFIGURATION")
print("="*80)
if torch.cuda.is_available():
    # Query the device properties once instead of once per printed field.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
    # `major.minor` on the device properties is the compute capability, not
    # the CUDA version; the CUDA version PyTorch was built against lives in
    # `torch.version.cuda`. Report each under its correct label.
    print(f"✓ CUDA version: {torch.version.cuda}")
    print(f"✓ Compute capability: {gpu_props.major}.{gpu_props.minor}")
    print(f"✓ GPU memory: {gpu_props.total_memory / 1e9:.2f} GB")
    USE_GPU = True
else:
    print("⚠ No GPU detected, using CPU")
    USE_GPU = False

# =============================================================================
# 1. LOAD DATA
# =============================================================================
print(f"\n{'=' * 80}")
print("LOADING DATA")
print("=" * 80)

# Column names shared by every later section.
TARGET = "exam_score"
ID_COL = "id"

# Three CSVs are read: the train split, the test split and an additional
# "original" dataset that is processed alongside them.
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
original_df = pd.read_csv("../data/Exam_Score_Prediction.csv")

print(
    "Train: {}, Test: {}, Original: {}".format(
        train_df.shape, test_df.shape, original_df.shape
    )
)

# =============================================================================
# 2. ADVERSARIAL VALIDATION
# =============================================================================
print(f"\n{'=' * 80}")
print("ADVERSARIAL VALIDATION - Detecting Distribution Shift")
print("=" * 80)

def adversarial_validation(train, test, features):
    """Measure how well a classifier can tell train rows from test rows.

    Rows taken from ``train`` are labelled 0 and rows from ``test`` are
    labelled 1. A LightGBM classifier is evaluated with 5-fold shuffled
    cross-validation on ``features`` and the out-of-fold ROC AUC is printed.
    When the AUC exceeds 0.6, a classifier is refit on all rows and its ten
    most important features are printed.

    Args:
        train: DataFrame containing the ``features`` columns.
        test: DataFrame containing the ``features`` columns.
        features: Column names to compare between the two frames.

    Returns:
        The out-of-fold ROC AUC.
    """
    def _new_classifier():
        # Same configuration for the CV models and the diagnostic refit.
        return lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)

    # Stack both frames with a 0/1 origin label.
    stacked = pd.concat(
        [train[features].assign(is_test=0), test[features].assign(is_test=1)],
        axis=0,
        ignore_index=True,
    )

    # Replace object-typed columns with their integer category codes.
    for name in stacked.select_dtypes(include=['object']).columns:
        stacked[name] = stacked[name].astype('category').cat.codes

    labels = stacked['is_test']
    design = stacked.drop(columns='is_test')

    oof_scores = np.zeros(len(design))
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    for fit_idx, holdout_idx in splitter.split(design):
        clf = _new_classifier()
        clf.fit(design.iloc[fit_idx], labels.iloc[fit_idx])
        oof_scores[holdout_idx] = clf.predict_proba(design.iloc[holdout_idx])[:, 1]

    auc = roc_auc_score(labels, oof_scores)
    print(f"Adversarial AUC: {auc:.4f}")

    if not auc > 0.6:
        print("✓ Distribution shift is minimal")
        return auc

    print("⚠️  Significant distribution shift detected!")
    full_clf = _new_classifier()
    full_clf.fit(design, labels)
    ranking = pd.DataFrame({
        'feature': design.columns,
        'importance': full_clf.feature_importances_,
    })
    ranking = ranking.sort_values('importance', ascending=False)
    print("\nTop features causing shift:")
    print(ranking.head(10))
    return auc

# Every training column except the id and the target is compared.
base_features = [
    column for column in train_df.columns
    if column != ID_COL and column != TARGET
]
adv_auc = adversarial_validation(train_df, test_df, base_features)

# =============================================================================
# 3. OPTIMIZED FEATURE ENGINEERING (EDA-DRIVEN)
# =============================================================================
print(f"\n{'=' * 80}")
print("OPTIMIZED FEATURE ENGINEERING (EDA-DRIVEN)")
print("=" * 80)

def create_optimized_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    The input frame is not modified. It must provide the columns
    ``study_hours``, ``class_attendance``, ``sleep_hours``, ``age``,
    ``sleep_quality``, ``study_method``, ``facility_rating`` and
    ``exam_difficulty``. Progress messages are printed for each tier.

    Args:
        df: DataFrame holding the raw columns listed above.

    Returns:
        A new DataFrame with every original column plus the engineered ones.
    """
    df = df.copy()
    eps = 1e-5  # keeps the ratio features finite when a denominator is 0

    # =========================================================================
    # TIER 1: CRITICAL FEATURES (Core predictors from EDA)
    # =========================================================================
    print("  Creating Tier 1: Critical features...")

    # Polynomial features (degree 2 only)
    df['study_sq'] = df['study_hours'] ** 2
    df['attendance_sq'] = df['class_attendance'] ** 2

    # Key interactions
    df['study_x_attendance'] = df['study_hours'] * df['class_attendance']
    df['study_x_sleep'] = df['study_hours'] * df['sleep_hours']

    # Efficiency metrics
    df['efficiency'] = (df['study_hours'] * df['class_attendance']) / (df['sleep_hours'] + 1)
    df['efficiency_sq'] = df['efficiency'] ** 2

    # Weighted effort (EDA-informed weights)
    df['weighted_effort'] = (0.06 * df['class_attendance'] +
                             2.0 * df['study_hours'] +
                             1.2 * df['sleep_hours'])

    # =========================================================================
    # TIER 2: HIGH-IMPACT CATEGORICAL FEATURES (η² > 2% from EDA)
    # =========================================================================
    print("  Creating Tier 2: High-impact categorical features...")

    # Ordinal encoding for features with clear monotonic relationships.
    # Labels missing from a map (including NaN) fall back to the middle rank.
    sleep_quality_map = {'poor': 0, 'average': 1, 'good': 2}
    df['sleep_quality_ord'] = df['sleep_quality'].map(sleep_quality_map).fillna(1)

    study_method_map = {'self-study': 0, 'online videos': 1, 'group study': 2,
                        'mixed': 3, 'coaching': 4}
    df['study_method_ord'] = df['study_method'].map(study_method_map).fillna(2)

    facility_map = {'low': 0, 'medium': 1, 'high': 2}
    df['facility_ord'] = df['facility_rating'].map(facility_map).fillna(1)

    difficulty_map = {'easy': 0, 'moderate': 1, 'hard': 2}
    df['difficulty_ord'] = df['exam_difficulty'].map(difficulty_map).fillna(1)

    # =========================================================================
    # TIER 3: CATEGORICAL × NUMERIC INTERACTIONS
    # =========================================================================
    print("  Creating Tier 3: Categorical × numeric interactions...")

    df['sleep_quality_x_study'] = df['sleep_quality_ord'] * df['study_hours']
    df['facility_x_attendance'] = df['facility_ord'] * df['class_attendance']
    df['study_method_x_hours'] = df['study_method_ord'] * df['study_hours']
    df['difficulty_x_efficiency'] = df['difficulty_ord'] * df['efficiency']

    # =========================================================================
    # TIER 4: DOMAIN-SPECIFIC FLAGS
    # =========================================================================
    print("  Creating Tier 4: Domain-specific flags...")

    df['ideal_sleep'] = ((df['sleep_hours'] >= 7) & (df['sleep_hours'] <= 9)).astype(int)
    df['sleep_deprived'] = (df['sleep_hours'] <= 5.5).astype(int)
    df['high_performer'] = ((df['study_hours'] >= 6) & (df['class_attendance'] >= 85)).astype(int)
    df['at_risk'] = ((df['study_hours'] <= 3) | (df['class_attendance'] <= 60)).astype(int)

    # =========================================================================
    # TIER 5: BINNING FEATURES
    # =========================================================================
    print("  Creating Tier 5: Intelligent binning...")

    # pd.cut builds right-closed intervals, so a value equal to the first
    # edge falls outside every bin and becomes NaN unless include_lowest is
    # set. study_bin already starts below zero (-0.1); the other two start
    # exactly on a round value (40 and 4) and need the flag so those rows
    # land in bin 0. Values outside the outer edges still become NaN.
    df['study_bin'] = pd.cut(df['study_hours'], bins=[-0.1, 2, 4, 6, 8], labels=False).astype(float)
    df['attendance_bin'] = pd.cut(
        df['class_attendance'], bins=[40, 60, 75, 85, 95, 100],
        labels=False, include_lowest=True,
    ).astype(float)
    df['sleep_bin'] = pd.cut(
        df['sleep_hours'], bins=[4, 5.5, 7, 8.5, 10],
        labels=False, include_lowest=True,
    ).astype(float)

    # =========================================================================
    # TIER 6: SELECTIVE TRANSFORMATIONS
    # =========================================================================
    print("  Creating Tier 6: Selective transformations...")

    df['log_study_hours'] = np.log1p(df['study_hours'])
    df['sqrt_attendance'] = np.sqrt(df['class_attendance'])
    # NOTE(review): the percentile rank is computed within the frame passed
    # in, so the same age can map to different values in different frames.
    df['age_rank'] = df['age'].rank(pct=True)

    # =========================================================================
    # TIER 7: DISTANCE FROM OPTIMAL
    # =========================================================================
    print("  Creating Tier 7: Distance from optimal...")

    df['sleep_from_optimal'] = np.abs(df['sleep_hours'] - 8)
    df['study_from_optimal'] = np.abs(df['study_hours'] - 6)
    df['attendance_from_optimal'] = np.abs(df['class_attendance'] - 95)

    # =========================================================================
    # TIER 8: RATIOS
    # =========================================================================
    print("  Creating Tier 8: Meaningful ratios...")

    df['study_per_sleep'] = df['study_hours'] / (df['sleep_hours'] + eps)
    df['geometric_mean'] = (
        (df['study_hours'] + 1) *
        (df['class_attendance'] + 1) *
        (df['sleep_hours'] + 1)
    ) ** (1/3)

    # =========================================================================
    # TIER 9: FORMULA-BASED FEATURES
    # =========================================================================
    print("  Creating Tier 9: Formula-based features...")

    # Core formula from domain analysis: a linear score over the numeric
    # inputs plus fixed bonuses/penalties per categorical level.
    df['formula'] = (
        6 * df['study_hours'] +
        0.35 * df['class_attendance'] +
        1.5 * df['sleep_hours'] +
        5 * (df['sleep_quality'] == 'good') +
        -5 * (df['sleep_quality'] == 'poor') +
        10 * (df['study_method'] == 'coaching') +
        5 * (df['study_method'] == 'mixed') +
        2 * (df['study_method'] == 'group study') +
        1 * (df['study_method'] == 'online videos') +
        4 * (df['facility_rating'] == 'high') +
        -4 * (df['facility_rating'] == 'low')
    )

    # Formula derivatives
    df['formula_sq'] = df['formula'] ** 2
    df['formula_sqrt'] = np.sqrt(np.abs(df['formula']))
    # NOTE(review): the shift uses this frame's own minimum, so the same
    # formula value yields a different formula_log in different frames.
    df['formula_log'] = np.log1p(df['formula'] - df['formula'].min() + 1)

    # Formula interactions with key features
    df['formula_x_efficiency'] = df['formula'] * df['efficiency']
    df['formula_x_study'] = df['formula'] * df['study_hours']
    df['formula_residual'] = df['formula'] - df['weighted_effort']

    # Formula-based ratios
    df['formula_per_study'] = df['formula'] / (df['study_hours'] + eps)
    df['formula_per_attendance'] = df['formula'] / (df['class_attendance'] + eps)

    print("✓ Feature engineering complete!")
    return df

# =============================================================================
# Apply feature engineering
# =============================================================================
print("\nApplying optimized feature engineering...")
train_fe = create_optimized_features(train_df)
test_fe = create_optimized_features(test_df)
original_fe = create_optimized_features(original_df)

# Targets as arrays, clipped to the [0, 100] range.
y = train_df[TARGET].clip(lower=0, upper=100).values
y_orig = original_df[TARGET].clip(lower=0, upper=100).values

# Categorical features for target encoding (only high-impact from EDA)
cat_features = [
    'sleep_quality',      # η² = 5.6%
    'study_method',       # η² = 5.0%
    'facility_rating',    # η² = 3.6%
]

# Low-impact categoricals to remove
low_impact_cats = [
    'gender',           # η² < 0.2%
    'course',           # η² < 0.2%
    'internet_access',  # η² < 0.2%
    'exam_difficulty',  # We have difficulty_ord instead
]

print("\n" + "=" * 60)
print("FEATURE ENGINEERING SUMMARY")
print("=" * 60)
print("High-impact categoricals (for target encoding):", cat_features)
print("Low-impact categoricals (will be removed):", low_impact_cats)

# =============================================================================
# 4. TARGET ENCODING
# =============================================================================
print(f"\n{'=' * 80}")
print("TARGET ENCODING ON CATEGORICAL FEATURES")
print("=" * 80)

def _smoothed_category_means(categories, targets, global_mean, alpha):
    """Return a Series mapping each category to its smoothed target mean.

    The smoothed value is ``(mean * count + global_mean * alpha) /
    (count + alpha)``, i.e. the category mean shrunk toward ``global_mean``.
    """
    stats = (
        pd.DataFrame({
            'category': np.asarray(categories),
            'target': np.asarray(targets),
        })
        .groupby('category')['target']
        .agg(['mean', 'count'])
    )
    return (
        (stats['mean'] * stats['count'] + global_mean * alpha) /
        (stats['count'] + alpha)
    )


def target_encode_cv(X_train, X_test, y_train, cat_cols, n_splits=5, alpha=10):
    """Target-encode ``cat_cols`` with out-of-fold values for the train set.

    For each column, every training row receives an encoding computed from
    the other folds only, while test rows receive an encoding computed from
    the full training set. Categories that are not found in the fitted
    mapping are filled with the global target mean. Each encoded column is
    added as ``<col>_target`` and the source column is dropped.

    Args:
        X_train: Training DataFrame containing ``cat_cols``.
        X_test: Test DataFrame containing ``cat_cols``.
        y_train: Training target (Series or array), aligned row-by-row with
            ``X_train`` by position.
        cat_cols: Names of the categorical columns to encode.
        n_splits: Number of KFold splits for the out-of-fold encoding.
        alpha: Smoothing strength (higher = more shrinkage to global mean).

    Returns:
        Tuple ``(X_train_enc, X_test_enc)`` of new DataFrames; the inputs
        are left unmodified.
    """
    X_train_enc = X_train.copy()
    X_test_enc = X_test.copy()

    global_mean = y_train.mean()
    # KFold yields positional indices. Index a plain array (not a Series, not
    # `.loc`) so they are never interpreted as index labels; this keeps the
    # function correct when X_train / y_train carry a non-default index.
    y_values = np.asarray(y_train)

    # The split depends only on the row count and the fixed seed, so compute
    # it once and reuse it for every column.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(kf.split(X_train))

    for col in cat_cols:
        print(f"  Encoding {col}...")
        categories = X_train[col]

        # For train: use CV to prevent leakage. Every row belongs to exactly
        # one validation fold, so the whole buffer is overwritten.
        oof_encoding = np.zeros(len(X_train), dtype=float)
        for train_idx, val_idx in folds:
            fold_means = _smoothed_category_means(
                categories.iloc[train_idx], y_values[train_idx], global_mean, alpha
            )
            oof_encoding[val_idx] = (
                categories.iloc[val_idx]
                .map(fold_means)
                .fillna(global_mean)
                .to_numpy()
            )
        # Array assignment is positional, independent of X_train's index.
        X_train_enc[f'{col}_target'] = oof_encoding

        # For test: use all training data to calculate means
        full_means = _smoothed_category_means(categories, y_values, global_mean, alpha)
        X_test_enc[f'{col}_target'] = (
            X_test[col]
            .map(full_means)
            .fillna(global_mean)
        )

        # Remove original column (now we have the encoded version)
        X_train_enc = X_train_enc.drop(columns=[col])
        X_test_enc = X_test_enc.drop(columns=[col])

    return X_train_enc, X_test_enc


def target_encode_simple(X, y, cat_cols, alpha=10):
    """Target-encode ``cat_cols`` using statistics from the same rows.

    No cross-validation is applied: each category's smoothed target mean is
    computed on all of ``X``/``y`` and mapped straight back onto ``X``.
    Values without a fitted mean receive the global target mean. Each encoded
    column is added as ``<col>_target`` and the source column is dropped.

    Args:
        X: DataFrame containing ``cat_cols``.
        y: Target values aligned row-by-row with ``X``.
        cat_cols: Names of the categorical columns to encode.
        alpha: Smoothing strength toward the global mean.

    Returns:
        A new encoded DataFrame; ``X`` itself is left unmodified.
    """
    encoded = X.copy()
    prior = y.mean()

    for col in cat_cols:
        print(f"  Encoding {col} (simple)...")

        pairs = pd.DataFrame({'category': X[col].values, 'target': y})
        stats = pairs.groupby('category')['target'].agg(['mean', 'count'])

        # Shrink each category mean toward the prior, weighted by its count.
        numerator = stats['mean'] * stats['count'] + prior * alpha
        denominator = stats['count'] + alpha
        smoothed = numerator / denominator

        encoded[f'{col}_target'] = X[col].map(smoothed).fillna(prior)
        encoded = encoded.drop(columns=[col])

    return encoded


# Apply target encoding
print("\nTarget encoding TRAIN and TEST (with CV)...")
X_train_init, X_test_init = target_encode_cv(
    train_fe, test_fe, pd.Series(y, name='exam_score'), cat_features,
    n_splits=5, alpha=10,
)

print("\nTarget encoding ORIGINAL (simple, no CV)...")
X_orig_init = target_encode_simple(
    original_fe, pd.Series(y_orig, name='exam_score'), cat_features,
    alpha=10,
)

# Remove low-impact categoricals + ID + TARGET
cols_to_remove = [ID_COL, TARGET, *low_impact_cats]
feature_cols = [
    column for column in X_train_init.columns
    if column not in cols_to_remove
]

# Restrict all three frames to the same columns, in the same order.
X_train_init = X_train_init.loc[:, feature_cols]
X_test_init = X_test_init.loc[:, feature_cols]
X_orig_init = X_orig_init.loc[:, feature_cols]

# Convert to float32
X_train_init = X_train_init.astype(np.float32)
X_test_init = X_test_init.astype(np.float32)
X_orig_init = X_orig_init.astype(np.float32)

print("\n✓ Target encoding complete. Shape: {}".format(X_train_init.shape))
print("✓ Features after cleanup: {}".format(len(feature_cols)))

# =============================================================================
# 5. FEATURE IMPORTANCE ANALYSIS & SELECTION
# =============================================================================
# Rank features by mean LightGBM gain across a few CV folds, then keep the
# leading features by cumulative importance.
print("\n" + "="*80)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*80)

print("Training quick LightGBM for feature importance...")

X_temp = X_train_init.copy()
y_temp = y.copy()

# Single source of truth for the fold count: the splitter, the progress
# message and the averaging divisor all read it, so they cannot drift apart.
N_IMPORTANCE_FOLDS = 3
kf_temp = KFold(n_splits=N_IMPORTANCE_FOLDS, shuffle=True, random_state=42)
feature_importance = np.zeros(X_temp.shape[1])

# Loop-invariant, so built once instead of once per fold.
# NOTE(review): USE_GPU reflects PyTorch's CUDA detection; whether the
# installed LightGBM build accepts device='gpu' is a separate question -
# confirm before running on a GPU machine.
lgb_params_quick = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'device': 'gpu' if USE_GPU else 'cpu',
    'seed': 42
}

for fold, (train_idx, val_idx) in enumerate(kf_temp.split(X_temp)):
    X_tr, X_val = X_temp.iloc[train_idx], X_temp.iloc[val_idx]
    y_tr, y_val = y_temp[train_idx], y_temp[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params_quick, train_data,
        num_boost_round=100,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(20), lgb.log_evaluation(0)]
    )

    feature_importance += model.feature_importance(importance_type='gain')
    print(f"  Fold {fold+1}/{N_IMPORTANCE_FOLDS} done")

# Average importance over the folds
feature_importance /= N_IMPORTANCE_FOLDS

# Create DataFrame with importance
importance_df = pd.DataFrame({
    'feature': X_temp.columns,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

# Calculate cumulative importance (as a fraction of the total)
importance_df['cumulative_importance'] = importance_df['importance'].cumsum() / importance_df['importance'].sum()

print(f"\nTotal features: {len(importance_df)}")
print(f"\nTop 20 most important features:")
print(importance_df.head(20))

# Feature selection strategy
print("\n" + "="*60)
print("FEATURE SELECTION STRATEGY")
print("="*60)

USE_FEATURE_SELECTION = True
MIN_FEATURES = 8

if USE_FEATURE_SELECTION:
    # Keep the leading features whose cumulative importance stays within
    # the threshold (99.5%). The feature that crosses the threshold is not
    # included.
    threshold = 0.995
    selected_features = importance_df[importance_df['cumulative_importance'] <= threshold]['feature'].tolist()

    # Ensure at least MIN_FEATURES features
    if len(selected_features) < MIN_FEATURES:
        selected_features = importance_df.head(MIN_FEATURES)['feature'].tolist()
else:
    selected_features = feature_cols
    print("Keeping ALL features (no selection)")

print(f"\n{'='*60}")
print(f"FEATURE SELECTION RESULTS")
print(f"{'='*60}")
print(f"Original features: {len(feature_cols)}")
print(f"Selected features: {len(selected_features)}")
print(f"Reduction: {100*(1 - len(selected_features)/len(feature_cols)):.1f}%")

# Update datasets so all three frames share the selected columns
X_train_init = X_train_init[selected_features]
X_test_init = X_test_init[selected_features]
X_orig_init = X_orig_init[selected_features]
feature_cols = selected_features

print(f"\n✓ Feature selection complete!")
print(f"✓ Final dataset shape: {X_train_init.shape}")

# =============================================================================
# 6. SAVE PREPROCESSED DATA
# =============================================================================
print(f"\n{'=' * 80}")
print("SAVING PREPROCESSED DATA")
print("=" * 80)

# Create directories
for out_dir in ('../data/preprocessed', '../results'):
    os.makedirs(out_dir, exist_ok=True)

# Save preprocessed datasets: (object, destination, confirmation message).
# All dumps run before any confirmation is printed.
dataset_artifacts = [
    (X_train_init, '../data/preprocessed/X_train.pkl', "✓ X_train.pkl saved"),
    (X_test_init, '../data/preprocessed/X_test.pkl', "✓ X_test.pkl saved"),
    (X_orig_init, '../data/preprocessed/X_orig.pkl', "✓ X_orig.pkl saved"),
    (y, '../data/preprocessed/y_train.pkl', "✓ y_train.pkl saved"),
    (y_orig, '../data/preprocessed/y_orig.pkl', "✓ y_orig.pkl saved"),
]
for payload, destination, _ in dataset_artifacts:
    joblib.dump(payload, destination)
for _, _, confirmation in dataset_artifacts:
    print(confirmation)

# Save feature names
joblib.dump(selected_features, '../data/preprocessed/feature_names.pkl')
print("✓ feature_names.pkl saved")

# Save feature importance
importance_df.to_csv('../results/feature_importance.csv', index=False)
print("✓ feature_importance.csv saved")

# Save metadata: dataset sizes plus the configuration recorded by this run
metadata = {
    'n_train': len(X_train_init),
    'n_test': len(X_test_init),
    'n_orig': len(X_orig_init),
    'n_features': len(selected_features),
    'cat_features': cat_features,
    'low_impact_cats': low_impact_cats,
    'adv_auc': adv_auc,
    'use_gpu': USE_GPU,
}
joblib.dump(metadata, '../data/preprocessed/metadata.pkl')
print("✓ metadata.pkl saved")

print(f"\n{'=' * 80}")
print("PREPROCESSING COMPLETE!")
print("=" * 80)
print("✓ Train shape: {}".format(X_train_init.shape))
print("✓ Test shape: {}".format(X_test_init.shape))
print("✓ Original shape: {}".format(X_orig_init.shape))
print("✓ All files saved in '../data/preprocessed/'")
print("\nYou can now proceed to hyperparameter optimization notebooks (02-09).")

GPU CONFIGURATION
⚠ No GPU detected, using CPU

LOADING DATA
Train: (630000, 13), Test: (270000, 12), Original: (20000, 13)

ADVERSARIAL VALIDATION - Detecting Distribution Shift
Adversarial AUC: 0.4996
✓ Distribution shift is minimal

OPTIMIZED FEATURE ENGINEERING (EDA-DRIVEN)

Applying optimized feature engineering...
  Creating Tier 1: Critical features...
  Creating Tier 2: High-impact categorical features...
  Creating Tier 3: Categorical × numeric interactions...
  Creating Tier 4: Domain-specific flags...
  Creating Tier 5: Intelligent binning...
  Creating Tier 6: Selective transformations...
  Creating Tier 7: Distance from optimal...
  Creating Tier 8: Meaningful ratios...
  Creating Tier 9: Formula-based features...
✓ Feature engineering complete!
  Creating Tier 1: Critical features...
  Creating Tier 2: High-impact categorical features...
  Creating Tier 3: Categorical × numeric interactions...
  Creating Tier 4: Domain-specific flags...
  Creating Tier 5: Intelligent binn