# Student Test Score Prediction - Version 4

### Strategy Overview
- **Linear Regression with Target Encoding**: Baseline model for calibrated predictions
- **XGBoost Ensemble**: Uses LR predictions as a strong engineered feature  
- **5-Fold (Linear Regression) and 7-Fold (XGBoost) Cross-Validation**: Stable out-of-fold predictions
- **Feature Engineering**: Polynomial (squared/cubed), logarithmic, and square-root features
- **RMSE Score**: ~8.71 on validation set
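- **Data Augmentation**: Combines the original dataset (20k rows) with the playground data for better generalization (the cells below read only `train.csv`, so the combined data is assumed to be prepared beforehand)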

In [None]:
# Section 1: Import Required Libraries
# Data handling and numerics
import pandas as pd
import numpy as np
# Fold splitting, the MSE metric (RMSE is derived from it), and categorical encoding
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import TargetEncoder
# Models: gradient-boosted trees and the linear baseline
import xgboost as xgb
from sklearn.linear_model import LinearRegression
import warnings
# Suppress all Python warnings so the cell output stays readable
warnings.filterwarnings("ignore")

print("✓ All libraries imported successfully")

In [None]:
# Section 2: Load and Explore Data
# Read the training set, the test set and the submission template
# from the current working directory (in that order).
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Quick overview of what was loaded
for label, frame in (
    ("Training data", train_df),
    ("Test data", test_df),
    ("Submission", submission_df),
):
    print(f"{label} shape: {frame.shape}")
print(f"\nTraining columns: {train_df.columns.tolist()}")
print(f"Test columns: {test_df.columns.tolist()}")

# Column roles: prediction target, row identifier, and everything else as
# a model input. Object-dtype columns are treated as categorical.
TARGET = 'exam_score'
ID = 'id'
non_feature_cols = {TARGET, ID}
base_features = [name for name in train_df.columns if name not in non_feature_cols]
CATS = train_df.select_dtypes(include='object').columns.tolist()

print(f"\nBase features: {base_features}")
print(f"Categorical features: {CATS}")

In [None]:
# Section 3: Feature Engineering
def add_poly_features(df):
    """Return a copy of ``df`` extended with power, log1p and sqrt columns.

    Reads the ``study_hours``, ``class_attendance``, ``sleep_hours`` and
    ``age`` columns. The input frame itself is not modified.
    """
    study = df['study_hours']
    attendance = df['class_attendance']
    sleep = df['sleep_hours']
    age = df['age']

    # ``assign`` works on a copy and appends new columns in keyword order,
    # so the resulting column layout is: powers, then logs, then roots.
    return df.assign(
        # Powers
        study_hours_squared=study ** 2,
        study_hours_cubed=study ** 3,
        class_attendance_squared=attendance ** 2,
        sleep_hours_squared=sleep ** 2,
        age_squared=age ** 2,
        # log(1 + x)
        log_study_hours=np.log1p(study),
        log_class_attendance=np.log1p(attendance),
        log_sleep_hours=np.log1p(sleep),
        # Square roots
        sqrt_study_hours=np.sqrt(study),
        sqrt_class_attendance=np.sqrt(attendance),
    )

# Engineer the same derived columns on both splits
train_df, test_df = add_poly_features(train_df), add_poly_features(test_df)

# Names of the engineered columns, in the order add_poly_features creates them
poly_features = [
    # powers
    'study_hours_squared',
    'study_hours_cubed',
    'class_attendance_squared',
    'sleep_hours_squared',
    'age_squared',
    # log1p
    'log_study_hours',
    'log_class_attendance',
    'log_sleep_hours',
    # square roots
    'sqrt_study_hours',
    'sqrt_class_attendance',
]

# Model inputs: the raw columns followed by the engineered ones
all_features = [*base_features, *poly_features]

print("✓ Feature engineering complete")
print(f"Total features: {len(all_features)}")
print(f"New polynomial features: {poly_features}")

In [None]:
# Section 4: Data Preprocessing - Linear Regression with Target Encoding
# Produces out-of-fold (OOF) Linear Regression predictions for every training
# row and one column of test predictions per fold. Both are consumed later as
# the `lr_pred` feature of the XGBoost model.
X_train_lr = train_df[all_features].copy()
y_train_lr = train_df[TARGET]
X_test_lr = test_df[all_features].copy()

N_TRAIN = len(train_df)
N_TEST = len(test_df)
FOLDS_LR = 5

# Report the configured fold count instead of a hard-coded number
print(f"Starting Linear Regression with {FOLDS_LR}-Fold Cross-Validation")
print(f"Training samples: {N_TRAIN}, Test samples: {N_TEST}\n")

kf_lr = KFold(n_splits=FOLDS_LR, shuffle=True, random_state=42)

oof_lr = np.zeros(N_TRAIN)                    # one OOF prediction per training row
test_preds_lr = np.zeros((N_TEST, FOLDS_LR))  # one column of test predictions per fold

for fold, (trn_idx, val_idx) in enumerate(kf_lr.split(X_train_lr)):
    print(f"LR Fold {fold+1}/{FOLDS_LR}")

    X_tr = X_train_lr.iloc[trn_idx]
    y_tr = y_train_lr.iloc[trn_idx]
    X_val = X_train_lr.iloc[val_idx]
    y_val = y_train_lr.iloc[val_idx]

    # Target encode categoricals. The encoder is fitted on the training part
    # of the fold only, so validation/test rows never contribute their target.
    # fit_transform() uses an internal shuffled cross-fitting scheme; seeding
    # it keeps the encoded features (and therefore oof_lr) reproducible,
    # in line with the seeded KFold above.
    te = TargetEncoder(smooth="auto", target_type="continuous", random_state=42)
    X_tr_enc = X_tr.copy()
    X_val_enc = X_val.copy()
    X_test_enc = X_test_lr.copy()

    X_tr_enc[CATS] = te.fit_transform(X_tr[CATS], y_tr)
    X_val_enc[CATS] = te.transform(X_val[CATS])
    X_test_enc[CATS] = te.transform(X_test_lr[CATS])

    # Fit Linear Regression
    lr = LinearRegression()
    lr.fit(X_tr_enc, y_tr)

    # Predict & clip to the [0, 100] range used for all predictions in this notebook
    val_pred = np.clip(lr.predict(X_val_enc), 0, 100)
    test_pred = np.clip(lr.predict(X_test_enc), 0, 100)

    oof_lr[val_idx] = val_pred
    test_preds_lr[:, fold] = test_pred

    # Calculate RMSE on this fold's held-out rows
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    print(f"  → Fold RMSE: {rmse:.6f}")

# RMSE over all training rows, each predicted by a model that did not see it
oof_rmse = np.sqrt(mean_squared_error(y_train_lr, oof_lr))
print(f"\n✓ Linear Regression OOF RMSE: {oof_rmse:.6f}")

In [None]:
# Section 5: Model Configuration - XGBoost with LR predictions as feature
print("Preparing data for XGBoost with LR predictions as a strong feature...\n")

# Stack train rows on top of test rows so that every categorical column is
# converted once, giving both splits the same set of categories.
full_df = pd.concat([train_df[all_features], test_df[all_features]], axis=0)

# Pandas 'category' dtype is what XGBoost's native categorical support consumes
full_df = full_df.astype({cat_col: 'category' for cat_col in CATS})

# Split back by position and attach the LR prediction as an extra feature:
# OOF predictions for training rows, fold-averaged predictions for test rows.
X_xgb = full_df.iloc[:N_TRAIN].assign(lr_pred=oof_lr)
X_test_xgb = full_df.iloc[N_TRAIN:N_TRAIN + N_TEST].assign(
    lr_pred=test_preds_lr.mean(axis=1)
)

y_xgb = train_df[TARGET]

print("✓ Data preparation complete")
print(f"XGBoost training samples: {X_xgb.shape}")
print(f"XGBoost test samples: {X_test_xgb.shape}\n")

In [None]:
# Section 6: XGBoost Training with 7-Fold Cross-Validation
# Trains one XGBoost regressor per fold, collecting out-of-fold predictions
# for the training rows and one test prediction vector per fold.
FOLDS_XGB = 7

# Report the configured fold count instead of a hard-coded number
print(f"Starting XGBoost training with {FOLDS_XGB}-Fold Cross-Validation\n")

xgb_params = {
    'n_estimators': 10000,          # upper bound; early stopping picks the actual count
    'learning_rate': 0.007,
    'max_depth': 7,
    'subsample': 0.8,
    'reg_lambda': 3,
    'colsample_bytree': 0.6,
    'colsample_bynode': 0.7,
    'tree_method': 'hist',
    'random_state': 42,
    'eval_metric': 'rmse',
    'enable_categorical': True,
    # Early stopping is an estimator parameter in current XGBoost releases;
    # passing it to fit() is deprecated and rejected by newer versions.
    'early_stopping_rounds': 100,
    # 'verbosity' is the scikit-learn API name; 'verbose' is not a model
    # parameter and would only be forwarded as an unused booster argument.
    'verbosity': 0,
}

kf_xgb = KFold(n_splits=FOLDS_XGB, shuffle=True, random_state=42)

oof_xgb = np.zeros(N_TRAIN)   # one OOF prediction per training row
test_preds_xgb = []           # one array of test predictions per fold
fold_rmses = []

for fold, (trn_idx, val_idx) in enumerate(kf_xgb.split(X_xgb)):
    print(f"XGB Fold {fold+1}/{FOLDS_XGB}")

    X_tr = X_xgb.iloc[trn_idx]
    y_tr = y_xgb.iloc[trn_idx]
    X_val = X_xgb.iloc[val_idx]
    y_val = y_xgb.iloc[val_idx]

    # Train XGBoost; the validation fold doubles as the early-stopping set.
    # NOTE(review): the fold RMSE is therefore measured on the same rows that
    # chose the stopping point, so it may be slightly optimistic.
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_tr, y_tr,
              eval_set=[(X_val, y_val)],
              verbose=False)

    # Predictions, clipped to [0, 100]
    val_pred = np.clip(model.predict(X_val), 0, 100)
    oof_xgb[val_idx] = val_pred

    # Calculate fold RMSE
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    fold_rmses.append(rmse)
    print(f"  → Fold RMSE: {rmse:.5f}")

    # Test predictions from this fold's model
    test_pred = np.clip(model.predict(X_test_xgb), 0, 100)
    test_preds_xgb.append(test_pred)

final_oof_rmse = np.sqrt(mean_squared_error(y_xgb, oof_xgb))
print(f"\n✓ XGBoost Final OOF RMSE: {final_oof_rmse:.5f}")
print(f"✓ Mean Fold RMSE: {np.mean(fold_rmses):.5f} ± {np.std(fold_rmses):.5f}")

In [None]:
# Section 7 & 8: Generate Predictions and Create Submission
print("Generating final predictions and creating submission...\n")

# Average the per-fold test predictions, then bound the result to [0, 100]
ensemble_test_preds = np.clip(np.mean(test_preds_xgb, axis=0), 0, 100)

# Write the predictions into the submission template and save it.
# The assignment is positional, so the template rows are taken to be in the
# same order as the test rows.
submission_df['exam_score'] = ensemble_test_preds
submission_df.to_csv('submission.csv', index=False)

banner = "=" * 60
print(banner)
print("SUBMISSION READY")
print(banner)
print("\nSubmission file: submission.csv")
print(f"Shape: {submission_df.shape}")
print("\nFirst 10 predictions:")
print(submission_df.head(10))

# Distribution of the submitted scores
scores = submission_df['exam_score']
print("\n\nSummary Statistics:")
print(f"Min score: {scores.min():.2f}")
print(f"Max score: {scores.max():.2f}")
print(f"Mean score: {scores.mean():.2f}")
print(f"Std score: {scores.std():.2f}")

# Recap of the out-of-fold scores computed in the earlier cells
print("\nValidation Metrics:")
print(f"Linear Regression OOF RMSE: {oof_rmse:.6f}")
print(f"XGBoost OOF RMSE: {final_oof_rmse:.5f}")
print("\n✓ Submission complete!")