In [7]:
# ==============================================================================
# PROJECT: Kaggle Playground Series - S6E2 (Heart Disease)
# MISSION: Feature Engineering & Cross-Validation
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# --- STEP 1: MOUNT & LOAD ---
# Attach Google Drive to the Colab runtime so the competition CSVs are reachable.
drive.mount('/content/drive')

# Drive locations of the training and test tables.
# NOTE(review): the file banner says S6E2 but this folder is named
# "S6E1 - heart" -- confirm these are the intended competition files.
TRAIN_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/train.csv'
TEST_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/test.csv'

# Read both tables; unpacking order matches (TRAIN_PATH, TEST_PATH).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# --- STEP 2: THE FORGE (FEATURE ENGINEERING) ---
# I am creating 'Interaction Features' to help the model see hidden risks.

def engineer_features(df):
    """Return a copy of ``df`` with optional interaction features added.

    Each feature is created only when both of its source columns exist:

    * ``Age_BP_Risk``   = ``Age * RestingBP``
    * ``Chol_HR_Ratio`` = ``Cholesterol / (MaxHR + 1)``

    The input frame is never modified. When a source column is missing the
    feature is skipped and a ``UserWarning`` is emitted, so a column-name
    mismatch cannot silently yield a dataset with no engineered features.

    Args:
        df: pandas DataFrame holding the raw columns.

    Returns:
        A new DataFrame with the original columns plus every feature whose
        source columns were available.
    """
    import warnings  # local import: the file's import block does not provide it

    df = df.copy()

    # 1. Risk score: product of age and resting blood pressure.
    missing = [name for name in ('Age', 'RestingBP') if name not in df.columns]
    if not missing:
        df['Age_BP_Risk'] = df['Age'] * df['RestingBP']
    else:
        warnings.warn(
            f"Skipping 'Age_BP_Risk': missing column(s) {missing}",
            stacklevel=2,
        )

    # 2. Cholesterol relative to max heart rate; the +1 avoids dividing by
    #    zero when MaxHR is 0.
    missing = [name for name in ('Cholesterol', 'MaxHR') if name not in df.columns]
    if not missing:
        df['Chol_HR_Ratio'] = df['Cholesterol'] / (df['MaxHR'] + 1)
    else:
        warnings.warn(
            f"Skipping 'Chol_HR_Ratio': missing column(s) {missing}",
            stacklevel=2,
        )

    return df

# Apply the same feature engineering to both tables so their columns stay aligned.
train = engineer_features(train)
test = engineer_features(test)

# --- STEP 3: ENCODING ---
# Map the string target to 0/1. Series.map yields NaN for any label missing
# from the mapping, so fail here with a clear message instead of letting NaNs
# reach the cross-validation loop.
target_mapping = {'Absence': 0, 'Presence': 1}
y = train['Heart Disease'].map(target_mapping)
if y.isna().any():
    unexpected = sorted(train.loc[y.isna(), 'Heart Disease'].astype(str).unique())
    raise ValueError(f"Unmapped 'Heart Disease' labels: {unexpected}")

# Every column except the row identifier and the target is a model input.
features = [col for col in train.columns if col not in ['id', 'Heart Disease']]

X = train[features].copy()
X_test = test[features].copy()

# Integer-encode string columns. The categories are learned from the training
# column and reused for the test column, so a given string always maps to the
# same code in both frames (encoding each frame independently can assign
# different codes to the same value when the two value sets differ).
# Missing values, and test values never seen in training, get code -1.
for col in X.select_dtypes(include=['object']).columns:
    train_cat = X[col].astype('category')
    X[col] = train_cat.cat.codes
    X_test[col] = X_test[col].astype(train_cat.dtype).cat.codes

# --- STEP 4: CROSS-VALIDATION (The "Polygraph" Test) ---
# Instead of one split, train on several stratified folds to check stability,
# and average the per-fold test predictions into a single blend.
# NOTE(review): each fold's validation data is used both for early stopping
# and for the reported AUC, so the CV score may be somewhat optimistic.

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the fold count: the banner and the averaging
# divisor below both follow the splitter, so changing n_splits cannot
# silently mis-scale the blended test predictions.
n_folds = skf.get_n_splits()
test_preds = np.zeros(len(X_test))
cv_scores = []

# Hyperparameters are identical for every fold, so define them once.
xgb_params = dict(
    n_estimators=2000,         # upper bound on boosting rounds
    learning_rate=0.03,        # slower learning rate for precision
    max_depth=7,
    tree_method='hist',
    eval_metric='auc',
    early_stopping_rounds=50,  # monitored on the eval_set passed to fit()
    random_state=42,
)

print(f" Starting {n_folds}-Fold Cross-Validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh model per fold so no state leaks between folds.
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

    # Score the held-out fold on the positive-class probability.
    val_probs = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, val_probs)
    cv_scores.append(score)

    # Accumulate this fold's share of the averaged test prediction.
    test_preds += model.predict_proba(X_test)[:, 1] / n_folds

    print(f" Fold {fold+1} AUC: {score:.5f}")

print(f"\n Mean CV AUC: {np.mean(cv_scores):.5f}")

# --- STEP 5: FINAL SUBMISSION ---
# Pair each test-row id with its fold-averaged predicted probability and
# write the result without the DataFrame index.
submission = test[['id']].assign(**{'Heart Disease': test_preds})
submission.to_csv('submission_v2_engineered.csv', index=False)
print(" 'submission_v2_engineered.csv' generated!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Starting 5-Fold Cross-Validation...
 Fold 1 AUC: 0.95525
 Fold 2 AUC: 0.95435
 Fold 3 AUC: 0.95516
 Fold 4 AUC: 0.95458
 Fold 5 AUC: 0.95548

 Mean CV AUC: 0.95496
 'submission_v2_engineered.csv' generated!
