In [11]:
# ==============================================================================
# PROJECT: Kaggle Playground Series - S6E2 (Heart Disease)
# MISSION: Pseudo-Labeling (Augmenting the Training Set)
# ==============================================================================

import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from google.colab import drive

# --- STEP 1: MOUNT & LOAD ---
# Mount Google Drive so the CSV files below are readable from the Colab runtime.
drive.mount('/content/drive')

# NOTE(review): the folder is named "S6E1 - heart" while the notebook header
# says S6E2 -- confirm these paths point at the intended competition data.
TRAIN_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/train.csv'
TEST_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/test.csv'

# Read both files into DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Preparation
# Turn the string label into 0/1 and store it on `train` as a 'target' column.
# Series.map leaves NaN for any label that is not a key of the mapping.
target_mapping = {'Absence': 0, 'Presence': 1}
train['target'] = train['Heart Disease'].map(target_mapping)

# Features are every column except the row id and the two label columns;
# the test frame only needs its id column removed.
y = train['target']
X = train.drop(columns=['id', 'Heart Disease', 'target'])
X_test = test.drop(columns=['id'])

# Categorical Encoding
# Integer-encode every object-typed feature column. Both frames are encoded
# against ONE category list built from train and test together: calling
# .astype('category') on each frame separately derives the codes from that
# frame's own set of values, so the same string could receive different
# integers in train and test whenever the two sets differ.
for col in X.columns:
    if X[col].dtype == 'object':
        # Same category inference as before (sorted when the values are
        # sortable), just applied to the combined values of both frames.
        shared_categories = (
            pd.concat([X[col], X_test[col]], ignore_index=True)
            .astype('category')
            .cat.categories
        )
        shared_dtype = pd.CategoricalDtype(categories=shared_categories)
        # Missing values are encoded as -1 in both frames.
        X[col] = X[col].astype(shared_dtype).cat.codes
        X_test[col] = X_test[col].astype(shared_dtype).cat.codes

# --- STEP 2: GENERATE PSEUDO-LABELS ---
# Hyper-parameters carried over from "Trial 4" (the tuning run itself is not
# part of this cell). 'device': 'cuda' requests GPU training.
best_params = dict(
    n_estimators=1000,
    learning_rate=0.0524,
    max_depth=4,
    subsample=0.817,
    colsample_bytree=0.509,
    tree_method='hist',
    device='cuda',
    random_state=42,
)

# First pass: fit on the labelled training rows only, then score the test rows.
model = XGBClassifier(**best_params)
model.fit(X, y)

# Keep the second predict_proba column, i.e. the probability of class 1.
test_probs = model.predict_proba(X_test)[:, 1]

# Identify highly confident predictions: test rows whose predicted probability
# is above 1 - pseudo_limit (confident class 1) or below pseudo_limit
# (confident class 0). This is a cut on the probability itself, not a
# percentile of the test set.
pseudo_limit = 0.02 # Confidence threshold
high_conf_absence = test_probs < pseudo_limit
high_conf_presence = test_probs > (1 - pseudo_limit)
confident_mask = high_conf_presence | high_conf_absence

# Create Pseudo-labeled dataset: the selected test rows plus a hard 0/1 label
# obtained by thresholding their probability at 0.5.
pseudo_X = X_test.loc[confident_mask].copy()
pseudo_y = (test_probs[confident_mask] > 0.5).astype(int)

# --- STEP 3: AUGMENTED TRAINING ---
# Combine original train data with pseudo-labeled test data.
# pseudo_X still carries its test-row index labels, while pd.Series(pseudo_y)
# gets a fresh 0..k-1 index. Without ignore_index the concatenated features and
# labels would end up with different, duplicated index labels; the fit itself
# is positional, but any index-aligned operation on them would silently pair
# the wrong rows. Resetting both to one RangeIndex keeps them aligned.
X_augmented = pd.concat([X, pseudo_X], ignore_index=True)
y_augmented = pd.concat([y, pd.Series(pseudo_y, name=y.name)], ignore_index=True)
assert len(X_augmented) == len(y_augmented), "features and labels must have the same number of rows"

print(f"Original Train Size: {len(X)}")
print(f"Added Pseudo-labels: {len(pseudo_X)}")

# Re-train on the combined set with the same hyper-parameters as the first pass.
final_model = XGBClassifier(**best_params)
final_model.fit(X_augmented, y_augmented)

# --- STEP 4: FINAL SUBMISSION ---
# Score the test rows with the retrained model (probability of class 1).
final_probs = final_model.predict_proba(X_test)[:, 1]

# Two-column submission: the test ids next to their predicted probability.
submission = test[['id']].copy()
submission['Heart Disease'] = final_probs
submission.to_csv('submission_v8_pseudo.csv', index=False)

print("Pseudo-labeled submission 'submission_v8_pseudo.csv' generated.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Original Train Size: 630000
Added Pseudo-labels: 83501
Pseudo-labeled submission 'submission_v8_pseudo.csv' generated.
