In [1]:
# %% 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

# --- Memory Optimization ---
def reduce_memory_usage(df):
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type == object:
            if df[col].nunique() / len(df[col]) < 0.5:
                df[col] = df[col].astype('category')
        elif col_type.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif col_type.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"üîß Memory reduced: {start_mem:.2f}MB ‚Üí {end_mem:.2f}MB ({100*(start_mem-end_mem)/start_mem:.1f}%)")
    return df

# --- Load & preprocess ---
df = pd.read_csv("train.csv")
df = reduce_memory_usage(df)

X = df.drop(columns=['ASI_category', 'ID'])
y = df['ASI_category'].astype('category').cat.codes

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("‚úÖ Data ready:", X_train.shape, X_val.shape)


üîß Memory reduced: 3.74MB ‚Üí 1.35MB (63.8%)
‚úÖ Data ready: (14522, 19) (3631, 19)


In [2]:
# %% 
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import early_stopping, log_evaluation

# --- Base models ---
best_xgb = XGBClassifier(
    n_estimators=800, learning_rate=0.045, max_depth=8,
    subsample=0.7, colsample_bytree=0.8, random_state=42,
    eval_metric="mlogloss", n_jobs=-1, use_label_encoder=False
)
best_lgbm = LGBMClassifier(
    n_estimators=400, learning_rate=0.055, num_leaves=40,
    max_depth=14, subsample=0.85, colsample_bytree=0.95,
    reg_alpha=1e-5, reg_lambda=0.02, min_child_samples=18,
    random_state=42, n_jobs=-1
)
best_rf = RandomForestClassifier(
    n_estimators=600, max_depth=10, min_samples_split=4,
    min_samples_leaf=2, max_features=0.8, criterion="log_loss",
    random_state=42, n_jobs=-1
)

print("üîß Training base models...")
best_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
best_lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)],
              callbacks=[early_stopping(80), log_evaluation(0)])
best_rf.fit(X_train, y_train)
print("‚úÖ Base models trained.")


üîß Training base models...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3847
[LightGBM] [Info] Number of data points in the train set: 14522, number of used features: 19
[LightGBM] [Info] Start training from score -1.755382
[LightGBM] [Info] Start training from score -0.355142
[LightGBM] [Info] Start training from score -2.070802
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[123]	valid_0's multi_logloss: 0.157847
‚úÖ Base models trained.


In [3]:
# %% 
# --- Weighted ensemble optimization ---
from itertools import product

xgb_p, lgbm_p, rf_p = best_xgb.predict_proba(X_val), best_lgbm.predict_proba(X_val), best_rf.predict_proba(X_val)
grid = np.arange(0.1, 1.1, 0.1)
best_f1, best_w = 0, None

for w1, w2, w3 in product(grid, repeat=3):
    probs = (w1*xgb_p + w2*lgbm_p + w3*rf_p) / (w1+w2+w3)
    preds = np.argmax(probs, axis=1)
    f1 = f1_score(y_val, preds, average="macro")
    if f1 > best_f1: best_f1, best_w = f1, (w1, w2, w3)

print(f"üî• Best weights found: {best_w} | F1 = {best_f1:.4f}")
final_probs = (best_w[0]*xgb_p + best_w[1]*lgbm_p + best_w[2]*rf_p) / sum(best_w)
val_preds = np.argmax(final_probs, axis=1)
print("\n‚úÖ Ensemble Performance:")
print(f"Accuracy: {accuracy_score(y_val, val_preds):.4f} | F1: {f1_score(y_val, val_preds, average='macro'):.4f}")
print(classification_report(y_val, val_preds))


üî• Best weights found: (np.float64(1.0), np.float64(0.2), np.float64(0.1)) | F1 = 0.9227

‚úÖ Ensemble Performance:
Accuracy: 0.9435 | F1: 0.9227
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       628
           1       0.96      0.97      0.96      2546
           2       0.92      0.89      0.91       457

    accuracy                           0.94      3631
   macro avg       0.93      0.92      0.92      3631
weighted avg       0.94      0.94      0.94      3631



In [4]:
# %% 
# --- Calibration + Meta-stacking ---
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

print("‚öñÔ∏è Calibrating base models...")
cal_xgb = CalibratedClassifierCV(best_xgb, method="isotonic", cv="prefit")
cal_lgbm = CalibratedClassifierCV(best_lgbm, method="isotonic", cv="prefit")
cal_rf   = CalibratedClassifierCV(best_rf, method="isotonic", cv="prefit")
for model in [cal_xgb, cal_lgbm, cal_rf]:
    model.fit(X_val, y_val)

stack_X = np.hstack([cal_xgb.predict_proba(X_val), cal_lgbm.predict_proba(X_val), cal_rf.predict_proba(X_val)])
meta_lr = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
meta_lr.fit(stack_X, y_val)

stack_preds = np.argmax(meta_lr.predict_proba(stack_X), axis=1)
print(f"üèÅ Calibrated Meta Accuracy: {accuracy_score(y_val, stack_preds):.4f}")
print(f"üèÅ Calibrated Meta F1: {f1_score(y_val, stack_preds, average='macro'):.4f}")


‚öñÔ∏è Calibrating base models...




üèÅ Calibrated Meta Accuracy: 0.9433
üèÅ Calibrated Meta F1: 0.9218




In [5]:
# %% 
# --- Meta Feature Engineering + Pseudo-Labeling ---
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

print("üß™ Running Advanced Ensemble Refinement...")
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly, X_val_poly = poly.fit_transform(X_train), poly.transform(X_val)
scaler = StandardScaler()
X_train_poly, X_val_poly = scaler.fit_transform(X_train_poly), scaler.transform(X_val_poly)

xgb_p, lgbm_p, rf_p = cal_xgb.predict_proba(X_val), cal_lgbm.predict_proba(X_val), cal_rf.predict_proba(X_val)
meta_features = np.hstack([xgb_p, lgbm_p, rf_p, X_val_poly])

meta_lr = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=2.0, max_iter=2000, random_state=42)
meta_lr.fit(meta_features, y_val)
meta_preds = meta_lr.predict(meta_features)

print(f"üèÅ Meta-LogReg F1: {f1_score(y_val, meta_preds, average='macro'):.4f}")
confidence = np.max(meta_lr.predict_proba(meta_features), axis=1)
pseudo_idx = np.where(confidence >= 0.95)[0]

if len(pseudo_idx) > 0:
    X_val_sel = X_val.iloc[pseudo_idx]
    val_aug = np.hstack([
        cal_xgb.predict_proba(X_val_sel),
        cal_lgbm.predict_proba(X_val_sel),
        cal_rf.predict_proba(X_val_sel),
        X_val_poly[pseudo_idx]
    ])
    X_aug = np.vstack([meta_features, val_aug])
    y_aug = np.concatenate([y_val, y_val.iloc[pseudo_idx]])
    meta_lr.fit(X_aug, y_aug)
    final_preds = meta_lr.predict(meta_features)
    print(f"üöÄ Pseudo-Labeled F1: {f1_score(y_val, final_preds, average='macro'):.4f}")
else:
    print("‚ö†Ô∏è No pseudo-labels added.")


üß™ Running Advanced Ensemble Refinement...




üèÅ Meta-LogReg F1: 0.9447




üöÄ Pseudo-Labeled F1: 0.9419
