In [2]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.isotonic import IsotonicRegression

from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

warnings.filterwarnings('ignore')

Step 1/6: Loading data and engineering v4 features.


In [9]:
# Load the raw train/test tables. A missing file is fatal for every later
# cell, so report it and re-raise instead of continuing with undefined
# dataframes (which would only surface later as a confusing NameError).
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
except FileNotFoundError:
    print("Error: 'train.csv' or 'test.csv' not found.")
    raise

# "v4" engineered features, added identically to both frames.
for df in [train_df, test_df]:
    # Gap between (rain + snow) and the reported total precipitation.
    df['Precip_Check'] = (df['Rainfall'] + df['Snowfall']) - df['Precipitation']
    # Mean-sea-level pressure minus surface pressure.
    df['Pressure_Diff'] = df['Pressure_MSL'] - df['Surface_Pressure']
    # Temperature minus dew point.
    df['Temp_Dew_Diff'] = df['Temperature'] - df['Dew_Point']
    # Ratios use a small epsilon in the denominator to avoid division by zero.
    df['Wind_Gust_Ratio'] = df['Wind_Gusts'] / (df['Wind_Speed'] + 1e-6)
    df['Sunshine_Fraction'] = df['Sunshine_Duration'] / (df['Daylight_Duration'] + 1e-6)
    df['Temp_Rad_Interaction'] = df['Temperature'] * df['Radiation']

# Any remaining +/-inf values are mapped to 0 in both frames.
# Rebinding (instead of inplace=True) keeps the cell free of hidden mutation.
train_df = train_df.replace([np.inf, -np.inf], 0)
test_df = test_df.replace([np.inf, -np.inf], 0)

print("v4 features created and added to dataframes.")

v4 features created and added to dataframes.


Step 2/6: Preparing data (encoding target, calculating weights)

In [10]:
# Split the frames into identifiers, raw target and predictor matrices.
test_ids = test_df['ID']
y_raw = train_df['ASI_category']
X = train_df.drop(columns=['ID', 'ASI_category'])
X_test = test_df.drop(columns='ID')

# Encode the string class labels as consecutive integers.
le = LabelEncoder()
y_true = le.fit_transform(y_raw)
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"Class mapping: {class_mapping}")

# Per-row weights from the 'balanced' class-weight heuristic.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_true)

# Dense one-hot view of the target, used later for per-class calibration.
target_column = y_true.reshape(-1, 1)
y_true_one_hot = OneHotEncoder().fit_transform(target_column).toarray()

print("Data preparation complete.")

Class mapping: {'Good': np.int64(0), 'Moderate': np.int64(1), 'Poor': np.int64(2)}
Data preparation complete.


Step 3/6: Defining all optimized model parameters

In [11]:
# Tuned hyper-parameters for the three gradient-boosting base models.
# Every model shares learning rate 0.05, a 2000-round budget and seed 42.

# XGBoost: 3-class soft-probability objective, histogram tree builder.
params_xgb = dict(
    max_depth=9,
    subsample=0.8266,
    colsample_bytree=0.6577,
    min_child_weight=1,
    reg_alpha=0.0010,
    reg_lambda=0.3525,
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    n_jobs=-1,
    tree_method='hist',
    random_state=42,
    n_estimators=2000,
    learning_rate=0.05,
)

# LightGBM: multiclass objective evaluated with multi_logloss.
params_lgbm = dict(
    num_leaves=45,
    max_depth=7,
    subsample=0.7007,
    colsample_bytree=0.6263,
    min_child_samples=20,
    reg_alpha=0.00024,
    reg_lambda=0.0042,
    objective='multiclass',
    num_class=3,
    metric='multi_logloss',
    random_state=42,
    n_jobs=-1,
    n_estimators=2000,
    learning_rate=0.05,
)

# CatBoost: MultiClass loss on CPU with Bernoulli bootstrap.
params_cat = dict(
    depth=8,
    subsample=0.7317,
    l2_leaf_reg=0.1863,
    random_strength=0.1576,
    iterations=2000,
    learning_rate=0.05,
    random_seed=42,
    eval_metric='MultiClass',
    loss_function='MultiClass',
    task_type='CPU',
    bootstrap_type='Bernoulli',
)

print("Parameters saved.")

Parameters saved.


Step 4/6: Starting 3-model CV training

In [12]:
# Each library gets its own copy of the feature frames so that
# model-specific dtype changes do not leak into the other models' inputs.
X_lgbm = X.copy()
X_test_lgbm = X_test.copy()
X_cat = X.copy()
X_test_cat = X_test.copy()

# LightGBM: give 'Weather_Code' one shared categorical dtype covering every
# code seen in train OR test, so both frames encode categories identically.
# (Series.unique() replaces pd.unique(list), which is deprecated for plain
# Python lists in pandas 2.x; order of first appearance is preserved.)
all_weather_codes = pd.concat(
    [X_lgbm['Weather_Code'], X_test_lgbm['Weather_Code']], ignore_index=True
).unique()
cat_dtype = pd.api.types.CategoricalDtype(categories=all_weather_codes, ordered=False)
X_lgbm['Weather_Code'] = X_lgbm['Weather_Code'].astype(cat_dtype)
X_test_lgbm['Weather_Code'] = X_test_lgbm['Weather_Code'].astype(cat_dtype)

# CatBoost takes the categorical column by positional index.
cat_features_indices_cat = [X_cat.columns.get_loc('Weather_Code')]

N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold (OOF) and fold-averaged test probabilities, one column per class.
oof_preds_xgb = np.zeros((len(train_df), 3))
test_preds_xgb = np.zeros((len(test_df), 3))
oof_preds_lgbm = np.zeros((len(train_df), 3))
test_preds_lgbm = np.zeros((len(test_df), 3))
oof_preds_cat = np.zeros((len(train_df), 3))
test_preds_cat = np.zeros((len(test_df), 3))

for fold, (train_index, val_index) in enumerate(skf.split(X, y_true)):
    print(f"\n  --- FOLD {fold+1}/{N_SPLITS} ---")

    y_train, y_val = y_true[train_index], y_true[val_index]
    # Balanced class weights restricted to this fold's training rows.
    fold_sample_weights = sample_weights[train_index]

    print("    Training XGBoost...")
    X_train_xgb, X_val_xgb = X.iloc[train_index], X.iloc[val_index]
    model_xgb = XGBClassifier(**params_xgb)
    model_xgb.fit(
        X_train_xgb, y_train, eval_set=[(X_val_xgb, y_val)],
        early_stopping_rounds=100,
        sample_weight=fold_sample_weights, verbose=False
    )
    oof_preds_xgb[val_index] = model_xgb.predict_proba(X_val_xgb)
    # iteration_range is half-open [start, end) and best_iteration is 0-based,
    # so the end must be best_iteration + 1 to include the best round. The
    # previous (0, best_iteration) dropped it, making the test predictions use
    # a different model than the OOF predictions above.
    test_preds_xgb += model_xgb.predict_proba(
        X_test, iteration_range=(0, model_xgb.best_iteration + 1)
    ) / N_SPLITS

    print("    Training LightGBM...")
    X_train_lgbm, X_val_lgbm = X_lgbm.iloc[train_index], X_lgbm.iloc[val_index]
    model_lgbm = lgb.LGBMClassifier(**params_lgbm)
    model_lgbm.fit(
        X_train_lgbm, y_train, eval_set=[(X_val_lgbm, y_val)],
        callbacks=[lgb.early_stopping(100, verbose=False)],
        sample_weight=fold_sample_weights,
        categorical_feature=['Weather_Code']
    )
    oof_preds_lgbm[val_index] = model_lgbm.predict_proba(X_val_lgbm)
    test_preds_lgbm += model_lgbm.predict_proba(X_test_lgbm, num_iteration=model_lgbm.best_iteration_) / N_SPLITS

    print("    Training CatBoost...")
    X_train_cat, X_val_cat = X_cat.iloc[train_index], X_cat.iloc[val_index]
    model_cat = CatBoostClassifier(**params_cat)
    model_cat.fit(
        X_train_cat, y_train, eval_set=[(X_val_cat, y_val)],
        early_stopping_rounds=100,
        sample_weight=fold_sample_weights,
        cat_features=cat_features_indices_cat,
        verbose=False
    )
    oof_preds_cat[val_index] = model_cat.predict_proba(X_val_cat)
    test_preds_cat += model_cat.predict_proba(X_test_cat) / N_SPLITS


  --- FOLD 1/5 ---
    Training XGBoost...
    Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001025 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5369
[LightGBM] [Info] Number of data points in the train set: 14522, number of used features: 25
[LightGBM] [Info] Start training from score -1.098808
[LightGBM] [Info] Start training from score -1.098708
[LightGBM] [Info] Start training from score -1.098321
    Training CatBoost...

  --- FOLD 2/5 ---
    Training XGBoost...
    Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5373
[LightGBM] [Info] Number of data points in the train set: 14522, number of used features: 25
[LightGBM] [Info] Start training from score -1.098808
[LightGBM] [Info] Start training from score

Step 5/6: Training 'manager' (stacking) model

In [13]:
# Level-2 design matrices: the three base models' class-probability blocks
# placed side by side (XGBoost | LightGBM | CatBoost).
base_oof_blocks = [oof_preds_xgb, oof_preds_lgbm, oof_preds_cat]
base_test_blocks = [test_preds_xgb, test_preds_lgbm, test_preds_cat]
X_meta_train = np.concatenate(base_oof_blocks, axis=1)
X_meta_test = np.concatenate(base_test_blocks, axis=1)

# The "manager": a logistic regression fitted on the OOF probabilities.
meta_model = LogisticRegression(random_state=42, n_jobs=-1)
meta_model.fit(X_meta_train, y_true)

# Stacked class probabilities for the test rows and for the training rows.
test_probs_from_stack = meta_model.predict_proba(X_meta_test)
oof_probs_from_stack = meta_model.predict_proba(X_meta_train)

print("Stacking manager model trained.")

Stacking manager model trained.


Step 6/6: Calibrating final probabilities and creating submission

In [3]:
# One isotonic calibrator per class, fitted one-vs-rest on the stacked
# training probabilities against the one-hot target column.
n_classes = oof_probs_from_stack.shape[1]
calibrators = [
    IsotonicRegression(out_of_bounds='clip', y_min=1e-6, y_max=1-1e-6).fit(
        oof_probs_from_stack[:, class_idx], y_true_one_hot[:, class_idx]
    )
    for class_idx in range(n_classes)
]

# Calibrate each class column of the test probabilities, then renormalize
# so every row sums to one again.
calibrated_test_probs = np.zeros_like(test_probs_from_stack)
for class_idx, calibrator in enumerate(calibrators):
    calibrated_test_probs[:, class_idx] = calibrator.transform(test_probs_from_stack[:, class_idx])
calibrated_test_probs /= calibrated_test_probs.sum(axis=1, keepdims=True)

# Same treatment for the training-side probabilities, used for scoring.
calibrated_oof_probs = np.zeros_like(oof_probs_from_stack)
for class_idx, calibrator in enumerate(calibrators):
    calibrated_oof_probs[:, class_idx] = calibrator.transform(oof_probs_from_stack[:, class_idx])
calibrated_oof_probs /= calibrated_oof_probs.sum(axis=1, keepdims=True)

final_oof_preds = calibrated_oof_probs.argmax(axis=1)
final_f1_score = f1_score(y_true, final_oof_preds, average='macro')

print(f"\nFinal calibrated OOF F1 Score: {final_f1_score:.6f}")

# Decode the arg-max class back to the original string labels and write
# the submission file.
final_test_preds = calibrated_test_probs.argmax(axis=1)
final_test_labels = le.inverse_transform(final_test_preds)

submission_df = pd.DataFrame({'ID': test_ids, 'Target': final_test_labels})

submission_df.to_csv("submission_.csv", index=False)

print("Final submission file created.")

NameError: name 'test_probs_from_stack' is not defined

## New model

In [15]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB # <-- Our new model
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

warnings.filterwarnings('ignore')

# --- 1. Load v4 Data (from our previous 'train_df' variable) ---
print("Step 1/3: Loading and preparing v4 data...")
# This cell reuses objects built by the earlier data-preparation cells.
# Fail fast if any of them is missing (e.g. after a kernel restart) instead
# of printing a message and crashing later with an unrelated error.
_required_names = (
    'train_df', 'test_df', 'X', 'X_test',
    'y_true', 'sample_weights', 'test_ids', 'le',
)
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        "Error: 'train_df' or 'X' not found. Please re-run Cell 2 from our previous script. "
        f"Missing: {', '.join(_missing_names)}"
    )

# --- 2. Define All Model Parameters ---
# (Using the same tuned params as before)
# Shared settings: learning rate 0.05, 2000 boosting rounds, seed 42.

# XGBoost: 3-class soft-probability objective, histogram tree builder.
params_xgb = dict(
    max_depth=9,
    subsample=0.8266,
    colsample_bytree=0.6577,
    min_child_weight=1,
    reg_alpha=0.0010,
    reg_lambda=0.3525,
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    n_jobs=-1,
    tree_method='hist',
    random_state=42,
    n_estimators=2000,
    learning_rate=0.05,
)

# LightGBM: multiclass objective evaluated with multi_logloss.
params_lgbm = dict(
    num_leaves=45,
    max_depth=7,
    subsample=0.7007,
    colsample_bytree=0.6263,
    min_child_samples=20,
    reg_alpha=0.00024,
    reg_lambda=0.0042,
    objective='multiclass',
    num_class=3,
    metric='multi_logloss',
    random_state=42,
    n_jobs=-1,
    n_estimators=2000,
    learning_rate=0.05,
)

# CatBoost: MultiClass loss on CPU with Bernoulli bootstrap.
params_cat = dict(
    depth=8,
    subsample=0.7317,
    l2_leaf_reg=0.1863,
    random_strength=0.1576,
    iterations=2000,
    learning_rate=0.05,
    random_seed=42,
    eval_metric='MultiClass',
    loss_function='MultiClass',
    task_type='CPU',
    bootstrap_type='Bernoulli',
)

# --- 3. Prepare Data Copies for different models ---
# Separate copies so model-specific dtype changes do not affect other models.
X_lgbm = X.copy()
X_test_lgbm = X_test.copy()
X_cat = X.copy()
X_test_cat = X_test.copy()
X_nb = X.copy() # For Naive Bayes
X_test_nb = X_test.copy()

# Fix categoricals for LGBM: one shared categorical dtype covering every
# code seen in train OR test, so both frames encode categories identically.
# (Series.unique() replaces pd.unique(list), which is deprecated for plain
# Python lists in pandas 2.x; order of first appearance is preserved.)
all_weather_codes = pd.concat(
    [X_lgbm['Weather_Code'], X_test_lgbm['Weather_Code']], ignore_index=True
).unique()
cat_dtype = pd.api.types.CategoricalDtype(categories=all_weather_codes, ordered=False)
X_lgbm['Weather_Code'] = X_lgbm['Weather_Code'].astype(cat_dtype)
X_test_lgbm['Weather_Code'] = X_test_lgbm['Weather_Code'].astype(cat_dtype)
# Get categorical index for CatBoost
cat_features_indices_cat = [X_cat.columns.get_loc('Weather_Code')]

# --- 4. Set up CV and Prediction Arrays ---
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold (OOF) and fold-averaged test probabilities for the 4 models,
# one column per class.
oof_preds_xgb = np.zeros((len(train_df), 3))
test_preds_xgb = np.zeros((len(test_df), 3))
oof_preds_lgbm = np.zeros((len(train_df), 3))
test_preds_lgbm = np.zeros((len(test_df), 3))
oof_preds_cat = np.zeros((len(train_df), 3))
test_preds_cat = np.zeros((len(test_df), 3))
oof_preds_nb = np.zeros((len(train_df), 3)) # <-- New
test_preds_nb = np.zeros((len(test_df), 3)) # <-- New

print("Step 2/3: Starting 4-model CV training... (This will take a while)")

# --- 5. Run the 4-Model CV Loop ---
for fold, (train_index, val_index) in enumerate(skf.split(X, y_true)):
    print(f"\n  --- FOLD {fold+1}/{N_SPLITS} ---")

    y_train, y_val = y_true[train_index], y_true[val_index]
    # Balanced class weights restricted to this fold's training rows.
    fold_sample_weights = sample_weights[train_index]

    # --- XGBoost ---
    print("    Training XGBoost...")
    X_train_xgb, X_val_xgb = X.iloc[train_index], X.iloc[val_index]
    model_xgb = XGBClassifier(**params_xgb)
    model_xgb.fit(X_train_xgb, y_train, eval_set=[(X_val_xgb, y_val)], early_stopping_rounds=100, sample_weight=fold_sample_weights, verbose=False)
    oof_preds_xgb[val_index] = model_xgb.predict_proba(X_val_xgb)
    # iteration_range is half-open [start, end) and best_iteration is 0-based,
    # so the end must be best_iteration + 1 to include the best round. The
    # previous (0, best_iteration) dropped it, making the test predictions use
    # a different model than the OOF predictions above.
    test_preds_xgb += model_xgb.predict_proba(
        X_test, iteration_range=(0, model_xgb.best_iteration + 1)
    ) / N_SPLITS

    # --- LightGBM ---
    print("    Training LightGBM...")
    X_train_lgbm, X_val_lgbm = X_lgbm.iloc[train_index], X_lgbm.iloc[val_index]
    model_lgbm = lgb.LGBMClassifier(**params_lgbm)
    model_lgbm.fit(X_train_lgbm, y_train, eval_set=[(X_val_lgbm, y_val)], callbacks=[lgb.early_stopping(100, verbose=False)], sample_weight=fold_sample_weights, categorical_feature=['Weather_Code'])
    oof_preds_lgbm[val_index] = model_lgbm.predict_proba(X_val_lgbm)
    test_preds_lgbm += model_lgbm.predict_proba(X_test_lgbm, num_iteration=model_lgbm.best_iteration_) / N_SPLITS

    # --- CatBoost ---
    print("    Training CatBoost...")
    X_train_cat, X_val_cat = X_cat.iloc[train_index], X_cat.iloc[val_index]
    model_cat = CatBoostClassifier(**params_cat)
    model_cat.fit(X_train_cat, y_train, eval_set=[(X_val_cat, y_val)], early_stopping_rounds=100, sample_weight=fold_sample_weights, cat_features=cat_features_indices_cat, verbose=False)
    oof_preds_cat[val_index] = model_cat.predict_proba(X_val_cat)
    test_preds_cat += model_cat.predict_proba(X_test_cat) / N_SPLITS

    # --- Gaussian Naive Bayes (New Model) ---
    print("    Training GaussianNB...")
    X_train_nb, X_val_nb = X_nb.iloc[train_index], X_nb.iloc[val_index]

    # Standardize with statistics from this fold's training rows only, then
    # apply the same transform to the validation and test rows.
    scaler = StandardScaler()
    X_train_nb_scaled = scaler.fit_transform(X_train_nb)
    X_val_nb_scaled = scaler.transform(X_val_nb)
    X_test_nb_scaled = scaler.transform(X_test_nb) # Scale test data

    model_nb = GaussianNB()
    # GaussianNB.fit accepts sample_weight; pass the same balanced weights the
    # three boosters receive so all base models see the same class weighting.
    model_nb.fit(X_train_nb_scaled, y_train, sample_weight=fold_sample_weights)
    oof_preds_nb[val_index] = model_nb.predict_proba(X_val_nb_scaled)
    test_preds_nb += model_nb.predict_proba(X_test_nb_scaled) / N_SPLITS

print("\nStep 2/3: All 4 models trained successfully.")

# --- 6. Save All 'v4' Prediction Arrays ---
print("Step 3/3: Saving 'v4' prediction arrays...")
all_preds_v4 = {
    'oof_xgb': oof_preds_xgb, 'test_xgb': test_preds_xgb,
    'oof_lgbm': oof_preds_lgbm, 'test_lgbm': test_preds_lgbm,
    'oof_cat': oof_preds_cat, 'test_cat': test_preds_cat,
    'oof_nb': oof_preds_nb, 'test_nb': test_preds_nb, # <-- New
    'y_true': y_true, 'test_ids': test_ids, 'label_encoder': le
}
# The dict is stored as a pickled 0-d object array; read it back with
# np.load(..., allow_pickle=True).item().
np.save('all_model_preds_v4.npy', all_preds_v4, allow_pickle=True)

print("File 'all_model_preds_v4.npy' saved successfully.")

Step 1/3: Loading and preparing v4 data...
Step 2/3: Starting 4-model CV training... (This will take a while)

  --- FOLD 1/5 ---
    Training XGBoost...
    Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5369
[LightGBM] [Info] Number of data points in the train set: 14522, number of used features: 25
[LightGBM] [Info] Start training from score -1.098808
[LightGBM] [Info] Start training from score -1.098708
[LightGBM] [Info] Start training from score -1.098321
    Training CatBoost...
    Training GaussianNB...

  --- FOLD 2/5 ---
    Training XGBoost...
    Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000998 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5373
[LightGBM] [Info] Number of data points in the train 

In [16]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.isotonic import IsotonicRegression
import warnings

warnings.filterwarnings('ignore')

print("Loading 'all_model_preds_v4.npy' for the new 4-model stack...")
# The file holds a pickled dict (0-d object array), hence allow_pickle=True
# and .item(). NOTE(review): unpickling executes arbitrary code, so only load
# a file produced by this notebook, never one from an untrusted source.
try:
    all_preds = np.load('all_model_preds_v4.npy', allow_pickle=True).item()
except FileNotFoundError:
    print("Error: 'all_model_preds_v4.npy' not found. Please run the cell above first.")
    # Nothing below can run without the predictions; stop here instead of
    # failing later with a NameError on `all_preds`.
    raise

# --- 1. Extract all 'v4' arrays ---
# Per-model out-of-fold (train) and fold-averaged (test) class probabilities.
oof_xgb = all_preds['oof_xgb']
test_xgb = all_preds['test_xgb']
oof_lgbm = all_preds['oof_lgbm']
test_lgbm = all_preds['test_lgbm']
oof_cat = all_preds['oof_cat']
test_cat = all_preds['test_cat']
oof_nb = all_preds['oof_nb']
test_nb = all_preds['test_nb']

# Encoded target, test identifiers and the fitted label encoder.
y_true = all_preds['y_true']
test_ids = all_preds['test_ids']
le = all_preds['label_encoder']

# One-hot encode y_true for calibration
y_true_one_hot = OneHotEncoder().fit_transform(y_true.reshape(-1, 1)).toarray()

print("All 'v4' prediction arrays loaded.")

# --- 2. Create the 4-Model Meta-Training Data ---
# Four probability blocks side by side (XGBoost | LightGBM | CatBoost | NB).
X_meta_train = np.concatenate([oof_xgb, oof_lgbm, oof_cat, oof_nb], axis=1)
X_meta_test = np.concatenate([test_xgb, test_lgbm, test_cat, test_nb], axis=1)

# --- 3. Train the Meta-Model (the "Manager") ---
print("\nTraining the 'manager' model on all 4 model predictions...")
meta_model = LogisticRegression(random_state=42, n_jobs=-1)
meta_model.fit(X_meta_train, y_true)

# --- 4. Get Probabilities from the new stack ---
test_probs_from_stack = meta_model.predict_proba(X_meta_test)
oof_probs_from_stack = meta_model.predict_proba(X_meta_train)

# NOTE(review): the meta-model is scored on the same rows it was fitted on,
# so this F1 (and the calibrated one below) is an in-sample estimate.
uncalibrated_labels = oof_probs_from_stack.argmax(axis=1)
f1_uncalibrated = f1_score(y_true, uncalibrated_labels, average='macro')
print(f"4-Model Stack F1 (Uncalibrated): {f1_uncalibrated:.6f}")

# --- 5. Calibrate the 4-Model Stack (Our Best Technique) ---
print("Calibrating the 4-model stack probabilities...")
n_classes = oof_probs_from_stack.shape[1]
# One one-vs-rest isotonic calibrator per class column.
calibrators = [
    IsotonicRegression(out_of_bounds='clip', y_min=1e-6, y_max=1-1e-6).fit(
        oof_probs_from_stack[:, class_idx], y_true_one_hot[:, class_idx]
    )
    for class_idx in range(n_classes)
]

calibrated_test_probs = np.zeros_like(test_probs_from_stack)
for class_idx, calibrator in enumerate(calibrators):
    calibrated_test_probs[:, class_idx] = calibrator.transform(test_probs_from_stack[:, class_idx])

# Renormalize so each row sums to one after per-class calibration.
calibrated_test_probs /= calibrated_test_probs.sum(axis=1, keepdims=True)

# --- 6. Check our final OOF F1 score ---
calibrated_oof_probs = np.zeros_like(oof_probs_from_stack)
for class_idx, calibrator in enumerate(calibrators):
    calibrated_oof_probs[:, class_idx] = calibrator.transform(oof_probs_from_stack[:, class_idx])
calibrated_oof_probs /= calibrated_oof_probs.sum(axis=1, keepdims=True)
final_oof_preds = calibrated_oof_probs.argmax(axis=1)
final_f1_score = f1_score(y_true, final_oof_preds, average='macro')

print(f"\nPrevious Best F1 (3-Model Stack): 0.923168")
print(f"NEW FINAL 4-MODEL STACK F1: {final_f1_score:.6f}")

# --- 7. Create Final Submission ---
print("Creating final submission file...")
final_test_preds = calibrated_test_probs.argmax(axis=1)
# Map integer classes back to the original string labels.
final_test_labels = le.inverse_transform(final_test_preds)

submission_df = pd.DataFrame({'ID': test_ids, 'Target': final_test_labels})
submission_df.to_csv("submission_stacked_v4_calibrated.csv", index=False)

print("\n--- SCRIPT COMPLETE ---")
print("Final submission file 'submission_stacked_v4_calibrated.csv' created successfully.")
print(f"This file was created using our new 4-model stack (OOF F1 Score: {final_f1_score:.6f}).")

Loading 'all_model_preds_v4.npy' for the new 4-model stack...
All 'v4' prediction arrays loaded.

Training the 'manager' model on all 4 model predictions...
4-Model Stack F1 (Uncalibrated): 0.920778
Calibrating the 4-model stack probabilities...

Previous Best F1 (3-Model Stack): 0.923168
NEW FINAL 4-MODEL STACK F1: 0.922632
Creating final submission file...

--- SCRIPT COMPLETE ---
Final submission file 'submission_stacked_v4_calibrated.csv' created successfully.
This file was created using our new 4-model stack (OOF F1 Score: 0.922632).
