In [22]:
# %%
import pandas as pd
import numpy as np

# Load train and test data
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
df_train.head()


Train shape: (18153, 21)
Test shape: (7780, 20)


Unnamed: 0,ID,ASI_category,Temperature,Precipitation,Rainfall,Snowfall,Soil_Temperature,Radiation,Wind_Speed,Wind_Gusts,...,Surface_Pressure,Relative_Humidity,Soil_Moisture,Dew_Point,Sunshine_Duration,Cloud_Cover,Precipitation_Hours,Wind_Direction,Weather_Code,Daylight_Duration
0,19554,Moderate,0.931231,0.000912,0.000912,0.0,0.757673,0.879671,0.179293,0.193029,...,0.538056,55,0.546243,17.564597,53252.08,12.136192,1,176.459082,51,58772.52
1,25205,Moderate,0.566323,0.096715,0.096715,0.0,0.291448,0.008913,0.588384,0.532172,...,0.568475,88,0.557803,5.692134,0.0,91.901341,16,232.433005,61,28143.12
2,771,Poor,0.018033,0.0,0.0,0.0,0.0,0.27734,0.247475,0.189008,...,0.70652,78,0.791908,-25.26442,30213.79,18.85967,0,44.6886,3,34621.43
3,1976,Good,0.717541,0.0,0.0,0.0,0.635669,0.796709,0.123737,0.134048,...,0.5475,57,0.473988,5.913865,44627.21,38.759757,0,333.640418,3,59192.17
4,14036,Moderate,0.82717,0.001825,0.001825,0.0,0.743855,0.781282,0.343434,0.391421,...,0.546378,50,0.459538,9.661455,45267.17,60.058955,1,86.996954,51,59956.03


In [23]:
# %%
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_train['ASI_category_encoded'] = le.fit_transform(df_train['ASI_category'])


In [24]:
def feature_engineering(df):
    # --- Base features ---
    df['Temp_Range_Impact'] = df['Temperature'] * df['Soil_Temperature']
    df['Humidity_Moisture'] = df['Relative_Humidity'] * df['Soil_Moisture']
    df['Effective_Radiation'] = df['Radiation'] * (df['Sunshine_Duration'] / (df['Daylight_Duration'] + 1e-6))
    df['Total_Precip'] = df['Rainfall'] + df['Snowfall'] + df['Precipitation']
    df['Wind_Intensity'] = df['Wind_Speed'] * df['Wind_Gusts']
    df['Pressure_Humidity_Interaction'] = df['Surface_Pressure'] * df['Relative_Humidity']

    # --- Extended interactions ---
    df['Temp_Diff_Air_Soil'] = df['Temperature'] - df['Soil_Temperature']
    df['Temp_Mean'] = (df['Temperature'] + df['Soil_Temperature']) / 2
    df['Temp_Humidity_Index'] = df['Temperature'] * df['Relative_Humidity']
    df['Radiation_Per_Hour'] = df['Radiation'] / (df['Sunshine_Duration'] + 1e-6)
    df['Sunshine_Ratio'] = df['Sunshine_Duration'] / (df['Daylight_Duration'] + 1e-6)
    df['Wind_Stress'] = df['Wind_Speed'] ** 2
    df['Wind_Ratio'] = df['Wind_Gusts'] / (df['Wind_Speed'] + 1e-6)
    df['Humidity_to_Pressure'] = df['Relative_Humidity'] / (df['Surface_Pressure'] + 1e-6)
    df['Radiation_to_Temp'] = df['Radiation'] / (df['Temperature'] + 1e-6)

    # --- Clean NaNs only for numeric columns ---
    df = df.replace([np.inf, -np.inf], np.nan)

    num_cols = df.select_dtypes(include=[np.number]).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # --- Drop duplicate columns ---
    df = df.loc[:, ~df.columns.duplicated()]

    return df
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)


In [25]:
# %%
from sklearn.preprocessing import StandardScaler

num_cols = df_train.select_dtypes(include=[np.number]).columns.drop('ASI_category_encoded')

scaler = StandardScaler()
df_train[num_cols] = scaler.fit_transform(df_train[num_cols])
df_test[num_cols] = scaler.transform(df_test[num_cols])


In [26]:
# %%
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

X = df_train.drop(['ASI_category', 'ASI_category_encoded'], axis=1)
y = df_train['ASI_category_encoded']

xgb_model = XGBClassifier(
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss'
)

xgb_model.fit(X, y)
selector = SelectFromModel(xgb_model, prefit=True, threshold='median')
selected_features = X.columns[selector.get_support()]

print("Number of selected features:", len(selected_features))
print("Selected features:", list(selected_features))


Number of selected features: 18
Selected features: ['Temperature', 'Precipitation', 'Rainfall', 'Soil_Temperature', 'Radiation', 'Relative_Humidity', 'Soil_Moisture', 'Dew_Point', 'Sunshine_Duration', 'Daylight_Duration', 'Temp_Range_Impact', 'Humidity_Moisture', 'Effective_Radiation', 'Temp_Diff_Air_Soil', 'Temp_Mean', 'Temp_Humidity_Index', 'Sunshine_Ratio', 'Radiation_to_Temp']


In [27]:
# %%
X_final = df_train[selected_features]
y_final = df_train['ASI_category_encoded']
X_test_final = df_test[selected_features]


In [28]:
# %% [Train/Test Split + XGBoost Evaluation - Tuned Version]

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# --- Split data ---
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, stratify=y_final, random_state=42
)

# --- Define tuned model ---
model = XGBClassifier(
    n_estimators=150,
    learning_rate=0.01,
    max_depth=5,
    # subsample=0.5921834617441386,
    # colsample_bytree=0.6340862288557617,
    # gamma=1.399534210191034,
    # min_child_weight=3,
    random_state=42,
    use_label_encoder=False,
    # eval_metric="mlogloss",
    n_jobs=-1,
    objective="multi:softprob",
    num_class=len(np.unique(y_final))
)

# --- Train model ---
model.fit(X_train, y_train)

# --- Predictions ---
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# --- Metrics ---
train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)

train_f1 = f1_score(y_train, train_preds, average="macro")
test_f1 = f1_score(y_test, test_preds, average="macro")

# --- Display Results ---
print("\nâœ… Model Performance:")
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy     : {test_acc:.4f}")
print(f"Training F1 Score : {train_f1:.4f}")
print(f"Test F1 Score     : {test_f1:.4f}")

# print("\nðŸ“Š Classification Report (Test):")
# print(classification_report(y_test, test_preds, target_names=le.classes_))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



âœ… Model Performance:
Training Accuracy: 0.9406
Test Accuracy     : 0.9281
Training F1 Score : 0.9184
Test F1 Score     : 0.9026


In [29]:
# # %% [Train Final Model on Full Data and Create submission.csv]

# import xgboost as xgb
# import pandas as pd
# import numpy as np

# # --- Compute average best iteration from CV ---
# best_iter = int(np.mean(best_iterations))
# print(f"âœ… Using Best Iteration from CV: {best_iter}")

# # --- Prepare final training data ---
# X_final = X_final.loc[:, ~X_final.columns.duplicated()]
# y_final = y_final.astype(int)
# X_final = X_final.apply(pd.to_numeric, errors='coerce').fillna(0)

# # --- Convert to DMatrix ---
# dtrain_full = xgb.DMatrix(X_final, label=y_final)

# # --- Parameters ---
# params = {
#     "objective": "multi:softmax",
#     "num_class": len(np.unique(y_final)),
#     "eval_metric": "mlogloss",
#     "learning_rate": 0.03,
#     "max_depth": 8,
#     "subsample": 0.7,
#     "colsample_bytree": 0.7,
#     "gamma": 1.2,
#     "reg_alpha": 0.3,
#     "reg_lambda": 1.0,
#     "min_child_weight": 4,
#     "seed": 42,
#     "nthread": -1,
# }

# # --- Train final model on full data ---
# final_model = xgb.train(
#     params=params,
#     dtrain=dtrain_full,
#     num_boost_round=best_iter
# )

# # --- Prepare test set (use the already loaded one) ---
# test_ids = df_test["ID"].copy()

# # --- Apply same feature engineering ---
# df_test = feature_engineering(df_test)

# # --- Scale numeric features ---
# df_test[num_cols] = scaler.transform(df_test[num_cols])

# # --- Select same features ---
# X_test_final = df_test[selected_features].reindex(columns=X_final.columns, fill_value=0)

# # --- Convert to DMatrix ---
# dtest = xgb.DMatrix(X_test_final)

# # --- Predict ---
# test_preds = final_model.predict(dtest)

# # --- Map predictions back to original labels ---
# submission = pd.DataFrame({
#     "ID": test_ids,
#     "ASI_category": le.inverse_transform(test_preds.astype(int))
# })

# # --- Save submission ---
# submission.to_csv("submission.csv", index=False)
# print("âœ… submission.csv created successfully!")


In [30]:
# %% [Stacking Ensemble Model]

from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# --- Define Base Models ---
base_models = [
    ("xgb", XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1
    )),
    ("lgbm", LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )),
    ("cat", CatBoostClassifier(
        iterations=300,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        random_seed=42
    )),
    ("rf", RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        random_state=42,
        n_jobs=-1
    ))
]

# --- Define Meta Model (Blender) ---
meta_model = LogisticRegression(max_iter=1000, multi_class="multinomial", solver="lbfgs")

# --- Build Stacking Ensemble ---
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False
)

# --- Train Ensemble ---
stacking_model.fit(X_train, y_train)

# --- Evaluate ---
train_preds = stacking_model.predict(X_train)
test_preds = stacking_model.predict(X_test)

train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)
train_f1 = f1_score(y_train, train_preds, average="macro")
test_f1 = f1_score(y_test, test_preds, average="macro")

print("\nâœ… Stacking Ensemble Performance:")
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy     : {test_acc:.4f}")
print(f"Training F1 Score : {train_f1:.4f}")
print(f"Test F1 Score     : {test_f1:.4f}")





âœ… Stacking Ensemble Performance:
Training Accuracy: 0.9731
Test Accuracy     : 0.9446
Training F1 Score : 0.9622
Test F1 Score     : 0.9245


In [31]:
!pip install optuna -q


In [None]:
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

def tune_model_with_optuna(model_name, n_trials=30):
    def objective(trial):
        if model_name == "xgb":
            params = {
                # --- Core boosting parameters ---
                "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
                "max_depth": trial.suggest_int("max_depth", 3, 12),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
                "gamma": trial.suggest_float("gamma", 0.0, 5.0),
                
                # --- Subsampling for regularization ---
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
                "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),

                # --- Regularization parameters ---
                "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),   # L2 regularization
                "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),     # L1 regularization

                # --- Tree construction method ---
                "tree_method": trial.suggest_categorical("tree_method", ["hist", "approx", "auto"]),
                
                # --- Booster type ---
                "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),

                # --- DART-specific (only applies if booster='dart') ---
                "sample_type": trial.suggest_categorical("sample_type", ["uniform", "weighted"]),
                "normalize_type": trial.suggest_categorical("normalize_type", ["tree", "forest"]),
                "rate_drop": trial.suggest_float("rate_drop", 0.0, 0.5),
                "skip_drop": trial.suggest_float("skip_drop", 0.0, 0.5),

                # --- Class imbalance & optimization ---
                "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.8, 2.0),
                "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),

                # --- Misc ---
                "n_jobs": -1,
                "random_state": 42,
                "eval_metric": "mlogloss",
                "objective": "multi:softprob",
                "use_label_encoder": False,
                "num_class": len(np.unique(y_train)),
            }

            model = XGBClassifier(**params)

        elif model_name == "lgbm":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 600),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
                "random_state": 42,
                "n_jobs": -1
            }
            model = LGBMClassifier(**params)

        elif model_name == "cat":
            params = {
                "iterations": trial.suggest_int("iterations", 200, 800),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "depth": trial.suggest_int("depth", 4, 10),
                "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
                "random_seed": 42,
                "verbose": 0
            }
            model = CatBoostClassifier(**params)

        elif model_name == "rf":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 5, 15),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
                "n_jobs": -1,
                "random_state": 42
            }
            model = RandomForestClassifier(**params)

        # --- Train & Evaluate ---
        model.fit(X_train, y_train)
        train_preds = model.predict(X_train)
        test_preds = model.predict(X_test)

        train_f1 = f1_score(y_train, train_preds, average="macro")
        test_f1 = f1_score(y_test, test_preds, average="macro")

        # Objective: minimize overfitting + bad generalization
        f1_gap = abs(train_f1 - test_f1)
        score = f1_gap + (1 - test_f1) * 0.5

        return score

    study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"\nâœ… Best {model_name.upper()} Params:")
    print(study.best_params)
    print(f"Best Objective Value: {study.best_value:.4f}")

    return study


In [33]:
# XGBoost
# xgb_study = tune_model_with_optuna("xgb", n_trials=30)

# LightGBM
# lgbm_study = tune_model_with_optuna("lgbm", n_trials=30)

# # CatBoost
# cat_study = tune_model_with_optuna("cat", n_trials=30)

# # Random Forest
# rf_study = tune_model_with_optuna("rf", n_trials=30)

In [34]:
# Best XGB Params:
# {'n_estimators': 109, 'learning_rate': 0.05041730562003292, 'max_depth': 4, 'subsample': 0.7586762826070333, 'colsample_bytree': 0.8293400793920931, 'gamma': 0.46689011567137706, 'min_child_weight': 5}
# Best Objective Value: 0.0560

# Best LGBM Params:
# {'n_estimators': 103, 'learning_rate': 0.015769130736262276, 'max_depth': 7, 'subsample': 0.9336953602399801, 'colsample_bytree': 0.9123161127609902, 'min_child_samples': 73}
# Best Objective Value: 0.0605

# Best CAT Params:
# {'iterations': 227, 'learning_rate': 0.05573763155224416, 'depth': 7, 'l2_leaf_reg': 4.51792861553295}
# Best Objective Value: 0.0448

# Best RF Params:
# {'n_estimators': 747, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': None}
# Best Objective Value: 0.0672


In [35]:
# %% [Stacking Ensemble Model - Tuned Base Models]

from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score

# --- Define Tuned Base Models ---
base_models = [
    ("xgb", XGBClassifier(
        n_estimators=109,
        learning_rate=0.05041730562003292,
        max_depth=4,
        subsample=0.7586762826070333,
        colsample_bytree=0.8293400793920931,
        gamma=0.46689011567137706,
        min_child_weight=5,
        random_state=42,
        n_jobs=-1,
        eval_metric="mlogloss",
        objective="multi:softprob",
        use_label_encoder=False,
        num_class=len(np.unique(y_final))
    )),
    ("lgbm", LGBMClassifier(
        n_estimators=103,
        learning_rate=0.015769130736262276,
        max_depth=7,
        subsample=0.9336953602399801,
        colsample_bytree=0.9123161127609902,
        min_child_samples=73,
        random_state=42,
        n_jobs=-1
    )),
    ("cat", CatBoostClassifier(
        iterations=227,
        learning_rate=0.05573763155224416,
        depth=7,
        l2_leaf_reg=4.51792861553295,
        verbose=0,
        random_seed=42
    )),
    ("rf", RandomForestClassifier(
        n_estimators=747,
        max_depth=9,
        min_samples_split=6,
        min_samples_leaf=5,
        max_features=None,
        random_state=42,
        n_jobs=-1
    ))
]

# --- Meta Model (Blender) ---
meta_model = LogisticRegression(
    max_iter=1000,
    multi_class="multinomial",
    solver="lbfgs",
    random_state=42
)

# --- Build Stacking Ensemble ---
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False
)

# --- Train Ensemble ---
stacking_model.fit(X_train, y_train)

# --- Evaluate ---
train_preds = stacking_model.predict(X_train)
test_preds = stacking_model.predict(X_test)

train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)
train_f1 = f1_score(y_train, train_preds, average="macro")
test_f1 = f1_score(y_test, test_preds, average="macro")

print("\nâœ… Stacking Ensemble Performance:")
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy     : {test_acc:.4f}")
print(f"Training F1 Score : {train_f1:.4f}")
print(f"Test F1 Score     : {test_f1:.4f}")





âœ… Stacking Ensemble Performance:
Training Accuracy: 0.9505
Test Accuracy     : 0.9422
Training F1 Score : 0.9305
Test F1 Score     : 0.9204


In [36]:
from sklearn.metrics import log_loss

# --- Evaluate ---
train_preds = stacking_model.predict(X_train)
test_preds = stacking_model.predict(X_test)

# Also get class probabilities for log loss
train_probs = stacking_model.predict_proba(X_train)
test_probs = stacking_model.predict_proba(X_test)

# --- Metrics ---
train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)
train_f1 = f1_score(y_train, train_preds, average="macro")
test_f1 = f1_score(y_test, test_preds, average="macro")

# --- Log Loss ---
train_logloss = log_loss(y_train, train_probs)
test_logloss = log_loss(y_test, test_probs)

# --- Display Results ---
print("\nâœ… Stacking Ensemble Performance:")
print(f"Training Accuracy : {train_acc:.4f}")
print(f"Test Accuracy     : {test_acc:.4f}")
print(f"Training F1 Score : {train_f1:.4f}")
print(f"Test F1 Score     : {test_f1:.4f}")
print(f"Training Log Loss : {train_logloss:.4f}")
print(f"Test Log Loss     : {test_logloss:.4f}")
print(f"Î” F1 Gap          : {abs(train_f1 - test_f1):.4f}")
print(f"Î” LogLoss Gap     : {abs(train_logloss - test_logloss):.4f}")



âœ… Stacking Ensemble Performance:
Training Accuracy : 0.9505
Test Accuracy     : 0.9422
Training F1 Score : 0.9305
Test F1 Score     : 0.9204
Training Log Loss : 0.1499
Test Log Loss     : 0.1676
Î” F1 Gap          : 0.0101
Î” LogLoss Gap     : 0.0177


In [20]:
# %% [Train Final Stacking Ensemble on Full Data + Create Clean submission.csv]

print("ðŸš€ Training final stacking ensemble on the entire dataset...")

# --- Retrain the stacking model on the full dataset ---
stacking_model.fit(X_final, y_final)

# --- Prepare clean test data ---
test = pd.read_csv("test.csv")  # reload to restore original IDs
test_ids = test["ID"].copy()    # keep original IDs

# Reapply feature engineering & scaling (without touching ID)
df_test_fe = feature_engineering(test)
df_test_fe[num_cols] = scaler.transform(df_test_fe[num_cols])
X_test_final = df_test_fe[selected_features].reindex(columns=X_final.columns, fill_value=0)

# --- Predict on test set ---
print("ðŸ§  Generating predictions on test set...")
final_preds = stacking_model.predict(X_test_final)

# --- Decode numeric predictions back to original labels ---
final_preds = le.inverse_transform(final_preds)

# --- Create submission DataFrame ---
submission = pd.DataFrame({
    "ID": test_ids,
    "ASI_category": final_preds
})

# --- Save submission file ---
submission.to_csv("submission.csv", index=False)
print("\nâœ… Submission file created successfully: submission.csv")
print(submission.head())


ðŸš€ Training final stacking ensemble on the entire dataset...




ðŸ§  Generating predictions on test set...

âœ… Submission file created successfully: submission.csv
      ID ASI_category
0  15628         Good
1   9358     Moderate
2  12927     Moderate
3  23980         Poor
4   1032     Moderate
