In [2]:
!pip install pandas numpy scikit-learn joblib xgboost lightgbm catboost

Collecting xgboost
  Using cached xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.3.0-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-2.5.0-py3-none-any.whl.metadata (11 kB)
Using cached xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 3.4 MB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.5 MB 3.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 3.6 MB/s  0:00:00
Downloa

In [4]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [5]:
df = pd.read_csv("../dataset/refined_sih_dataset_ML_ready.csv")

In [6]:
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

In [8]:
target = "rockfall_risk"
leaky_features = ["Displacement_mm", "PorePressure_kPa", "SeismicVibration_mm/s"]

X = df.drop(columns=[target] + leaky_features)
y = df[target]
groups = df["SUBDIVISION"]

# Scale features for MLP (not strictly needed for tree models)
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [10]:
models = {
    "XGBoost": XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42, use_label_encoder=False, eval_metric="logloss"
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42
    ),
    "AdaBoost": AdaBoostClassifier(
        n_estimators=200, random_state=42
    ),
    "CatBoost": CatBoostClassifier(
        iterations=200, depth=6, learning_rate=0.1,
        random_state=42, verbose=0
    ),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(128, 64), activation="relu",
        solver="adam", max_iter=500, random_state=42
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200, max_depth=10, random_state=42
    )
}

In [11]:
gkf = GroupKFold(n_splits=5)
results = {}

for model_name, model in models.items():
    print(f"\nüîπ Training {model_name}...")
    roc_auc_scores = []
    for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
        # Use scaled X only for MLP, otherwise raw X
        if model_name == "MLP":
            X_train, X_test = X_scaled.iloc[train_idx], X_scaled.iloc[test_idx]
        else:
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_pred_prob)
        roc_auc_scores.append(score)
        print(f"   Fold {fold+1} ROC-AUC: {score:.4f}")

    results[model_name] = np.mean(roc_auc_scores)
    print(f"‚úÖ {model_name} Mean ROC-AUC: {results[model_name]:.4f}")


üîπ Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   Fold 1 ROC-AUC: 0.8322


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   Fold 2 ROC-AUC: 0.8230


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   Fold 3 ROC-AUC: 0.8132


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   Fold 4 ROC-AUC: 0.8364


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   Fold 5 ROC-AUC: 0.8395
‚úÖ XGBoost Mean ROC-AUC: 0.8289

üîπ Training LightGBM...
[LightGBM] [Info] Number of positive: 293361, number of negative: 94439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8647
[LightGBM] [Info] Number of data points in the train set: 387800, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.756475 -> initscore=1.133450
[LightGBM] [Info] Start training from score 1.133450
   Fold 1 ROC-AUC: 0.8165
[LightGBM] [Info] Number of positive: 301746, number of negative: 99904
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 870



   Fold 1 ROC-AUC: 0.7885




   Fold 2 ROC-AUC: 0.8273




   Fold 3 ROC-AUC: 0.8145




   Fold 4 ROC-AUC: 0.8181




   Fold 5 ROC-AUC: 0.8445
‚úÖ AdaBoost Mean ROC-AUC: 0.8186

üîπ Training CatBoost...
   Fold 1 ROC-AUC: 0.8231
   Fold 2 ROC-AUC: 0.8259
   Fold 3 ROC-AUC: 0.8507
   Fold 4 ROC-AUC: 0.8266
   Fold 5 ROC-AUC: 0.8513
‚úÖ CatBoost Mean ROC-AUC: 0.8355

üîπ Training MLP...
   Fold 1 ROC-AUC: 0.7915
   Fold 2 ROC-AUC: 0.8221
   Fold 3 ROC-AUC: 0.7980
   Fold 4 ROC-AUC: 0.8146
   Fold 5 ROC-AUC: 0.8253
‚úÖ MLP Mean ROC-AUC: 0.8103

üîπ Training RandomForest...
   Fold 1 ROC-AUC: 0.8254
   Fold 2 ROC-AUC: 0.8399
   Fold 3 ROC-AUC: 0.8338
   Fold 4 ROC-AUC: 0.8295
   Fold 5 ROC-AUC: 0.8476
‚úÖ RandomForest Mean ROC-AUC: 0.8352


In [12]:
best_model_name = max(results, key=results.get)
print("\nüèÜ Best Model:", best_model_name, "with ROC-AUC:", results[best_model_name])

best_model = models[best_model_name]


üèÜ Best Model: CatBoost with ROC-AUC: 0.8355341179685605


In [13]:
if best_model_name == "MLP":
    best_model.fit(X_scaled, y)  # use scaled features
else:
    best_model.fit(X, y)         # use raw features

In [14]:
joblib.dump(best_model, "../rockfall_xgb_final.pkl")
print("‚úÖ Final model saved as rockfall_xgb_final.pkl")

‚úÖ Final model saved as rockfall_xgb_final.pkl
