In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import early_stopping, log_evaluation
import warnings
warnings.filterwarnings('ignore')

In [None]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
dig_df = pd.read_csv('../data/Dig-MNIST.csv')


pixel_columns = [col for col in train_df.columns if col.startswith("pixel")]

X = train_df[pixel_columns].values / 255.0
y = train_df["label"].values
X_test = test_df[pixel_columns].values / 255.0

X_dig = dig_df[pixel_columns].values / 255.0
y_dig = dig_df["label"].values

print("Train shape:", X.shape)
print("Test shape:", X_test.shape)
print("Dig-MNIST shape:", X_dig.shape)

Train shape: (60000, 784)
Test shape: (5000, 784)
Dig-MNIST shape: (10240, 784)


In [None]:
param_grid = [
    {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "num_leaves": 64,
        "max_depth": 10,
        "subsample": 1.0,
        "colsample_bytree": 1.0,
        "model_name": "LGBM_base"
    },
    {
        "n_estimators": 1000,
        "learning_rate": 0.03,
        "num_leaves": 127,
        "max_depth": 15,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "model_name": "LGBM_deep_wide"
    },
    {
        "n_estimators": 500,
        "learning_rate": 0.1,
        "num_leaves": 31,
        "max_depth": 7,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "model_name": "LGBM_fast"
    },
]

results = []

for config in param_grid:
    print(f"\n🔧 Training model: {config['model_name']}")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    test_preds = np.zeros((X_test.shape[0], 10))
    dig_preds = np.zeros((X_dig.shape[0], 10))
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"   Fold {fold+1}")
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = lgb.LGBMClassifier(
            objective="multiclass",
            num_class=10,
            random_state=42,
            **{k: v for k, v in config.items() if k != "model_name"}
        )

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="multi_logloss",
            callbacks=[early_stopping(20), log_evaluation(50)]
        )

        val_preds = model.predict(X_val)
        acc = accuracy_score(y_val, val_preds)
        val_scores.append(acc)

        test_preds += model.predict_proba(X_test) / skf.n_splits
        dig_preds += model.predict_proba(X_dig) / skf.n_splits

    final_preds = np.argmax(test_preds, axis=1)
    dig_final_preds = np.argmax(dig_preds, axis=1)
    dig_acc = accuracy_score(y_dig, dig_final_preds)

    print(f" Model: {config['model_name']}")
    print(f"   → Avg Val Accuracy: {np.mean(val_scores):.4f}")
    print(f"   → Dig-MNIST Accuracy: {dig_acc:.4f}")

    results.append({
        "model_name": config["model_name"],
        "val_acc": np.mean(val_scores),
        "dig_acc": dig_acc,
        "submission": final_preds
    })


🔧 Training model: LGBM_base
  🔁 Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95180
[LightGBM] [Info] Number of data points in the train set: 48000, number of used features: 619
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
Training until validation scores don't improve for 20 rounds
[50]	valid_0's multi_logloss: 0.117958
[100]	valid_0's multi_logloss

In [6]:
best_model = max(results, key=lambda x: x["val_acc"])
submission = pd.DataFrame({
    "id": np.arange(1, len(best_model["submission"]) + 1),
    "label": best_model["submission"]
})
submission.to_csv(
    "../submissions/mldl_competition3_sharifbek_submission1.csv", index=False)

print(f"\n Best model: {best_model['model_name']}")
print(" Submission file saved!")

print("\n📊 Model Comparison:")
print("{:<20} {:<15} {:<20}".format(
    "Model Name", "Val Accuracy", "Dig-MNIST Accuracy"))
for r in results:
    print("{:<20} {:<15.4f} {:<20.4f}".format(
        r["model_name"], r["val_acc"], r["dig_acc"]))


 Best model: LGBM_fast
 Submission file saved!

📊 Model Comparison:
Model Name           Val Accuracy    Dig-MNIST Accuracy  
LGBM_base            0.9858          0.6751              
LGBM_deep_wide       0.9845          0.6579              
LGBM_fast            0.9863          0.6822              


In [9]:
!jupyter nbconvert --to html "mldl_competition3_sharifbek_submission1.ipynb"

[NbConvertApp] Converting notebook mldl_competition3_sharifbek_submission1.ipynb to html
[NbConvertApp] Writing 328729 bytes to mldl_competition3_sharifbek_submission1.html
