In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/results-file/results_df.csv
/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


In [2]:
!pip install optuna catboost xgboost lightgbm



In [3]:
import pandas as pd
import numpy as np
import optuna
import warnings
import time
import pickle

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import ast
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# Import scikit-learn helpers
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import f1_score


In [4]:



# --- Load Data ---


print("Loading data...")
train_full = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test_set = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

# --- Prepare Full Training Data ---
X_train_full = train_full.drop(columns=['label'])
y_train_full = train_full['label']

# Normalize pixel values
X_train_full = X_train_full / 255.0
test_set = test_set / 255.0
print("Data loaded and normalized.")

print("\n--- Applying Feature Selection (Variance Threshold) ---")
print(f"Original number of features: {X_train_full.shape[1]}")
selector = VarianceThreshold(threshold=0.0)
X_train_full = selector.fit_transform(X_train_full)
test_set = selector.transform(test_set)
print(f"Features after removing zero-variance pixels: {X_train_full.shape[1]}")

# --- Create a smaller stratified subset for SVM tuning ---
# This is crucial to get results in a reasonable time
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)
X_train_svm, _, y_train_svm, _ = train_test_split(
    X_train, y_train, train_size=8000, random_state=42, stratify=y_train
)

Loading data...
Data loaded and normalized.

--- Applying Feature Selection (Variance Threshold) ---
Original number of features: 784
Features after removing zero-variance pixels: 708


In [8]:
results_df = pd.read_csv('/kaggle/input/results-file/results_df.csv')

In [9]:
print("\n\n" + "="*60)
print("--- STAGE 2: Starting Deep Dive with K-Fold CV ---")
print("="*60)

top_models_to_tune = results_df["Model"].head(5).tolist() 
print(f"Identified Top 5 Models for Deep Dive: {top_models_to_tune}")



--- STAGE 2: Starting Deep Dive with K-Fold CV ---
Identified Top 5 Models for Deep Dive: ['LightGBM', 'XGBoost', 'CatBoost', 'RBF SVM', 'Random Forest']


In [10]:
results_df = pd.DataFrame(results_df)

In [11]:
top5_models_info = results_df.head()


In [14]:
top5_models_info = top5_models_info.rename(columns={'Best Params': 'best_params'})
display(top5_models_info['best_params'].head())

0    {'n_estimators': 741, 'learning_rate': 0.05580...
1    {'n_estimators': 470, 'learning_rate': 0.22153...
2    {'iterations': 332, 'learning_rate': 0.1133800...
3    {'C': 3570.118816933734, 'gamma': 0.0295855399...
4    {'n_estimators': 203, 'max_depth': 15, 'min_sa...
Name: best_params, dtype: object

In [15]:
import ast

In [16]:
spot_check_results = []
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
# ==============================================================================
# === FINAL ATTEMPT: "SPOT-CHECK" using simple cross_validate ==================
# ==============================================================================

print("\n\n" + "="*60)
print("--- Validating Top 5 Models with simple Stratified K-Fold CV ---")
print("This is the most reliable method. It may be slow.")
print("="*60)

import ast
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import f1_score # Ensure this is imported
import time

# We assume 'summary_fast_df' and 'fast_scan_studies' are available from Stage 1.

top_5_models_to_check = top5_models_info["Model"].head(5).tolist()
print(f"Top 5 models identified for validation: {top_5_models_to_check}")

spot_check_results = []
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for model_name in top_5_models_to_check:
    print(f"\n--- Spot-Checking: {model_name} ---")
    start_time = time.time()
    
    # Get and parse the parameter string
    param_string = top5_models_info[top5_models_info['Model'] == model_name]['best_params'].iloc[0]
    best_params = ast.literal_eval(param_string)
    
    model_instance = None
    
    # Initialize the base model with GPU/CPU settings
    if model_name == 'LightGBM':
        best_params['device'] = 'gpu'
        model_instance = lgb.LGBMClassifier(**best_params, random_state=42)
    elif model_name == 'XGBoost':
        best_params['tree_method'] = 'gpu_hist'
        model_instance = xgb.XGBClassifier(**best_params, random_state=42)
    elif model_name == 'CatBoost':
        best_params['task_type'] = 'GPU'
        best_params['verbose'] = 0
        model_instance = cb.CatBoostClassifier(**best_params, random_state=42)
    elif model_name == 'Random Forest':
        best_params['n_jobs'] = -1
        model_instance = RandomForestClassifier(**best_params, random_state=42)
    elif model_name == 'RBF SVM':
        model_instance = SVC(**best_params, kernel='rbf', random_state=42)

    if model_instance is None: continue
        
    print(f"Parameters to be validated: {best_params}")
    print("Performing 5-Fold CV using scikit-learn's cross_validate...")
    
    # Determine n_jobs to prevent CPU/GPU conflicts
    jobs = 1 if 'gpu' in str(type(model_instance)).lower() or 'task_type' in best_params else -1
    
    # --- Using the simple, robust cross_validate function ---
    scores_dict = cross_validate(
        estimator=model_instance,
        X=X_train_full,
        y=y_train_full,
        cv=cv_splitter,
        scoring=['accuracy', 'f1_macro'],
        n_jobs=jobs
    )
    
    duration = time.time() - start_time
    
    # Store results
    spot_check_results.append({
        "Model": model_name,
        "K-Fold Accuracy": scores_dict['test_accuracy'].mean(),
        "Accuracy Std Dev": scores_dict['test_accuracy'].std(),
        "K-Fold F1-Macro": scores_dict['test_f1_macro'].mean(),
        "Time (s)": f"{duration:.1f}"
    })
    
    print(f"Validation for {model_name} complete in {duration:.1f} seconds.")



--- Validating Top 5 Models with simple Stratified K-Fold CV ---
This is the most reliable method. It may be slow.
Top 5 models identified for validation: ['LightGBM', 'XGBoost', 'CatBoost', 'RBF SVM', 'Random Forest']

--- Spot-Checking: LightGBM ---
Parameters to be validated: {'n_estimators': 741, 'learning_rate': 0.05580379756884932, 'num_leaves': 21, 'device': 'gpu'}
Performing 5-Fold CV using scikit-learn's cross_validate...


 generated.
 generated.
.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 102844
[LightGBM] [Info] Number of data points in the train set: 33600, number of used features: 614
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 196 dense feature groups (6.28 MB) transferred to GPU in 0.015735 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -2.319090
[LightGBM] [Info] Start training from score -2.193571
[LightGBM] [Info] Start training from score -2.307957
[LightGBM] [Info] Start training from score -2.267206
[LightGBM] [Info] Start training from score -2.333413
[LightGBM] [Info] Start training from score -2.403985
[LightGBM] [Info] Start training from score -2.317578
[LightGBM] [Info] Start training from score -2.255781
[LightGBM] [Info] Start training from sc


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace

Validation for XGBoost complete in 153.7 seconds.

--- Spot-Checking: CatBoost ---
Parameters to be validated: {'iterations': 332, 'learning_rate': 0.11338000708936166, 'depth': 8, 'task_type': 'GPU', 'verbose': 0}
Performing 5-Fold CV using scikit-learn's cross_validate...
Validation for CatBoost complete in 152.4 seconds.

--- Spot-Checking: RBF SVM ---
Parameters to be validated: {'C': 3570.118816933734, 'gamma': 0.029585539916797004}
Performing 5-Fold CV using scikit-learn's cross_validate...
Validation for RBF SVM complete in 596.3 seconds.

--- Spot-Checking: Random Forest ---
Parameters to be validated: {'n_estimators': 203, 'max_depth': 15, 'min_samples_leaf': 1, 'criterion': 'gini', 'min_samples_split': 9, 'min_impurity_decrease': 1.6989685296501598e-07, 'n_jobs': -1}
Performing 5-Fold CV using scikit-learn's cross_validate...








Validation for Random Forest complete in 80.9 seconds.


In [14]:
spot_check_results

[{'Model': 'LightGBM',
  'K-Fold Accuracy': 0.9793333333333333,
  'Accuracy Std Dev': 0.0014435720372594437,
  'K-Fold F1-Macro': 0.9792416027761398,
  'Time (s)': '1143.3'},
 {'Model': 'XGBoost',
  'K-Fold Accuracy': 0.9758095238095239,
  'Accuracy Std Dev': 0.0011698208091940898,
  'K-Fold F1-Macro': 0.9756887371729347,
  'Time (s)': '153.7'},
 {'Model': 'CatBoost',
  'K-Fold Accuracy': 0.9663809523809525,
  'Accuracy Std Dev': 0.0012154522874381311,
  'K-Fold F1-Macro': 0.9661844680791418,
  'Time (s)': '152.4'},
 {'Model': 'RBF SVM',
  'K-Fold Accuracy': 0.9823095238095239,
  'Accuracy Std Dev': 0.0005614202915500859,
  'K-Fold F1-Macro': 0.9821859177473218,
  'Time (s)': '596.3'},
 {'Model': 'Random Forest',
  'K-Fold Accuracy': 0.9606428571428574,
  'Accuracy Std Dev': 0.0015293797002620397,
  'K-Fold F1-Macro': 0.960408633987673,
  'Time (s)': '80.9'}]

In [17]:
# ==============================================================================
# === FINAL STEP: Building the Stacking Ensemble and Submitting ==================
# ==============================================================================

print("\n\n" + "="*60)
print("--- Building the Final Stacking Model ---")
print("Using the best parameters validated by the spot-check.")
print("="*60)

print("Initializing base models for the stacking ensemble...")

# --- RBF SVM ---
svm_params_str = top5_models_info[top5_models_info['Model'] == 'RBF SVM']['best_params'].iloc[0]
svm_params = ast.literal_eval(svm_params_str)
svm_best = SVC(**svm_params, kernel='rbf', random_state=42, probability=True) # probability=True is needed for stacking
print("  > RBF SVM initialized.")

# --- LightGBM ---
lgbm_params_str = top5_models_info[top5_models_info['Model'] == 'LightGBM']['best_params'].iloc[0]
lgbm_params = ast.literal_eval(lgbm_params_str)
lgbm_params['device'] = 'gpu'
lgbm_best = lgb.LGBMClassifier(**lgbm_params, random_state=42)
print("  > LightGBM initialized.")

# --- XGBoost ---
xgb_params_str = top5_models_info[top5_models_info['Model'] == 'XGBoost']['best_params'].iloc[0]
xgb_params = ast.literal_eval(xgb_params_str)
xgb_params['tree_method'] = 'gpu_hist'
xgb_best = xgb.XGBClassifier(**xgb_params, random_state=42)
print("  > XGBoost initialized.")

# --- Random Forest ---
rf_params_str = top5_models_info[top5_models_info['Model'] == 'Random Forest']['best_params'].iloc[0]
rf_params = ast.literal_eval(rf_params_str)
rf_best = RandomForestClassifier(**rf_params, random_state=42, n_jobs=-1)
print("  > Random Forest initialized.")



--- Building the Final Stacking Model ---
Using the best parameters validated by the spot-check.
Initializing base models for the stacking ensemble...
  > RBF SVM initialized.
  > LightGBM initialized.
  > XGBoost initialized.
  > Random Forest initialized.


In [18]:
estimators = [
    ('rbf_svm', svm_best),
    ('lightgbm', lgbm_best),
    ('xgboost', xgb_best),
    ('random_forest', rf_best)
]

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=cv_strategy, # <-- Pass the StratifiedKFold object directly
    n_jobs=1,
    passthrough=False,
    verbose = 2
)

In [19]:
print("\nTraining the final Stacking Classifier on the full dataset...")
print("This will take a significant amount of time, especially due to the SVM.")
stacking_clf.fit(X_train_full, y_train_full)
print("--- Stacking model training complete! ---")



Training the final Stacking Classifier on the full dataset...
This will take a significant amount of time, especially due to the SVM.
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 105387
[LightGBM] [Info] Number of data points in the train set: 42000, number of used features: 620
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 197 dense feature groups (8.01 MB) transferred to GPU in 0.005501 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -2.318908
[LightGBM] [Info] Start training from score -2.193517
[LightGBM] [Info] Start training from score -2.308076
[LightGBM] [Info] Start training from score -2.267264
[LightGBM] [Info] Start training from score -2.333535
[LightGBM] [Info] Start training from score -2.403985
[LightGBM] [Info] Start training from score -2.317699
[LightGBM] [Info] Start training from score -2.255838
[LightGBM] [Info] Start training from score -2.335748
[LightGBM] [Info] Start training from score -2.305446



    E.g. tree_method = "hist", device = "cuda"

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 52.6min finished


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 102844
[LightGBM] [Info] Number of data points in the train set: 33600, number of used features: 614
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 196 dense feature groups (6.28 MB) transferred to GPU in 0.004619 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -2.319090
[LightGBM] [Info] Start training from score -2.193571
[LightGBM] [Info] Start training from score -2.307957
[LightGBM] [Info] Start training from score -2.267206
[LightGBM] [Info] Start training from score -2.333413
[LightGBM] [Info] Start training from score -2.403985
[LightGBM] [Info] Start training from score -2.317578
[LightGBM] [Info] Start training from score -2.255781
[LightGBM] [Info] Start training from sc

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.5min finished

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min finished


--- Stacking model training complete! ---


In [22]:
model_filename = 'final_stacking_model.joblib'
print(f"\nSaving the final trained model to '{model_filename}'...")
joblib.dump(stacking_clf, model_filename)
print("Model saved successfully.")


Saving the final trained model to 'final_stacking_model.joblib'...
Model saved successfully.


In [24]:
# --- Step 4: Generate the Final Submission File ---
print("\nGenerating final submission file...")
predictions = stacking_clf.predict(test_set)
submission = pd.DataFrame({'ImageId': range(1, len(predictions) + 1), 'Label': predictions})
submission.to_csv('submission_stacking_final.csv', index=False)

print("\nSubmission file 'submission_stacking_final.csv' created successfully!")
print("This model is now your FINAL baseline. Congratulations!")
print(submission.head())


Generating final submission file...

Submission file 'submission_stacking_final.csv' created successfully!
This model is now your FINAL baseline. Congratulations!
   ImageId  Label
0        1      2
1        2      0
2        3      9
3        4      9
4        5      3
