In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of each file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import optuna
import warnings
import time

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Import scikit-learn helpers
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning


In [None]:



# --- Load Data ---
print("Loading data...")
train_full = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test_set = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

# --- Prepare Full Training Data ---
# Pull the target column out first, then divide every remaining (pixel) column by 255.0.
y_train_full = train_full['label']
X_train_full = train_full.drop(columns=['label']).div(255.0)
test_set = test_set.div(255.0)
print("Data loaded and normalized.")

print("\n--- Applying Feature Selection (Variance Threshold) ---")
print(f"Original number of features: {X_train_full.shape[1]}")
# The selector is fitted on the training features only and then applied,
# unchanged, to both the training features and the test set.
selector = VarianceThreshold(threshold=0.0)
selector.fit(X_train_full)
X_train_full = selector.transform(X_train_full)
test_set = selector.transform(test_set)
print(f"Features after removing zero-variance pixels: {X_train_full.shape[1]}")

# --- Hold-out split: 20% of the rows become the validation set, stratified on the label ---
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42,
)

# --- Create a smaller stratified subset (8000 rows of X_train) for SVM tuning ---
# This is crucial to get results in a reasonable time.
# train_test_split returns [X_sub, X_rest, y_sub, y_rest]; [::2] keeps only the two "sub" parts.
X_train_svm, y_train_svm = train_test_split(
    X_train,
    y_train,
    train_size=8000,
    stratify=y_train,
    random_state=42,
)[::2]

In [None]:
# Ignore convergence warnings and generic user warnings, and limit Optuna's
# own logging to WARNING level and above.
for ignored_category in (ConvergenceWarning, UserWarning):
    warnings.filterwarnings("ignore", category=ignored_category)
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# --- Objective Functions for Optuna ---
# Banner announcing the start of the stage-1 fast scan.
stage_rule = "=" * 60
print(f"\n\n{stage_rule}")
print("--- STAGE 1: Starting Fast Scan with Hold-Out Validation ---")
print(stage_rule)
# --- Corrected Objective Function for a More Robust Fast Scan ---

def _build_fast_scan_model(trial, model_name):
    """Sample hyperparameters from ``trial`` and return an unfitted estimator.

    Args:
        trial: Optuna trial used to draw hyperparameter suggestions.
        model_name: One of 'Logistic Regression', 'Naive Bayes', 'Random Forest',
            'XGBoost', 'LightGBM', 'CatBoost', 'Linear SVM' or 'RBF SVM'.

    Returns:
        An unfitted classifier configured with the sampled hyperparameters.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'Logistic Regression':
        solver = trial.suggest_categorical('solver', ['lbfgs', 'saga'])
        params = {
            'C': trial.suggest_float('C', 1e-3, 1e3, log=True),
            'solver': solver,
            'max_iter': 1000,  # Increased for better convergence
            'random_state': 42
        }
        # Each solver gets its own penalty search space (separate Optuna
        # parameter names) so only the penalties listed for it are sampled.
        if solver == "lbfgs":
            params['penalty'] = trial.suggest_categorical('penalty_lbfgs', ['l2', None])
        else:  # solver == "saga"
            params['penalty'] = trial.suggest_categorical('penalty_saga', ['l1', 'l2', 'elasticnet', None])
            # If elasticnet is chosen, we must also suggest an l1_ratio
            if params['penalty'] == 'elasticnet':
                params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.0, 1.0)
        return LogisticRegression(**params)

    if model_name == 'Naive Bayes':
        var_smoothing = trial.suggest_float('var_smoothing', 1e-10, 1e-1, log=True)
        return GaussianNB(var_smoothing=var_smoothing)

    if model_name == 'Random Forest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 400),
            'max_depth': trial.suggest_int('max_depth', 8, 30),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 1e-7, 1e-2, log=True)
        }
        return RandomForestClassifier(**params, random_state=42, n_jobs=-1)

    if model_name == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 800),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 7),
            'random_state': 42
        }
        return xgb.XGBClassifier(**params)

    if model_name == 'LightGBM':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 800),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'random_state': 42
        }
        return lgb.LGBMClassifier(**params)

    if model_name == 'CatBoost':
        params = {
            'iterations': trial.suggest_int('iterations', 100, 800),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'depth': trial.suggest_int('depth', 4, 8),
            'random_state': 42,
            'verbose': 0
        }
        return cb.CatBoostClassifier(**params)

    if model_name == 'Linear SVM':
        C = trial.suggest_float('C', 1e-3, 1e3, log=True)
        return SVC(C=C, kernel='linear', random_state=42)

    if model_name == 'RBF SVM':
        C = trial.suggest_float('C', 1e-1, 1e4, log=True)
        gamma = trial.suggest_float('gamma', 1e-4, 1e-1, log=True)
        return SVC(C=C, gamma=gamma, kernel='rbf', random_state=42)

    # Fail loudly and descriptively instead of reaching ``model.fit`` with an
    # unbound ``model`` variable.
    raise ValueError(f"Unsupported model_name for fast scan: {model_name!r}")


def objective_fast_scan(trial, model_name):
    """Optuna objective for the fast scan: fit one sampled model, score it on the hold-out set.

    Models whose name contains 'SVM' are fitted on the smaller
    ``X_train_svm`` / ``y_train_svm`` subset; every other model is fitted on
    ``X_train`` / ``y_train``. All models are scored on ``X_val`` / ``y_val``.
    These arrays are read from the notebook's global namespace.

    Args:
        trial: Optuna trial used to draw hyperparameter suggestions.
        model_name: Name of the model family to sample and evaluate.

    Returns:
        The value of ``model.score`` on the hold-out validation data.

    Raises:
        ValueError: If ``model_name`` is not a supported model name.
    """
    # Build the estimator first so an unsupported name is rejected before any
    # training data is touched.
    model = _build_fast_scan_model(trial, model_name)

    if 'SVM' in model_name:
        xt, yt = X_train_svm, y_train_svm
    else:
        xt, yt = X_train, y_train

    model.fit(xt, yt)
    return model.score(X_val, y_val)

In [None]:
N_TRIALS_FAST_SCAN = 10
FAST_SCAN_SEED = 42  # matches the random_state=42 used elsewhere in this notebook
models_to_scan = ["Logistic Regression", "Naive Bayes", "Random Forest", "XGBoost", "LightGBM", "CatBoost", "Linear SVM", "RBF SVM"]
fast_scan_results = []

# Run one independent Optuna study per model and record its best hold-out score.
for model_name in models_to_scan:
    print(f"\n--- Scanning {model_name} ---")
    # Seed the sampler so the sequence of suggested hyperparameters is the same
    # on every Restart & Run All; an unseeded sampler draws different trials
    # each run, making the scan results impossible to reproduce.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=FAST_SCAN_SEED),
    )

    # Let Optuna run all trials at once and show its own progress bar.
    # The lambda is invoked synchronously inside this iteration, so it always
    # sees the current model_name.
    study.optimize(
        lambda trial: objective_fast_scan(trial, model_name),
        n_trials=N_TRIALS_FAST_SCAN,
        show_progress_bar=True
    )

    fast_scan_results.append({"Model": model_name, "Best Accuracy (Hold-Out)": study.best_value})