In [1]:
# ============================================
# UNIVERSAL KAGGLE ML PIPELINE (Classification)
# Auto Model Selection + Imputer + Label Encode
# 100% Stable for ANY Dataset
# ============================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# -----------------------------------------------
# LOAD DATASETS
# -----------------------------------------------
train_df = pd.read_csv("/kaggle/input/mse-2-ai-201-b-ai-c/train.csv")
test_df = pd.read_csv("/kaggle/input/mse-2-ai-201-b-ai-c/test.csv")
sample_submission = pd.read_csv("/kaggle/input/mse-2-ai-201-b-ai-c/sample_submission.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Sample submission shape:", sample_submission.shape)

# -----------------------------------------------
# TARGET (last column)
# -----------------------------------------------
target = train_df.columns[-1]

X = train_df.drop(columns=[target])

# Encode target
le = LabelEncoder()
y = le.fit_transform(train_df[target])

# Test data
X_test_final = test_df.copy()

# -----------------------------------------------
# TRAINâ€“VALIDATION SPLIT
# -----------------------------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------------------------
# DETECT COLUMN TYPES
# -----------------------------------------------
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# -----------------------------------------------
# PREPROCESSOR (Impute + OneHot + Scale)
# -----------------------------------------------
preprocess = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols),

        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ]), num_cols),
    ]
)

# -----------------------------------------------
# MODELS
# -----------------------------------------------
models = {
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=-1,
        n_jobs=-1
    ),
    "CatBoost": CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        thread_count=-1
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        n_jobs=-1
    ),
    "LogisticRegression": LogisticRegression(
        max_iter=2000,
        n_jobs=-1
    )
}

# -----------------------------------------------
# MODEL TRAINING LOOP
# -----------------------------------------------
best_model = None
best_logloss = np.inf
best_acc = 0.0
best_name = ""

for name, model in models.items():
    print(f"\nTraining {name} ...")

    pipe = Pipeline([
        ("pre", preprocess),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_valid)
    prob = pipe.predict_proba(X_valid)

    acc = accuracy_score(y_valid, preds)
    ll = log_loss(y_valid, prob)

    print(f"===== {name} =====")
    print("Accuracy:", acc)
    print("LogLoss :", ll)

    if ll < best_logloss:
        best_logloss = ll
        best_acc = acc
        best_model = pipe
        best_name = name

# -----------------------------------------------
# BEST MODEL
# -----------------------------------------------
print("\n===============================")
print(" BEST MODEL SELECTED AUTOMATICALLY ")
print("===============================")
print("Model      :", best_name)
print("Accuracy   :", best_acc)
print("LogLoss    :", best_logloss)

# -----------------------------------------------
# FINAL PREDICTIONS
# -----------------------------------------------
final_preds = best_model.predict(X_test_final)

# -----------------------------------------------
# SAFE SUBMISSION (matches test.csv rows)
# -----------------------------------------------
submission = pd.DataFrame()

# Identify ID column for submission
id_col = sample_submission.columns[0]

if id_col in test_df.columns:
    submission[id_col] = test_df[id_col]
else:
    submission[id_col] = np.arange(len(test_df))

submission[target] = le.inverse_transform(final_preds)

submission.to_csv("submission_final.csv", index=False)
print("\nsubmission_final.csv saved!")
print(submission.head())

  if entities is not ():


Train shape: (1824, 17)
Test shape: (645, 17)
Sample submission shape: (450, 2)

Training XGBoost ...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


===== XGBoost =====
Accuracy: 0.8547945205479452
LogLoss : 0.5283829286306402

Training LightGBM ...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3628
[LightGBM] [Info] Number of data points in the train set: 1459, number of used features: 16
[LightGBM] [Info] Start training from score -0.666768
[LightGBM] [Info] Start training from score -0.886912
[LightGBM] [Info] Start training from score -2.594159




===== LightGBM =====
Accuracy: 0.852054794520548
LogLoss : 0.9165661294219809

Training CatBoost ...
===== CatBoost =====
Accuracy: 0.8575342465753425
LogLoss : 0.4381735543881773

Training RandomForest ...
===== RandomForest =====
Accuracy: 0.8575342465753425
LogLoss : 0.5348115623429283

Training LogisticRegression ...
===== LogisticRegression =====
Accuracy: 0.8054794520547945
LogLoss : 0.5516521802916667

 BEST MODEL SELECTED AUTOMATICALLY 
Model      : CatBoost
Accuracy   : 0.8575342465753425
LogLoss    : 0.4381735543881773

submission_final.csv saved!
   id              Class
0   1  Kirmizi_Pistachio
1   2     Siit_Pistachio
2   3  Kirmizi_Pistachio
3   4  Kirmizi_Pistachio
4   5  Kirmizi_Pistachio


  y = column_or_1d(y, warn=True)
