# Import

In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# config
RANDOM_STATE = 42
N_SPLITS = 5
OUTPUT_PROBA = True  # If True, outputs probability for class 1; else outputs 0/1 label

SUBMISSION_PATH = 'submission.csv'
MODEL_PATH = 'best_model.joblib'

In [3]:
train = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')

# Preprocessing

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [5]:
print("\nTarget distribution (y):")
print(train['y'].value_counts(normalize=True).rename('ratio'))


Target distribution (y):
y
0    0.879349
1    0.120651
Name: ratio, dtype: float64


In [8]:
id_col = 'id'
target = 'y'
feature_cols = [c for c in train.columns if c not in [target, id_col]]

X = train[feature_cols]
y = train[target]

cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"\nCategorical cols ({len(cat_cols)}):", cat_cols)
print(f"Numeric cols ({len(num_cols)}):", num_cols)


Categorical cols (9): ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numeric cols (7): ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [9]:
# Compute class weight for imbalance (pos_weight ~ N_neg / N_pos)
pos_count = y.sum()
neg_count = len(y) - pos_count
scale_pos_weight = float(neg_count / pos_count)
print(f"\nClass imbalance: pos={pos_count}, neg={neg_count}, scale_pos_weight≈{scale_pos_weight:.2f}")


Class imbalance: pos=90488, neg=659512, scale_pos_weight≈7.29


In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median"))
        ]), num_cols),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=True))
        ]), cat_cols)
    ],
    remainder='drop',
)

# Models

In [20]:
models = []

In [21]:
# 1) Logistic Regression
logreg = LogisticRegression(max_iter=2000, n_jobs=None, class_weight='balanced')
models.append(("LogReg", logreg))

In [22]:
# 2) Random Forest
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    class_weight='balanced_subsample',
    random_state=RANDOM_STATE
)
models.append(("RandomForest", rf))

In [26]:
# 3) XGBoost
from xgboost import XGBClassifier
xgb = XGBClassifier(
    n_estimators=800,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    objective='binary:logistic',
    tree_method='hist',
    eval_metric='auc',
    # Handle imbalance
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
)
models.append(("XGBoost", xgb))

In [29]:
# 4) LightGBM
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(
    n_estimators=1200,
    learning_rate=0.03,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    # Handle imbalance
    scale_pos_weight=scale_pos_weight,
    objective='binary',
    n_jobs=-1,
)
models.append(("LightGBM", lgbm))

In [None]:
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

cv_results = []

for name, est in models:
    pipe = Pipeline([
        ("pre", preprocessor),
        ("model", est)
    ])

    auc_scores = cross_val_score(pipe, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    f1_scores = cross_val_score(pipe, X, y, scoring='f1', cv=cv, n_jobs=-1)

    cv_results.append({
        'model': name,
        'auc_mean': float(np.mean(auc_scores)),
        'auc_std': float(np.std(auc_scores)),
        'f1_mean': float(np.mean(f1_scores)),
        'f1_std': float(np.std(f1_scores)),
    })

cv_df = pd.DataFrame(cv_results).sort_values('auc_mean', ascending=False)
print("\nCV Results (sorted by AUC):")
print(cv_df)

best_model_name = cv_df.iloc[0]['model']
print(f"\nSelected best model: {best_model_name}")

# Full training

In [None]:
if OUTPUT_PROBA:
    va_pred_proba = best_pipe.predict_proba(X_va)[:, 1]
    va_pred = (va_pred_proba >= 0.5).astype(int)
else:
    va_pred_proba = None
    va_pred = best_pipe.predict(X_va)

val_auc = roc_auc_score(y_va, va_pred_proba) if va_pred_proba is not None else np.nan
val_f1 = f1_score(y_va, va_pred)
print(f"Validation AUC: {val_auc:.5f} | F1: {val_f1:.5f}")

print("\nRefitting best model on FULL training data...")
best_pipe.fit(X, y)

In [None]:
X_test = test[feature_cols]
OUTPUT_PROBA = False

test_pred = best_pipe.predict(X_test)
submission = pd.DataFrame({'id': test[id_col], 'y': test_pred.astype(int)})

submission.to_csv(SUBMISSION_PATH, index=False)
print(f"\nSaved submission to {SUBMISSION_PATH}")

In [None]:
dump(best_pipe, MODEL_PATH)
print(f"Saved trained model to {MODEL_PATH}")

# LightGBM only

In [11]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary",
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline([
    ("pre", preprocessor),
    ("model", lgbm)
])

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
auc_scores = cross_val_score(pipe, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)

print("CV AUC:", auc_scores)
print("Mean AUC:", auc_scores.mean())



CV AUC: [0.96794391 0.967384   0.96745333]
Mean AUC: 0.9675937469809988


In [13]:
pipe.fit(X, y)
X_test = test[feature_cols]
y_pred = pipe.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({
    id_col: test[id_col],
    "y": y_pred
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")

[LightGBM] [Info] Number of positive: 90488, number of negative: 659512
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1045
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120651 -> initscore=-1.986283
[LightGBM] [Info] Start training from score -1.986283
Saved submission.csv
[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.130153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1044
[LightGBM] [Info] Number of data points in the train set: 500000, number of used featur