In [None]:
# Mount Google Drive so that paths under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Point the Kaggle CLI at a configuration folder on the mounted Drive
# (presumably where kaggle.json lives — confirm the file exists there).
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/kaggle'

In [None]:
# NOTE(review): no visible cell imports transformers or torch, and catboost is
# installed separately in a later cell — consider trimming and pinning this list.
!pip install kaggle pandas scikit-learn transformers torch



In [None]:
# Download the competition archive (playground-series-s5e8.zip) into the working directory.
!kaggle competitions download -c playground-series-s5e8

Downloading playground-series-s5e8.zip to /content
  0% 0.00/14.7M [00:00<?, ?B/s]
100% 14.7M/14.7M [00:00<00:00, 622MB/s]


In [None]:
import zipfile

# Location of the competition archive downloaded by the Kaggle CLI.
file_path = '/content/playground-series-s5e8.zip'

# Extract every member of the archive into the Drive folder.
with zipfile.ZipFile(file_path, mode='r') as archive:
    archive.extractall(path='/content/drive/MyDrive/kaggle')

In [None]:
# Extract the archive into the current working directory as well: the later
# pd.read_csv('train.csv' / 'test.csv' / 'sample_submission.csv') calls use these relative paths.
!unzip playground-series-s5e8.zip

Archive:  playground-series-s5e8.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
import numpy as np

!pip install catboost

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Load the three competition files from the current working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
# Template frame that later receives the predicted 'y' column before being written out.
df_submission = pd.read_csv('sample_submission.csv')

In [None]:
df_train.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [None]:
df_test.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,750000,32,blue-collar,married,secondary,no,1397,yes,no,unknown,21,may,224,1,-1,0,unknown
1,750001,44,management,married,tertiary,no,23,yes,no,cellular,3,apr,586,2,-1,0,unknown
2,750002,36,self-employed,married,primary,no,46,yes,yes,cellular,13,may,111,2,-1,0,unknown
3,750003,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,29,may,125,1,-1,0,unknown
4,750004,28,technician,single,secondary,no,1950,yes,no,cellular,22,jul,181,1,-1,0,unknown


In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 17 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         250000 non-null  int64 
 1   age        250000 non-null  int64 
 2   job        250000 non-null  object
 3   marital    250000 non-null  object
 4   education  250000 non-null  object
 5   default    250000 non-null  object
 6   balance    250000 non-null  int64 
 7   housing    250000 non-null  object
 8   loan       250000 non-null  object
 9   contact    250000 non-null  object
 10  day        250000 non-null  int64 
 11  month      250000 non-null  object
 12  duration   250000 non-null  int64 
 13  campaign   250000 non-null  int64 
 14  pdays      250000 non-null  int64 
 15  previous   250000 non-null  int64 
 16  poutcome   250000 non-null  object
dtypes: int64(8), object(9)
memory usage: 32.4+ MB


In [None]:
# 📌 Feature Engineering
# --------------------------

def add_features(df):
    """Return a copy of ``df`` extended with engineered features.

    The input frame is not modified; all new columns are added to a copy, so
    calling this function never changes a frame that an earlier cell displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``campaign``, ``previous``, ``duration``,
        ``pdays``, ``month``, ``balance``, ``job`` and ``education``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column plus ``contacts_total``,
        ``duration_per_contact``, ``was_contacted_before``, ``recent_contact``,
        ``month_num``, ``is_summer``, ``is_q4``, ``balance_log``,
        ``is_negative_balance`` and ``job_edu``.

    Raises
    ------
    KeyError
        If any of the required input columns is missing.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    out = df.copy()

    # Total contacts (campaign + previous)
    out["contacts_total"] = out["campaign"] + out["previous"]

    # Duration per contact; the +1 keeps the denominator non-zero when
    # campaign is 0 (so this is duration / (campaign + 1), not duration / campaign).
    out["duration_per_contact"] = out["duration"] / (out["campaign"] + 1)

    # Was contacted before? pdays == -1 is treated as "never contacted".
    contacted_before = out["pdays"] != -1
    out["was_contacted_before"] = contacted_before.astype(int)

    # Contacted recently (within 30 days)
    out["recent_contact"] = (contacted_before & (out["pdays"] < 30)).astype(int)

    # Month to number (jan=1, dec=12); values missing from the map become NaN.
    month_map = {'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,
                 'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}
    out["month_num"] = out["month"].map(month_map)

    # Seasonal features
    out["is_summer"] = out["month"].isin(["jun","jul","aug"]).astype(int)
    out["is_q4"] = out["month"].isin(["oct","nov","dec"]).astype(int)

    # Balance transformations: negative balances are clipped to 0 before log1p,
    # and the sign information is kept in a separate indicator column.
    out["balance_log"] = np.log1p(out["balance"].clip(lower=0))
    out["is_negative_balance"] = (out["balance"] < 0).astype(int)

    # Demographic interactions
    out["job_edu"] = out["job"] + "_" + out["education"]

    return out

In [None]:
# Run the same feature engineering on train and test so both frames gain the same new columns.
df_train = add_features(df_train)
df_test = add_features(df_test)

In [None]:
# 📌 TARGET + FEATURE SELECTION
# ==============================================================
y = df_train['y']  # target column; already encoded as 0/1 (int64 per df_train.info())
# Columns handed to CatBoost as categorical features (raw strings plus the job/education cross).
cat_cols = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'poutcome', 'job_edu',
]

# Numeric columns: the raw integers followed by the engineered ones.
num_cols = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
    'contacts_total', 'duration_per_contact', 'was_contacted_before',
    'recent_contact', 'month_num', 'is_summer', 'is_q4', 'balance_log',
    'is_negative_balance',
]

# Model input order: categoricals first, then numerics.
features = [*cat_cols, *num_cols]

In [None]:
# Work on a seeded 20% random subsample of the training rows.
sample_frac = 0.2
df_sample = df_train.sample(frac=sample_frac, random_state=42).copy()
X_sample = df_sample[features]
y_sample = df_sample['y']

# 3-fold stratified split; the out-of-fold buffer has one slot per sampled row.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
oof_preds_sample = np.zeros(len(df_sample))

In [None]:
# Hyper-parameters used for the model of every fold.
catboost_params = dict(
    iterations=800,
    learning_rate=0.03,
    depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50,
)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_sample, y_sample), start=1):
    print(f"\n---- Fold {fold} (sample) ----")

    # Positional split of features and labels for this fold.
    X_trn, y_trn = X_sample.iloc[trn_idx], y_sample.iloc[trn_idx]
    X_val, y_val = X_sample.iloc[val_idx], y_sample.iloc[val_idx]

    train_pool = Pool(X_trn, label=y_trn, cat_features=cat_cols)
    val_pool = Pool(X_val, label=y_val, cat_features=cat_cols)

    # The validation fold doubles as the eval set for early stopping / best-iteration selection.
    model = CatBoostClassifier(**catboost_params)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    # Keep the positive-class probability for the held-out rows.
    val_proba = model.predict_proba(val_pool)
    oof_preds_sample[val_idx] = val_proba[:, 1]


---- Fold 1 (sample) ----
0:	test: 0.9317953	best: 0.9317953 (0)	total: 800ms	remaining: 10m 38s
100:	test: 0.9545844	best: 0.9545844 (100)	total: 40.6s	remaining: 4m 41s
200:	test: 0.9576846	best: 0.9576846 (200)	total: 1m 19s	remaining: 3m 56s
300:	test: 0.9589135	best: 0.9589135 (300)	total: 1m 51s	remaining: 3m 5s
400:	test: 0.9596860	best: 0.9596860 (400)	total: 2m 23s	remaining: 2m 23s
500:	test: 0.9603323	best: 0.9603323 (500)	total: 2m 57s	remaining: 1m 46s
600:	test: 0.9607501	best: 0.9607511 (598)	total: 3m 32s	remaining: 1m 10s
700:	test: 0.9610360	best: 0.9610360 (700)	total: 4m 5s	remaining: 34.7s
799:	test: 0.9611952	best: 0.9611952 (799)	total: 4m 40s	remaining: 0us

bestTest = 0.9611951526
bestIteration = 799


---- Fold 2 (sample) ----
0:	test: 0.9297864	best: 0.9297864 (0)	total: 335ms	remaining: 4m 27s
100:	test: 0.9563587	best: 0.9563587 (100)	total: 31.2s	remaining: 3m 36s
200:	test: 0.9594041	best: 0.9594041 (200)	total: 1m 3s	remaining: 3m 8s
300:	test: 0.960603

In [None]:
print("\nSample OOF ROC AUC:", roc_auc_score(y_sample, oof_preds_sample))


Sample OOF ROC AUC: 0.9622655174934973


In [None]:
# --- Feature importance
# `model` and `train_pool` are whatever the preceding CV loop left behind (its last fold).
feature_importances = model.get_feature_importance(train_pool)

# Pair each feature name with its score and order from most to least important.
sorted_feats = list(zip(features, feature_importances))
sorted_feats.sort(key=lambda name_score: -name_score[1])

print("\nTop Feature Importances:")
top_ten = sorted_feats[:10]
for feat, score in top_ten:
    print(f"{feat:20}: {score:.2f}")


Top Feature Importances:
duration            : 38.10
month               : 8.46
contact             : 7.49
housing             : 6.29
duration_per_contact: 5.90
day                 : 5.56
balance             : 3.25
balance_log         : 2.98
age                 : 2.85
poutcome            : 2.82


In [None]:
# 📌 SELECT TOP FEATURES & RETRAIN FULL MODEL
# ==============================================================
top_n = 10
# Names of the highest-ranked features, in ranking order.
top_features = [pair[0] for pair in sorted_feats[:top_n]]
print("\nSelected top features:", top_features)

# Full training data (not the subsample), restricted to the selected columns.
y_full = df_train['y']
X_top = df_train[top_features]
X_test_top = df_test[top_features]

# Fresh 3-fold split plus zero-initialised buffers for OOF and test predictions.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
oof_preds_full = np.zeros(len(X_top))
test_preds_full = np.zeros(len(X_test_top))



Selected top features: ['duration', 'month', 'contact', 'housing', 'duration_per_contact', 'day', 'balance', 'balance_log', 'age', 'poutcome']


In [None]:
# Categorical columns that survived feature selection; identical for every fold.
top_cat_cols = [c for c in top_features if c in cat_cols]

# The test pool does not depend on the fold, so build it once instead of per iteration.
test_pool = Pool(X_test_top, cat_features=top_cat_cols)

# Reset the accumulator so re-running this cell does not add a second round of
# predictions on top of the first (the loop below uses +=).
test_preds_full = np.zeros(len(X_test_top))

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_top, y_full), 1):
    print(f"\n---- Fold {fold} (full/top features) ----")
    X_trn, X_val = X_top.iloc[trn_idx], X_top.iloc[val_idx]
    y_trn, y_val = y_full.iloc[trn_idx], y_full.iloc[val_idx]

    train_pool = Pool(X_trn, label=y_trn, cat_features=top_cat_cols)
    val_pool   = Pool(X_val, label=y_val, cat_features=top_cat_cols)

    # Build a fresh classifier for each fold instead of re-fitting whatever
    # `model` object an earlier cell left in the kernel. The hyper-parameters
    # are the same values used for the sample CV models.
    model = CatBoostClassifier(
        iterations=800,
        learning_rate=0.03,
        depth=8,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        verbose=100,
        early_stopping_rounds=50
    )
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    # Positive-class probabilities: OOF for the held-out rows, and a running
    # fold-average for the test set.
    oof_preds_full[val_idx] = model.predict_proba(val_pool)[:, 1]
    test_preds_full += model.predict_proba(test_pool)[:, 1] / skf.n_splits


---- Fold 1 (full/top features) ----
0:	test: 0.9140148	best: 0.9140148 (0)	total: 774ms	remaining: 10m 18s
100:	test: 0.9555928	best: 0.9555928 (100)	total: 1m 23s	remaining: 9m 41s
200:	test: 0.9591629	best: 0.9591629 (200)	total: 2m 47s	remaining: 8m 20s
300:	test: 0.9610250	best: 0.9610250 (300)	total: 4m 10s	remaining: 6m 55s
400:	test: 0.9618921	best: 0.9618921 (400)	total: 5m 34s	remaining: 5m 33s
500:	test: 0.9625263	best: 0.9625263 (500)	total: 7m 3s	remaining: 4m 12s
600:	test: 0.9630072	best: 0.9630072 (600)	total: 8m 31s	remaining: 2m 49s
700:	test: 0.9633520	best: 0.9633520 (700)	total: 10m 3s	remaining: 1m 25s
799:	test: 0.9636148	best: 0.9636148 (799)	total: 11m 34s	remaining: 0us

bestTest = 0.963614763
bestIteration = 799


---- Fold 2 (full/top features) ----
0:	test: 0.9145100	best: 0.9145100 (0)	total: 783ms	remaining: 10m 25s
100:	test: 0.9550181	best: 0.9550181 (100)	total: 1m 27s	remaining: 10m 8s
200:	test: 0.9588514	best: 0.9588514 (200)	total: 2m 57s	remainin

In [None]:
print("\nFull OOF ROC AUC (Top Features):", roc_auc_score(y_full, oof_preds_full))


Full OOF ROC AUC (Top Features): 0.9634741768361217


In [None]:
# 📌 SUBMISSION
# ==============================================================
# Put the fold-averaged test probabilities into the sample-submission frame.
# The array is assigned positionally, so this assumes sample_submission.csv
# lists rows in the same order as test.csv — TODO confirm.
df_submission['y'] = test_preds_full
df_submission.to_csv('submission_catboost_top_features.csv', index=False)

# Upload the CSV to the competition through the Kaggle CLI.
!kaggle competitions submit -c playground-series-s5e8 -f submission_catboost_top_features.csv -m "My submission from Colab"

100% 6.69M/6.69M [00:00<00:00, 16.2MB/s]
Successfully submitted to Binary Classification with a Bank Dataset