In [1]:
# --- prerequisites ---
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
import matplotlib.pyplot as plt

# --- load preprocessed data (saved by your pipeline) ---
train = pd.read_csv("train_preprocessed.csv")
test  = pd.read_csv("test_preprocessed.csv")

X_train = train.drop(columns=["deposit"])
y_train = train["deposit"]
X_test  = test.drop(columns=["deposit"])
y_test  = test["deposit"]

# --- helper to evaluate models ---
def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:,1]
    else:
        # xgboost sklearn wrapper has predict_proba; fallback:
        y_proba = model.predict(X_test)
    print(f"--- {name} ---")
    print(classification_report(y_test, y_pred, digits=4))
    try:
        auc = roc_auc_score(y_test, y_proba)
        print(f"AUC: {auc:.4f}")
    except:
        pass
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", cm)
    print()

# --- 1) Logistic Regression (baseline) ---
lr = LogisticRegression(max_iter=2000, solver="liblinear")  # liblinear fine for baseline
lr.fit(X_train, y_train)
evaluate_model("Logistic Regression", lr, X_test, y_test)

# --- 2) Random Forest (baseline) ---
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
evaluate_model("Random Forest", rf, X_test, y_test)

# feature importance (RF)
importances = rf.feature_importances_
feat_names = X_train.columns
imp_df = pd.DataFrame({"feature":feat_names, "importance":importances}).sort_values("importance", ascending=False).head(20)
print("Top RF features:\n", imp_df)

# --- 3) XGBoost (sklearn API) ---
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", n_jobs=-1, random_state=42, n_estimators=200)
xgb_clf.fit(X_train, y_train)
evaluate_model("XGBoost", xgb_clf, X_test, y_test)

# XGBoost feature importance
xgb_imp = xgb_clf.get_booster().get_score(importance_type='gain')
xgb_imp_df = pd.DataFrame.from_dict(xgb_imp, orient='index', columns=['gain']).sort_values('gain', ascending=False).head(20)
print("Top XGBoost features:\n", xgb_imp_df)


--- Logistic Regression ---
              precision    recall  f1-score   support

           0     0.6634    0.8170    0.7323      1175
           1     0.7265    0.5397    0.6193      1058

    accuracy                         0.6856      2233
   macro avg     0.6950    0.6784    0.6758      2233
weighted avg     0.6933    0.6856    0.6787      2233

AUC: 0.7410
Confusion matrix:
 [[960 215]
 [487 571]]

--- Random Forest ---
              precision    recall  f1-score   support

           0     0.7004    0.8077    0.7502      1175
           1     0.7426    0.6163    0.6736      1058

    accuracy                         0.7170      2233
   macro avg     0.7215    0.7120    0.7119      2233
weighted avg     0.7204    0.7170    0.7139      2233

AUC: 0.7719
Confusion matrix:
 [[949 226]
 [406 652]]

Top RF features:
                 feature  importance
1               balance    0.165418
0                   age    0.148279
2                   day    0.135233
3              campaign 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- XGBoost ---
              precision    recall  f1-score   support

           0     0.6962    0.7762    0.7340      1175
           1     0.7151    0.6238    0.6663      1058

    accuracy                         0.7040      2233
   macro avg     0.7056    0.7000    0.7002      2233
weighted avg     0.7051    0.7040    0.7019      2233

AUC: 0.7530
Confusion matrix:
 [[912 263]
 [398 660]]

Top XGBoost features:
                        gain
poutcome_success  34.444767
month_mar          8.225923
month_oct          5.523234
housing            4.811553
month_may          3.515455
month_feb          2.956156
loan               2.927634
month_sep          2.646837
month_num          2.602142
month_nov          2.507586
month_jun          2.383405
poutcome_other     2.281432
day                2.065992
job_student        2.065295
job_blue-collar    1.828633
marital_single     1.782439
month_jul          1.757674
marital_married    1.754774
month_aug          1.710684
age                

In [2]:
!pip install xgboost

