In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Dataset

In [128]:
# Read the semicolon-delimited CSV that the rest of the notebook works on.
DATA_FILE = "bank-full.csv"
df = pd.read_csv(DATA_FILE, sep=";")

In [129]:
# Preview the first two rows to check that the file parsed into separate columns.
df.head(2)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no


In [130]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


## Features & prep

In [131]:
# Columns kept for modelling; the last entry, "y", is the target.
features = [
    "age", "job", "marital", "education", "balance",
    "housing", "contact", "day", "month", "duration",
    "campaign", "pdays", "previous", "poutcome",
    "y",
]

In [132]:
# Keep only the modelling columns. The explicit copy makes `df` an independent
# frame, so later column assignments on it do not raise SettingWithCopyWarning
# or depend on pandas' view-vs-copy behaviour.
df = df[features].copy()
df.head(2)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no


In [133]:
# Count missing values per column.
df.isna().sum() # every count is 0: no missing values

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [134]:
# Inspect column dtypes to tell numeric columns from string (object) columns.
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [135]:
# Columns stored with object dtype (strings) are treated as categorical.
object_frame = df.select_dtypes(include="object")
categorical_cols = object_frame.columns
categorical_cols

Index(['job', 'marital', 'education', 'housing', 'contact', 'month',
       'poutcome', 'y'],
      dtype='object')

In [136]:
# Report the number of distinct values and the most frequent value of every
# categorical column, one line per column.
summary_lines = [
    f"{col}: # unique = {df[col].nunique()}, mode = {df[col].mode()[0]}"
    for col in categorical_cols
]
for line in summary_lines:
    print(line)

# Q1 -> education: secondary

job: # unique = 12, mode = blue-collar
marital: # unique = 3, mode = married
education: # unique = 4, mode = secondary
housing: # unique = 2, mode = yes
contact: # unique = 3, mode = cellular
month: # unique = 12, mode = may
poutcome: # unique = 4, mode = unknown
y: # unique = 2, mode = no


In [137]:
# Select every numeric column rather than hard-coding 'int64': that literal
# would silently skip float columns and integer columns read with another width
# (e.g. int32), leaving them out of the correlation analysis.
numerical_cols = df.select_dtypes(include="number").columns
numerical_cols

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')

In [138]:
# Pairwise correlation matrix of the numeric columns.
numeric_frame = df[numerical_cols]
cm = numeric_frame.corr()
cm

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [139]:
# Flatten the correlation matrix into a Series indexed by (column, column) pairs.
# It is recomputed from the data instead of unstacking `cm` in place: calling
# `.unstack()` on the already-flattened Series would pivot it back into a
# matrix, so re-running this cell would silently toggle the shape of `cm`.
cm = df[numerical_cols].corr().unstack()

In [140]:
# Drop each column's correlation with itself by comparing the two index labels,
# not by testing the value against 1: a value filter would also hide a genuinely
# perfect correlation between two different columns.
is_self_pair = cm.index.get_level_values(0) == cm.index.get_level_values(1)
cm[~is_self_pair].sort_values(ascending=False) # Q2 -> previous & pdays

previous  pdays       0.454820
pdays     previous    0.454820
campaign  day         0.162490
day       campaign    0.162490
balance   age         0.097783
age       balance     0.097783
duration  balance     0.021560
balance   duration    0.021560
previous  balance     0.016674
balance   previous    0.016674
campaign  age         0.004760
age       campaign    0.004760
day       balance     0.004503
balance   day         0.004503
          pdays       0.003435
pdays     balance     0.003435
previous  age         0.001288
age       previous    0.001288
previous  duration    0.001203
duration  previous    0.001203
pdays     duration   -0.001565
duration  pdays      -0.001565
          age        -0.004648
age       duration   -0.004648
          day        -0.009120
day       age        -0.009120
balance   campaign   -0.014578
campaign  balance    -0.014578
pdays     age        -0.023758
age       pdays      -0.023758
day       duration   -0.030206
duration  day        -0.030206
campaign

## Target binarization, train/val/test split & mutual information

In [141]:
# Binarize the target: "yes" -> 1, anything else -> 0.
# `assign` builds a new frame instead of writing into a frame derived from
# another one, and the dtype guard keeps the cell idempotent: once `y` is
# numeric, comparing it to "yes" again would turn every label into 0.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df = df.assign(y=(df["y"] == "yes").astype(int))

In [142]:
# Peek at the first few target values.
df["y"].head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [143]:
from sklearn.model_selection import train_test_split

In [144]:
# 80/20 split into train_full/test, then 75/25 of train_full into train/val
# (0.25 * 0.8 = 0.2, i.e. a 60/20/20 split overall).
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Give every partition a fresh 0..n-1 index.
df_train_full, df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_train_full, df_train, df_val, df_test)
)

# `pop` returns the target column and removes it from the feature frame in one step.
y_train_full = df_train_full.pop("y")
y_train = df_train.pop("y")
y_val = df_val.pop("y")
y_test = df_test.pop("y")

In [145]:
# Sanity checks: the target must be gone from each feature frame, and features
# and labels must have matching lengths.
for frame, target in ((df_train, y_train), (df_val, y_val), (df_test, y_test)):
    assert "y" not in frame.columns
    assert len(frame) == len(target)

In [146]:
# Categorical predictors only: the target column is removed explicitly first.
predictors = df.drop(columns=["y"])
categorical_cols = predictors.select_dtypes(include="object").columns
categorical_cols

Index(['job', 'marital', 'education', 'housing', 'contact', 'month',
       'poutcome'],
      dtype='object')

In [147]:
from sklearn.metrics import mutual_info_score

In [148]:
# Mutual information between the training target and each categorical feature.
mi_scores = {col: mutual_info_score(y_train, df_train[col]) for col in categorical_cols}

for col, score in mi_scores.items():
    print(f"{col}: {round(score, 2)}")

# At two decimals several features tie, so the rounded printout alone cannot
# identify the winner; name the top feature from the unrounded scores.
top_feature = max(mi_scores, key=mi_scores.get)
print(f"highest: {top_feature} ({mi_scores[top_feature]:.4f})")

# Q3 -> poutcome

job: 0.01
marital: 0.0
education: 0.0
housing: 0.01
contact: 0.01
month: 0.03
poutcome: 0.03


## Model

In [149]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [150]:
# Baseline classifier with a fixed random_state.
model = LogisticRegression(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=42,
)

In [151]:
# Fit the vectorizer on the training rows only (one dict per row).
train_records = df_train.to_dict(orient="records")
dv = DictVectorizer(sparse=False)
dv.fit(train_records)

In [152]:
# Turn each partition into a dense feature matrix with the fitted vectorizer.
X_train, X_val, X_test = (
    dv.transform(frame.to_dict(orient="records"))
    for frame in (df_train, df_val, df_test)
)

In [153]:
# Fit the logistic regression on the training matrix and training labels.
model.fit(X_train, y_train)

In [154]:
# Map each vectorizer output column to its fitted coefficient.
# Stored under its own name: rebinding `features` (the list of column names used
# to select columns from `df`) to this dict would break re-running that
# column-selection cell.
coefficients = dict(zip(dv.get_feature_names_out(), model.coef_[0]))
coefficients

{'age': np.float64(0.0007560903737006148),
 'balance': np.float64(1.0239553020929878e-05),
 'campaign': np.float64(-0.07814084474109283),
 'contact=cellular': np.float64(0.2537838620919929),
 'contact=telephone': np.float64(0.08125060157373194),
 'contact=unknown': np.float64(-1.3127137567910776),
 'day': np.float64(0.009198024125866738),
 'duration': np.float64(0.0041575846868236324),
 'education=primary': np.float64(-0.44427774771079187),
 'education=secondary': np.float64(-0.25052080428985285),
 'education=tertiary': np.float64(-0.0545585912680546),
 'education=unknown': np.float64(-0.22832214985659563),
 'housing=no': np.float64(-0.14263942186403844),
 'housing=yes': np.float64(-0.8350398712613015),
 'job=admin.': np.float64(0.0954729817941705),
 'job=blue-collar': np.float64(-0.24204889589842715),
 'job=entrepreneur': np.float64(-0.2639677224268222),
 'job=housemaid': np.float64(-0.33140003704332366),
 'job=management': np.float64(-0.08110956077560488),
 'job=retired': np.float64(

In [155]:
# Intercept (bias term) of the fitted model.
model.intercept_[0]

np.float64(-0.9776792931251067)

## Eval

In [156]:
# Second column of predict_proba, thresholded at 0.5 into hard 0/1 predictions.
val_proba_matrix = model.predict_proba(X_val)
eval_probs = val_proba_matrix[:, 1]
eval_preds = (eval_probs >= 0.5).astype(int)
eval_preds

array([0, 0, 0, ..., 0, 0, 0])

In [157]:
# Validation accuracy: share of predictions that match the true labels.
is_correct = eval_preds == y_val
eval_acc = is_correct.mean()
round(eval_acc, 2) # Q4 -> 0.9 acc

np.float64(0.9)

In [158]:
# Side-by-side view of labels, hard predictions and probabilities on the validation set.
df_pred_eval = pd.DataFrame(
    {
        "actual": y_val,
        "predicted": eval_preds,
        "proba": eval_probs,
    }
)
df_pred_eval["correct"] = df_pred_eval["actual"] == df_pred_eval["predicted"]
df_pred_eval

Unnamed: 0,actual,predicted,proba,correct
0,0,0,0.012398,True
1,0,0,0.010117,True
2,1,0,0.154691,False
3,0,0,0.226219,True
4,1,0,0.443337,False
...,...,...,...,...
9037,0,0,0.022102,True
9038,1,0,0.265153,False
9039,0,0,0.055921,True
9040,0,0,0.009051,True


## Feature elimination

In [159]:
# Leave-one-feature-out: retrain without each feature in turn and compare the
# validation accuracy against the model trained on all features.


def _fit_and_score(X_t, X_v):
    """Fit a fresh logistic regression on X_t / y_train and return its accuracy on X_v / y_val.

    Predictions use the same 0.5 probability threshold as the baseline evaluation.
    """
    clf = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    clf.fit(X_t, y_train)
    preds = (clf.predict_proba(X_v)[:, 1] >= 0.5).astype(int)
    return (preds == y_val).mean()


# Recompute the baseline here instead of reading the global `eval_acc`: other
# cells overwrite that name, so relying on it gives a wrong benchmark whenever
# this cell is re-run. The baseline `model` / `eval_*` globals are no longer
# overwritten by this loop either.
benchmark_acc = _fit_and_score(X_train, X_val)
acc_without = {}

for feature in df_train.columns:
    # `dv` stays fitted on the full column set, so the dropped feature's output
    # columns are still present in the matrix but contain only zeros.
    X_t = dv.transform(df_train.drop(columns=[feature]).to_dict(orient="records"))
    X_v = dv.transform(df_val.drop(columns=[feature]).to_dict(orient="records"))
    acc = _fit_and_score(X_t, X_v)
    print(f"feature: {feature}, acc: {round(acc, 4)}, diff: {round(acc - benchmark_acc, 4)}")
    acc_without[feature] = acc

results = pd.DataFrame(acc_without.items(), columns=["feature", "acc"])


feature: age, acc: 0.901, diff: 0.0002
feature: job, acc: 0.9009, diff: 0.0001
feature: marital, acc: 0.9007, diff: -0.0001
feature: education, acc: 0.901, diff: 0.0002
feature: balance, acc: 0.9008, diff: 0.0
feature: housing, acc: 0.9011, diff: 0.0003
feature: contact, acc: 0.9008, diff: 0.0
feature: day, acc: 0.9012, diff: 0.0004
feature: month, acc: 0.8999, diff: -0.0009
feature: duration, acc: 0.8898, diff: -0.0109
feature: campaign, acc: 0.9007, diff: -0.0001
feature: pdays, acc: 0.9013, diff: 0.0006
feature: previous, acc: 0.9015, diff: 0.0007
feature: poutcome, acc: 0.8936, diff: -0.0072


In [173]:
# Signed and absolute accuracy change versus the benchmark, largest impact first.
results = (
    results
    .assign(diff=lambda table: table["acc"] - benchmark_acc)
    .assign(abs_diff=lambda table: table["diff"].abs())
    .sort_values(by="abs_diff", ascending=False)
)
results # Q5 -> balance

Unnamed: 0,feature,acc,diff,abs_diff
9,duration,0.889847,-0.010949,0.010949
13,poutcome,0.893608,-0.007189,0.007189
8,month,0.899912,-0.000885,0.000885
12,previous,0.90146,0.000664,0.000664
11,pdays,0.901349,0.000553,0.000553
7,day,0.901239,0.000442,0.000442
5,housing,0.901128,0.000332,0.000332
0,age,0.901017,0.000221,0.000221
3,education,0.901017,0.000221,0.000221
2,marital,0.900686,-0.000111,0.000111


## Regularization

In [161]:
# Sweep the regularization parameter C (inverse regularization strength in
# scikit-learn) and report validation accuracy for each value.
reg_params = [0.01, 0.1, 1, 10, 100]

for r in reg_params:
    # Loop-local names keep the baseline `model`, `eval_probs`, `eval_preds` and
    # `eval_acc` globals intact, so cells that read them still see the baseline
    # after this sweep has run.
    reg_model = LogisticRegression(solver="liblinear", C=r, max_iter=1000, random_state=42)
    reg_model.fit(X_train, y_train)
    reg_probs = reg_model.predict_proba(X_val)[:, 1]
    reg_preds = (reg_probs >= 0.5).astype(int)
    reg_acc = (reg_preds == y_val).mean()
    print(f"reg param: {r}, acc: {round(reg_acc, 3)}")

reg param: 0.01, acc: 0.899
reg param: 0.1, acc: 0.9
reg param: 1, acc: 0.901
reg param: 10, acc: 0.901
reg param: 100, acc: 0.901
