In [609]:
import pandas as pd

In [610]:
cols = [
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y"
]

In [611]:
df = pd.read_csv("./bank/bank-full.csv", delimiter=';')[cols]

In [612]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [613]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [614]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df.select_dtypes(include=['int64']).columns.tolist()

categorical_cols.remove('y')

In [615]:
display(categorical_cols)
display(numeric_cols)

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [616]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# Question 1:

In [617]:
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

# Question 2:

In [618]:
cors_df = df[['age','balance','day','campaign','pdays','previous']].corr()

for col in cors_df.columns:
    display(cors_df[[col]])

Unnamed: 0,age
age,1.0
balance,0.097783
day,-0.00912
campaign,0.00476
pdays,-0.023758
previous,0.001288


Unnamed: 0,balance
age,0.097783
balance,1.0
day,0.004503
campaign,-0.014578
pdays,0.003435
previous,0.016674


Unnamed: 0,day
age,-0.00912
balance,0.004503
day,1.0
campaign,0.16249
pdays,-0.093044
previous,-0.05171


Unnamed: 0,campaign
age,0.00476
balance,-0.014578
day,0.16249
campaign,1.0
pdays,-0.088628
previous,-0.032855


Unnamed: 0,pdays
age,-0.023758
balance,0.003435
day,-0.093044
campaign,-0.088628
pdays,1.0
previous,0.45482


Unnamed: 0,previous
age,0.001288
balance,0.016674
day,-0.05171
campaign,-0.032855
pdays,0.45482
previous,1.0


# Target encoding

In [619]:
df.y = (df.y == 'yes').astype('int')

# Split the data

In [620]:
from sklearn.model_selection import train_test_split

In [621]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [622]:
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=42)

In [623]:
y_train = df_train.y.values
y_val = df_val.y.values

In [624]:
del df_train['y']
del df_val['y']

# Question 3: Mutual Information

In [625]:
from sklearn.metrics import mutual_info_score

In [626]:
mi = df_train[categorical_cols].apply(lambda col: mutual_info_score(col, y_train))
round(mi,2).sort_values(ascending=False)

month        0.03
poutcome     0.03
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

# One hot encoding

In [627]:
from sklearn.feature_extraction import DictVectorizer

In [628]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_cols + numeric_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_cols + numeric_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [629]:
train_dict

[{'job': 'admin.',
  'marital': 'single',
  'education': 'secondary',
  'housing': 'yes',
  'contact': 'unknown',
  'month': 'may',
  'poutcome': 'unknown',
  'age': 24,
  'balance': -507,
  'day': 27,
  'duration': 446,
  'campaign': 1,
  'pdays': -1,
  'previous': 0},
 {'job': 'admin.',
  'marital': 'married',
  'education': 'unknown',
  'housing': 'no',
  'contact': 'unknown',
  'month': 'jun',
  'poutcome': 'unknown',
  'age': 47,
  'balance': 989,
  'day': 19,
  'duration': 116,
  'campaign': 1,
  'pdays': -1,
  'previous': 0},
 {'job': 'technician',
  'marital': 'married',
  'education': 'tertiary',
  'housing': 'yes',
  'contact': 'cellular',
  'month': 'aug',
  'poutcome': 'unknown',
  'age': 55,
  'balance': 1097,
  'day': 7,
  'duration': 191,
  'campaign': 4,
  'pdays': -1,
  'previous': 0},
 {'job': 'housemaid',
  'marital': 'married',
  'education': 'primary',
  'housing': 'no',
  'contact': 'cellular',
  'month': 'aug',
  'poutcome': 'unknown',
  'age': 46,
  'balance': 0

# Train model:

In [630]:
from sklearn.linear_model import LogisticRegression

In [631]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [632]:
model.fit(X_train, y_train)

In [633]:
y_pred = model.predict_proba(X_val)[:, 1] # 1 is just get the second value, which means get the proability of positive

In [634]:
y_decision = (y_pred >= 0.5)

In [635]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = y_pred.astype(int)
df_pred['actual'] = y_val

In [636]:
df_pred['correct'] = df_pred.prediction == df_pred.actual
origin_accuracy = round(df_pred.correct.mean(),2)

# Question 5

In [637]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'age': -0.001,
 'balance': 0.0,
 'campaign': -0.08,
 'contact=cellular': 0.285,
 'contact=telephone': 0.09,
 'contact=unknown': -1.326,
 'day': 0.009,
 'duration': 0.004,
 'education=primary': -0.447,
 'education=secondary': -0.248,
 'education=tertiary': -0.042,
 'education=unknown': -0.213,
 'housing=no': -0.119,
 'housing=yes': -0.832,
 'job=admin.': 0.121,
 'job=blue-collar': -0.17,
 'job=entrepreneur': -0.258,
 'job=housemaid': -0.329,
 'job=management': -0.107,
 'job=retired': 0.203,
 'job=self-employed': -0.304,
 'job=services': -0.146,
 'job=student': 0.343,
 'job=technician': -0.119,
 'job=unemployed': -0.035,
 'job=unknown': -0.151,
 'marital=divorced': -0.267,
 'marital=married': -0.477,
 'marital=single': -0.207,
 'month=apr': -0.048,
 'month=aug': -0.783,
 'month=dec': 0.415,
 'month=feb': -0.336,
 'month=jan': -1.144,
 'month=jul': -1.056,
 'month=jun': 0.309,
 'month=mar': 1.518,
 'month=may': -0.521,
 'month=nov': -1.036,
 'month=oct': 0.913,
 'month=sep': 0.819,
 'pda

The least useful features is:

- 'age': -0.001,
- 'balance': 0.0,
- 'previous': 0.005


## Train without age:

In [638]:
df_train_small = df_train.loc[:, df_train.columns != 'age']
df_val_small = df_val.loc[:, df_val.columns != 'age']
df_test_small = df_test.loc[:, df_test.columns != 'age']
categorical_cols_small = categorical_cols.copy()
numeric_cols_small = numeric_cols.copy()
numeric_cols_small.remove("age")

In [639]:
dv = DictVectorizer(sparse=False)

train_dict_small = df_train_small[categorical_cols_small+ numeric_cols_small].to_dict(orient='records')
X_train_small = dv.fit_transform(train_dict_small)

val_dict_small = df_val_small[categorical_cols_small + numeric_cols_small].to_dict(orient='records')
X_val_small = dv.transform(val_dict_small)

In [640]:
model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, y_train)

In [641]:
y_pred_small = model_small.predict_proba(X_val_small)[:,1]

In [642]:
df_pred_small = pd.DataFrame()
df_pred_small['probability'] = y_pred_small
df_pred_small['prediction'] = y_pred_small.astype(int)
df_pred_small['actual'] = y_val
df_pred_small['correct'] = df_pred_small.prediction == df_pred_small.actual

In [643]:
remove_age_accuracy = df_pred_small.correct.mean()
display(remove_age_accuracy)

0.8835455764075067

## Train without balance:

In [644]:
df_train_small = df_train.loc[:, df_train.columns != 'balance']
df_val_small = df_val.loc[:, df_val.columns != 'balance']
df_test_small = df_test.loc[:, df_test.columns != 'balance']
categorical_cols_small = categorical_cols.copy()
numeric_cols_small = numeric_cols.copy()
numeric_cols_small.remove("balance")

In [645]:
dv = DictVectorizer(sparse=False)

train_dict_small = df_train_small[categorical_cols_small+ numeric_cols_small].to_dict(orient='records')
X_train_small = dv.fit_transform(train_dict_small)

val_dict_small = df_val_small[categorical_cols_small + numeric_cols_small].to_dict(orient='records')
X_val_small = dv.transform(val_dict_small)

In [646]:
model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, y_train)

In [647]:
y_pred_small = model_small.predict_proba(X_val_small)[:,1]

In [648]:
df_pred_small = pd.DataFrame()
df_pred_small['probability'] = y_pred_small
df_pred_small['prediction'] = y_pred_small.astype(int)
df_pred_small['actual'] = y_val
df_pred_small['correct'] = df_pred_small.prediction == df_pred_small.actual
remove_balance_accuracy = df_pred_small.correct.mean()
display(remove_balance_accuracy)

0.8835455764075067

## Train without marital:

In [649]:
df_train_small = df_train.loc[:, df_train.columns != 'marital']
df_val_small = df_val.loc[:, df_val.columns != 'marital']
df_test_small = df_test.loc[:, df_test.columns != 'marital']
categorical_cols_small = categorical_cols.copy()
categorical_cols_small.remove('marital')
numeric_cols_small = numeric_cols.copy()

In [650]:
dv = DictVectorizer(sparse=False)

train_dict_small = df_train_small[categorical_cols_small+ numeric_cols_small].to_dict(orient='records')
X_train_small = dv.fit_transform(train_dict_small)

val_dict_small = df_val_small[categorical_cols_small + numeric_cols_small].to_dict(orient='records')
X_val_small = dv.transform(val_dict_small)

In [651]:
model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, y_train)

In [652]:
y_pred_small = model_small.predict_proba(X_val_small)[:,1]

In [653]:
df_pred_small = pd.DataFrame()
df_pred_small['probability'] = y_pred_small
df_pred_small['prediction'] = y_pred_small.astype(int)
df_pred_small['actual'] = y_val
df_pred_small['correct'] = df_pred_small.prediction == df_pred_small.actual
remove_marital_accuracy = df_pred_small.correct.mean()
display(remove_marital_accuracy)

0.8835455764075067

## Train without previous:

In [654]:
df_train_small = df_train.loc[:, df_train.columns != 'previous']
df_val_small = df_val.loc[:, df_val.columns != 'previous']
df_test_small = df_test.loc[:, df_test.columns != 'previous']
categorical_cols_small = categorical_cols.copy()
numeric_cols_small = numeric_cols.copy()
numeric_cols_small.remove('previous')

In [655]:
dv = DictVectorizer(sparse=False)

train_dict_small = df_train_small[categorical_cols_small+ numeric_cols_small].to_dict(orient='records')
X_train_small = dv.fit_transform(train_dict_small)

val_dict_small = df_val_small[categorical_cols_small + numeric_cols_small].to_dict(orient='records')
X_val_small = dv.transform(val_dict_small)

In [656]:
model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, y_train)

In [657]:
y_pred_small = model_small.predict_proba(X_val_small)[:,1]

In [658]:
df_pred_small = pd.DataFrame()
df_pred_small['probability'] = y_pred_small
df_pred_small['prediction'] = y_pred_small.astype(int)
df_pred_small['actual'] = y_val
df_pred_small['correct'] = df_pred_small.prediction == df_pred_small.actual
remove_previous_accuracy = df_pred_small.correct.mean()
display(remove_previous_accuracy)

0.8835455764075067

In [659]:

diff_accuracies = pd.Series(
  [remove_age_accuracy, remove_balance_accuracy, remove_marital_accuracy,remove_previous_accuracy],
  index=['age', 'balance','marital','previous']
)
display(diff_accuracies)

age         0.883546
balance     0.883546
marital     0.883546
previous    0.883546
dtype: float64