In [37]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

In [38]:
df = pd.read_csv('bank-full.csv',sep=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [39]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [40]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [41]:
del[df['default']]
del[df['loan']]

In [43]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
#df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')

In [44]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

##### Question 1 - What is the most frequent observation (mode) for the column education?

In [45]:
df.education.value_counts()
# Answer: secondary 23202

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

##### Question 2 - What are the two features that have the biggest correlation?

In [46]:
numerical = ['age', 'balance', 'day', 'campaign','pdays','previous']

In [47]:
# Assuming df_train is your DataFrame with numerical features
correlation_matrix = df[numerical].corr()

# Display the correlation matrix
print(correlation_matrix)

# Answer: pdays and previous

               age   balance       day  campaign     pdays  previous
age       1.000000  0.097783 -0.009120  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000  0.162490 -0.093044 -0.051710
campaign  0.004760 -0.014578  0.162490  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710 -0.032855  0.454820  1.000000


#### Setting up the validation framework

* Perform the train/validation/test split with Scikit-Learn

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
df.y = (df.y == 'yes').astype(int)

In [58]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25001, random_state=42)

In [59]:
len(df_train), len(df_val), len(df_test)

(27125, 9043, 9043)

In [60]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [61]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

##### Question 3 - Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only

In [71]:
from sklearn.metrics import mutual_info_score

def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.y)

In [72]:
categorical = list(df_full_train.dtypes[df_full_train.dtypes == 'object'].index)
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [73]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

poutcome     0.029257
month        0.024774
contact      0.014164
housing      0.009800
job          0.007765
education    0.002458
marital      0.002019
dtype: float64

In [74]:
# Answer : poutcome

#### 4 Question - Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [75]:
from sklearn.feature_extraction import DictVectorizer

In [81]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

#### Logistic regression

In [82]:
from sklearn.linear_model import LogisticRegression

In [84]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [85]:
model.intercept_[0]

np.float64(-0.5095931028647195)

In [87]:
model.coef_[0].round(2)

array([ 0.  ,  0.  , -0.08,  0.33,  0.11, -0.95,  0.  , -0.28, -0.14,
        0.04, -0.12,  0.04, -0.55, -0.01, -0.12, -0.18, -0.29, -0.1 ,
        0.29, -0.22, -0.07,  0.23, -0.13,  0.24, -0.15, -0.17, -0.33,
       -0.01,  0.13, -0.71,  0.46, -0.39, -0.96, -0.7 ,  0.21,  1.18,
       -0.44, -0.84,  0.75,  0.8 , -0.  , -0.74, -0.48,  1.45, -0.75,
        0.01])

In [90]:
y_pred = model.predict_proba(X_val)[:, 1]

In [91]:
y_pred

array([0.04873691, 0.03591798, 0.0497742 , ..., 0.02367599, 0.08685575,
       0.09369874])