In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Preparation
- wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

In [3]:
pip install wget

Note: you may need to restart the kernel to use updated packages.


In [4]:
data = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

In [5]:
!python -m wget $data


Saved under bank+marketing.zip


In [6]:
import io
from zipfile import ZipFile

# The download is a zip of zips: 'bank.zip' is a member of 'bank+marketing.zip',
# not a file on disk. Read the inner archive out of the outer one, then extract
# its contents into the working directory.
with ZipFile('bank+marketing.zip', 'r') as outer_zip:
    with ZipFile(io.BytesIO(outer_zip.read('bank.zip')), 'r') as extrafile:
        extrafile.extractall()

In [7]:
df = pd.read_csv('bank-full.csv',sep=';')

## Question 1

What is the most frequent observation (mode) for the column `education`?
- unknown
- primary
- secondary
- tertiary

In [9]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [10]:
df.education.mode()

0    secondary
Name: education, dtype: object

## Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?
- age and balance
- day and campaign
- day and pdays
- pdays and previous


Cleaning data

In [13]:
# Clean dataset: put every column name into lower-case with underscores for spaces.
lowered_names = df.columns.str.lower()
df.columns = lowered_names.str.replace(' ', '_')

# Columns stored with the 'object' dtype are the ones treated as text/categorical.
object_mask = df.dtypes == 'object'
categorical_columns = list(df.dtypes[object_mask].index)

# Apply the same lower-casing / underscore substitution to the values of each text column.
for c in categorical_columns:
    lowered_values = df[c].str.lower()
    df[c] = lowered_values.str.replace(' ', '_')

In [14]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [15]:
# Null value count

In [16]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [17]:
numerical_features = list(df.dtypes[df.dtypes == 'int64'].index)
numerical_features

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [18]:
df.y.value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [19]:
# Encode the target as 1 for 'yes' and 0 otherwise. The dtype guard keeps the cell
# idempotent: comparing already-encoded integers with 'yes' would overwrite every
# label with 0 if the cell were run a second time.
if not pd.api.types.is_integer_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype('int')
df.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int32

Splitting data

In [21]:
# Splitting dataset
from sklearn.model_selection import train_test_split

In [22]:
# First split: hold out 20% of the rows as the test set.
df_full_train,df_test = train_test_split(df,test_size=0.2,random_state=42)
# Second split: 25% of the remaining rows become the validation set, so the
# result is roughly 60% train / 20% validation / 20% test of the original data.
df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [23]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [24]:
y_full_train = df_full_train.y.values
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [25]:
# Remove the target from the feature frames so it cannot leak into the model inputs.
# Reassigning (rather than inplace=True) with errors='ignore' keeps the cell safe to
# re-run after the column has already been dropped.
df_train = df_train.drop(columns='y', errors='ignore')
df_val = df_val.drop(columns='y', errors='ignore')
df_test = df_test.drop(columns='y', errors='ignore')

In [26]:
df.y.isnull().sum()

0

In [79]:
df_train.shape

(27126, 16)

In [81]:
df_val.shape

(9042, 16)

In [83]:
df_test.shape

(9043, 16)

In [27]:
df_full_train[numerical_features].isnull().sum()

age         0
balance     0
day         0
duration    0
campaign    0
pdays       0
previous    0
dtype: int64

In [28]:
df_full_train[numerical_features].nunique()

age           77
balance     6652
day           31
duration    1493
campaign      47
pdays        529
previous      40
dtype: int64

In [38]:
# Correlation matrix
correlation_matrix = df_full_train[numerical_features].corr()
abs_corr_matrix = correlation_matrix.abs()

In [44]:
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.098921,-0.007882,-0.007414,0.00316,-0.023231,0.002397
balance,0.098921,1.0,0.000455,0.02559,-0.018566,0.002122,0.015291
day,-0.007882,0.000455,1.0,-0.025719,0.160599,-0.094405,-0.053229
duration,-0.007414,0.02559,-0.025719,1.0,-0.086526,-0.001179,0.002557
campaign,0.00316,-0.018566,0.160599,-0.086526,1.0,-0.089317,-0.0333
pdays,-0.023231,0.002122,-0.094405,-0.001179,-0.089317,1.0,0.440662
previous,0.002397,0.015291,-0.053229,0.002557,-0.0333,0.440662,1.0


In [46]:
abs_corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.098921,0.007882,0.007414,0.00316,0.023231,0.002397
balance,0.098921,1.0,0.000455,0.02559,0.018566,0.002122,0.015291
day,0.007882,0.000455,1.0,0.025719,0.160599,0.094405,0.053229
duration,0.007414,0.02559,0.025719,1.0,0.086526,0.001179,0.002557
campaign,0.00316,0.018566,0.160599,0.086526,1.0,0.089317,0.0333
pdays,0.023231,0.002122,0.094405,0.001179,0.089317,1.0,0.440662
previous,0.002397,0.015291,0.053229,0.002557,0.0333,0.440662,1.0


In [50]:
# Blank out the diagonal (each feature's correlation with itself) so it cannot win
# the maximum. Build a masked frame instead of writing through `.values`: that array
# can be read-only when pandas Copy-on-Write is active, which makes an in-place fill fail.
diagonal_mask = np.eye(len(abs_corr_matrix), dtype=bool)
abs_corr_matrix = abs_corr_matrix.mask(diagonal_mask)

max_corr_value = abs_corr_matrix.max().max()
max_corr_value

0.4406621883723763

In [54]:
max_corr_features = abs_corr_matrix.stack().idxmax()
max_corr_features

('pdays', 'previous')

## Question 3

Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.  
Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
- contact
- education
- housing
- poutcome


In [89]:
from sklearn.metrics import mutual_info_score

In [91]:
categorical_features = list(df.dtypes[df.dtypes == 'object'].index)
categorical_features

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [93]:
df_train[categorical_features].head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,technician,single,tertiary,no,yes,yes,cellular,aug,unknown
1,entrepreneur,married,secondary,no,yes,yes,cellular,nov,unknown
2,blue-collar,married,secondary,no,yes,no,cellular,may,unknown
3,housemaid,married,primary,no,no,no,cellular,aug,unknown
4,self-employed,married,tertiary,no,no,no,cellular,aug,unknown


In [95]:
df_train[categorical_features].nunique()

job          12
marital       3
education     4
default       2
housing       2
loan          2
contact       3
month        12
poutcome      4
dtype: int64

In [97]:
def mutual_score(series):
    """Return the mutual information score between `series` and `y_train`."""
    score = mutual_info_score(labels_true=series, labels_pred=y_train)
    return score

In [99]:
mutual_info = df_train[categorical_features].apply(mutual_score)

In [101]:
mutual_info.sort_values(ascending=False).round(2)

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
loan         0.00
education    0.00
marital      0.00
default      0.00
dtype: float64

## Question 4

Now let's train a logistic regression.  
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.  
Fit the model on the training dataset.  
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:

```python
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
```

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9


In [104]:
def sigmoid(z):
    """Logistic function: return 1 / (1 + e^(-z)) for `z`."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [106]:
def logistic_regression(xi):
    """Return `sigmoid` of the linear score for the feature vector `xi`.

    Relies on two names that must already exist in the notebook namespace: the
    bias term `w0` and the weight sequence `w`. The score is
    w0 + sum_j xi[j] * w[j], taken over the indices of `w`.
    """
    # Seeding sum() with w0 keeps the accumulation order of an explicit loop:
    # bias first, then one weighted feature at a time.
    linear_score = sum((xi[j] * w[j] for j in range(len(w))), w0)
    return sigmoid(linear_score)

In [116]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [114]:
# Fit the vectorizer on the training rows only; this fixes the feature columns.
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical_features+numerical_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Only transform the validation rows: the vectorizer must keep the feature layout it
# learned from the training data, otherwise the columns of X_val may not line up with
# the columns the model is trained on.
val_dict = df_val[categorical_features+numerical_features].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [118]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [120]:
model.fit(X_train,y_train)

In [122]:
model.intercept_[0]

-0.7668669010421106

In [124]:
model.coef_[0].round(3)

array([ 0.000e+00,  0.000e+00, -8.100e-02,  3.250e-01,  1.390e-01,
       -1.230e+00,  9.000e-03, -4.640e-01, -3.030e-01,  4.000e-03,
       -3.800e-01, -1.830e-01, -1.000e-03, -2.040e-01, -2.800e-02,
       -7.380e-01,  1.300e-01, -1.900e-01, -2.180e-01, -3.470e-01,
       -6.100e-02,  2.540e-01, -2.690e-01, -8.300e-02,  3.050e-01,
       -1.240e-01,  3.500e-02, -1.980e-01, -1.510e-01, -6.160e-01,
       -2.710e-01, -3.990e-01, -9.800e-02, -2.000e-03, -7.080e-01,
        4.220e-01, -2.930e-01, -1.217e+00, -9.390e-01,  3.140e-01,
        1.498e+00, -5.040e-01, -9.190e-01,  7.730e-01,  8.070e-01,
       -0.000e+00, -7.500e-01, -5.450e-01,  1.499e+00, -9.710e-01,
        8.000e-03])

In [126]:
y_pred = model.predict_proba(X_val)[:, 1]

In [128]:
y_decision = (y_pred >= 0.5)

In [130]:
(y_val == y_decision).mean()

0.900353904003539

## Question 5

Let's find the least useful feature using the feature elimination technique.  
Train a model with all these features (using the same parameters as in Q4).  
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.  
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of the following features has the smallest difference?

- age
- balance
- marital
- previous

*Note: The difference doesn't have to be positive.*


In [189]:
from IPython.display import display

In [197]:
global_y = df_full_train.y.mean()

In [199]:
for c in categorical_features:
    print(c)
    # Mean of the target and row count for every category of this feature.
    df_group = df_full_train.groupby(c).y.agg(['mean', 'count'])
    # diff: absolute gap to the global target mean; risk: ratio to the global target mean.
    df_group = df_group.assign(
        diff=df_group['mean'] - global_y,
        risk=df_group['mean'] / global_y,
    )
    display(df_group)
    print('\n')

job


Unnamed: 0_level_0,mean,count,diff,risk
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
admin.,0.119894,4137,0.003824,1.032947
blue-collar,0.072252,7806,-0.043817,0.62249
entrepreneur,0.086587,1178,-0.029482,0.745997
housemaid,0.093812,1002,-0.022257,0.808243
management,0.135467,7500,0.019397,1.167117
retired,0.22259,1815,0.10652,1.917727
self-employed,0.122862,1286,0.006792,1.058518
services,0.090746,3350,-0.025323,0.781827
student,0.278976,742,0.162906,2.403524
technician,0.10876,6096,-0.00731,0.937024




marital


Unnamed: 0_level_0,mean,count,diff,risk
marital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
divorced,0.11972,4143,0.003651,1.031451
married,0.100578,21804,-0.015492,0.866532
single,0.147637,10221,0.031568,1.271973




education


Unnamed: 0_level_0,mean,count,diff,risk
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
primary,0.086377,5476,-0.029693,0.744183
secondary,0.105303,18670,-0.010767,0.907238
tertiary,0.14897,10539,0.032901,1.28346
unknown,0.127444,1483,0.011375,1.098001




default


Unnamed: 0_level_0,mean,count,diff,risk
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.117128,35491,0.001059,1.009122
yes,0.060561,677,-0.055508,0.521768




housing


Unnamed: 0_level_0,mean,count,diff,risk
housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.166407,16033,0.050337,1.433683
yes,0.075987,20135,-0.040082,0.654669




loan


Unnamed: 0_level_0,mean,count,diff,risk
loan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.125441,30333,0.009371,1.08074
yes,0.067352,5835,-0.048717,0.580275




contact


Unnamed: 0_level_0,mean,count,diff,risk
contact,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cellular,0.14827,23464,0.0322,1.277422
telephone,0.137128,2319,0.021059,1.181431
unknown,0.038613,10385,-0.077456,0.332675




month


Unnamed: 0_level_0,mean,count,diff,risk
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
apr,0.202645,2344,0.086576,1.745895
aug,0.110954,4966,-0.005115,0.955932
dec,0.464481,183,0.348411,4.001749
feb,0.156723,2112,0.040654,1.350256
jan,0.098678,1135,-0.017391,0.850167
jul,0.091927,5537,-0.024142,0.792
jun,0.098888,4318,-0.017181,0.851976
mar,0.516043,374,0.399973,4.445983
may,0.065219,10963,-0.05085,0.5619
nov,0.101481,3173,-0.014588,0.874315




poutcome


Unnamed: 0_level_0,mean,count,diff,risk
poutcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
failure,0.126891,3901,0.010821,1.093229
other,0.170341,1497,0.054271,1.467575
success,0.641322,1210,0.525253,5.525332
unknown,0.090392,29560,-0.025677,0.778779






## Question 6

Now let's train a regularized logistic regression.  
Let's try the following values of the parameter `C`: \[0.01, 0.1, 1, 10, 100\].  
Train models using all the features as in Q4.  
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

*Note: If there are multiple options, select the smallest `C`.*
