In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Preparation
- wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

In [2]:
pip install wget

Note: you may need to restart the kernel to use updated packages.


In [3]:
data = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

In [4]:
!python -m wget $data -o bank_marketing.zip

-1 / unknown
Saved under bank_marketing (2).zip


In [5]:
from zipfile import ZipFile

with ZipFile('bank_marketing.zip','r') as zipfile:
   with ZipFile('bank.zip','r') as extrafile:
        extrafile.extractall()

In [6]:
df = pd.read_csv('bank-full.csv',sep=';')
df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


## Question 1

What is the most frequent observation (mode) for the column `education`?
- unknown
- primary
- secondary
- tertiary
tiary

In [7]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [None]:
df.education.mode()

0    secondary
Name: education, dtype: object

## Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?
- age and balance
- day and campaign
- day and pdays
- pdays and previous


Cleaning data

In [9]:
# Clean dataset
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [10]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [11]:
# Null value count

In [12]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [None]:
numerical_features = list(df.dtypes[df.dtypes == 'int64'].index)
numerical_features

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [14]:
df.y.value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [15]:
df.y = (df.y == 'yes').astype('int')
df.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

Splitting data

In [16]:
# Splitting dataset
from sklearn.model_selection import train_test_split

In [17]:
df_full_train,df_test = train_test_split(df,test_size=0.2,random_state=42)
df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [18]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [19]:
y_full_train = df_full_train.y.values
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [20]:
df_train.drop(columns='y',inplace=True)
df_val.drop(columns='y',inplace=True)
df_test.drop(columns='y',inplace=True)

In [21]:
df.y.isnull().sum()

np.int64(0)

In [22]:
df_train.shape

(27126, 16)

In [23]:
df_val.shape

(9042, 16)

In [24]:
df_test.shape

(9043, 16)

In [25]:
df_full_train[numerical_features].isnull().sum()

age         0
balance     0
day         0
duration    0
campaign    0
pdays       0
previous    0
dtype: int64

In [26]:
df_full_train[numerical_features].nunique()

age           77
balance     6652
day           31
duration    1493
campaign      47
pdays        529
previous      40
dtype: int64

In [27]:
# Correlation matrix
correlation_matrix = df_full_train[numerical_features].corr()
abs_corr_matrix = correlation_matrix.abs()

In [28]:
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.098921,-0.007882,-0.007414,0.00316,-0.023231,0.002397
balance,0.098921,1.0,0.000455,0.02559,-0.018566,0.002122,0.015291
day,-0.007882,0.000455,1.0,-0.025719,0.160599,-0.094405,-0.053229
duration,-0.007414,0.02559,-0.025719,1.0,-0.086526,-0.001179,0.002557
campaign,0.00316,-0.018566,0.160599,-0.086526,1.0,-0.089317,-0.0333
pdays,-0.023231,0.002122,-0.094405,-0.001179,-0.089317,1.0,0.440662
previous,0.002397,0.015291,-0.053229,0.002557,-0.0333,0.440662,1.0


In [29]:
abs_corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.098921,0.007882,0.007414,0.00316,0.023231,0.002397
balance,0.098921,1.0,0.000455,0.02559,0.018566,0.002122,0.015291
day,0.007882,0.000455,1.0,0.025719,0.160599,0.094405,0.053229
duration,0.007414,0.02559,0.025719,1.0,0.086526,0.001179,0.002557
campaign,0.00316,0.018566,0.160599,0.086526,1.0,0.089317,0.0333
pdays,0.023231,0.002122,0.094405,0.001179,0.089317,1.0,0.440662
previous,0.002397,0.015291,0.053229,0.002557,0.0333,0.440662,1.0


In [None]:
np.fill_diagonal(abs_corr_matrix.values, np.nan)

max_corr_value = abs_corr_matrix.max().max()
max_corr_value

np.float64(0.4406621883723763)

In [31]:
max_corr_features = abs_corr_matrix.stack().idxmax()
max_corr_features

('pdays', 'previous')

## Question 3

Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.  
Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
- contact
- education
- housing
- poutcome


In [32]:
from sklearn.metrics import mutual_info_score

In [33]:
categorical_features = list(df.dtypes[df.dtypes == 'object'].index)
categorical_features

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [None]:
df_train[categorical_features].head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,technician,single,tertiary,no,yes,yes,cellular,aug,unknown
1,entrepreneur,married,secondary,no,yes,yes,cellular,nov,unknown
2,blue-collar,married,secondary,no,yes,no,cellular,may,unknown
3,housemaid,married,primary,no,no,no,cellular,aug,unknown
4,self-employed,married,tertiary,no,no,no,cellular,aug,unknown


In [35]:
df_train[categorical_features].nunique()

job          12
marital       3
education     4
default       2
housing       2
loan          2
contact       3
month        12
poutcome      4
dtype: int64

In [36]:
def mutual_score(series):
    return mutual_info_score(series,y_train)

In [37]:
mutual_info = df_train[categorical_features].apply(mutual_score)

In [38]:
mutual_info.sort_values(ascending=False).round(2)

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
loan         0.00
education    0.00
marital      0.00
default      0.00
dtype: float64

## Question 4

Now let's train a logistic regression.  
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.  
Fit the model on the training dataset.  
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:

`model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9


In [39]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [40]:
def logistic_regression(xi):
    score = w0
    
    for j in range(len(w)):
        score = score + xi[j] * w[j]
        
    result = sigmoid(score)
    return result

In [41]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [42]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical_features+numerical_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_features+numerical_features].to_dict(orient='records')
X_val = dv.fit_transform(val_dict)

In [43]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [44]:
model.fit(X_train,y_train)

In [45]:
model.intercept_[0]

np.float64(-0.7116447549319027)

In [46]:
model.coef_[0].round(3)

array([-1.000e-03,  0.000e+00, -7.900e-02,  3.260e-01,  1.260e-01,
       -1.164e+00,  7.000e-03, -4.760e-01, -2.350e-01,  4.000e-03,
       -3.700e-01, -1.660e-01, -1.000e-02, -1.650e-01,  1.900e-02,
       -7.300e-01,  1.130e-01, -2.180e-01, -2.060e-01, -2.740e-01,
       -7.700e-02,  3.420e-01, -2.570e-01, -1.260e-01,  2.510e-01,
       -1.300e-01, -1.000e-03, -1.300e-01, -1.150e-01, -5.960e-01,
       -2.350e-01, -3.860e-01, -9.000e-02,  8.700e-02, -6.710e-01,
        3.060e-01, -2.860e-01, -9.020e-01, -8.600e-01,  2.770e-01,
        1.221e+00, -4.740e-01, -8.610e-01,  7.430e-01,  7.080e-01,
       -1.000e-03, -7.000e-01, -4.850e-01,  1.539e+00, -1.066e+00,
        6.000e-03])

In [47]:
y_pred = model.predict_proba(X_val)[:, 1]

In [48]:
y_decision = (y_pred >= 0.5)

In [59]:
global_accuracy = (y_val == y_decision).mean()
global_accuracy

np.float64(0.9001327140013271)

## Question 5

Let's find the least useful feature using the feature elimination technique.  
Train a model with all these features (using the same parameters as in Q4).  
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.  
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of the following features has the smallest difference?

- age
- balance
- marital
- previous

*Note: The difference doesn't have to be positive.*


In [50]:
from IPython.display import display

In [51]:
global_y = df_full_train.y.mean()

In [61]:
results = []
accuracy_results = []
current_list = categorical_features + numerical_features

print(current_list)

for i,feature in enumerate(current_list):
    new_list = current_list[:i] + current_list[i+1:]

    print(new_list)
    
    train_dict = df_train[new_list].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    print(X_train.shape)

    val_dict = df_val[new_list].to_dict(orient='records')
    X_val = dv.fit_transform(val_dict)
    print(X_val.shape)

    model.fit(X_train,y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    y_decision = (y_pred >= 0.5)
    result = (y_val == y_decision).mean()

    results.append((feature,result))

for removed_feature, result in results:
    accuracy_results.append((removed_feature,global_accuracy - result))
    print(f"Removed feature  {removed_feature}: Accuracy = {global_accuracy - result}")

least_useful_feature = min(accuracy_results, key=lambda x: x[1])
print(least_useful_feature)

['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
['marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


(27126, 39)
(9042, 39)
['job', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
(27126, 48)
(9042, 48)
['job', 'marital', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
(27126, 47)
(9042, 47)
['job', 'marital', 'education', 'housing', 'loan', 'contact', 'month', 'poutcome', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
(27126, 49)
(9042, 49)
['job', 'marital', 'education', 'default', 'loan', 'contact', 'month', 'poutcome', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
(27126, 49)
(9042, 49)
['job', 'marital', 'education', 'default', 'housing', 'contact', 'month', 'poutcome', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
(27126, 49)
(9042, 49)
['job', 'marital', 'education', 'default', 'housing', 'loan', 'month', 'poutcome', 'age', 'balan

## Question 6

Now let's train a regularized logistic regression.  
Let's try the following values of the parameter `C`: \[0.01, 0.1, 1, 10, 100\].  
Train models using all the features as in Q4.  
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

*Note: If there are multiple options, select the smallest `C`.*


In [71]:
c_list = [0.01,0.1,1,10,100]

accuracies = []

for c in c_list:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[categorical_features+numerical_features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[categorical_features+numerical_features].to_dict(orient='records')
    X_val = dv.fit_transform(val_dict)

    model.fit(X_train,y_train)
    y_pred = model.predict_proba(X_val)[:, 1]

    y_decision = (y_pred >= 0.5)

    accuracy = (y_val == y_decision).mean()

    accuracies.append((c,accuracy))

best_Accuracy = min(accuracies, key=lambda x: x[1])
print(best_Accuracy)


(0.01, np.float64(0.898363193983632))
