In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Preparation
- wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

In [3]:
pip install wget

Note: you may need to restart the kernel to use updated packages.


In [4]:
# URL of the zipped Bank Marketing dataset on the UCI archive.
data = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"

In [5]:
from pathlib import Path
from urllib.request import urlretrieve

# Download with the standard library instead of shelling out to `python -m wget`:
# the `python` found on PATH is not guaranteed to be the kernel's interpreter.
# The existence check keeps the cell idempotent, so re-running the notebook
# does not fetch (and store) another copy of the archive.
archive_path = Path('bank+marketing.zip')
if not archive_path.exists():
    urlretrieve(data, archive_path)


Saved under bank+marketing.zip


In [6]:
from zipfile import ZipFile

# The download is an archive of archives: `bank.zip` lives *inside*
# `bank+marketing.zip`, so it has to be pulled out of the outer archive
# before it can be opened from disk.
with ZipFile('bank+marketing.zip', 'r') as outer_archive:
    outer_archive.extract('bank.zip')

# Unpack the inner archive into the working directory.
with ZipFile('bank.zip', 'r') as inner_archive:
    inner_archive.extractall()

In [7]:
# The file is semicolon-delimited rather than comma-delimited.
df = pd.read_csv('bank-full.csv', delimiter=';')

## Question 1

What is the most frequent observation (mode) for the column `education`?
- unknown
- primary
- secondary
- tertiary

In [9]:
# Frequency of each education level.
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [10]:
# Most frequent education level.
df['education'].mode()

0    secondary
Name: education, dtype: object

## Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?
- age and balance
- day and campaign
- day and pdays
- pdays and previous


Cleaning data

In [13]:
# Clean dataset: lowercase everything and turn spaces into underscores,
# first for the column labels and then for every object-typed column.
def _snake(text_values):
    """Lowercase string values and replace spaces with underscores."""
    return text_values.str.lower().str.replace(' ', '_')


df.columns = _snake(df.columns)

categorical_columns = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]

for name in categorical_columns:
    df[name] = _snake(df[name])

In [14]:
# Column labels as they stand at this point in the notebook.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [15]:
# Null value count

In [16]:
# Missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [17]:
# Names of all columns stored as int64.
numerical_features = [
    name for name, dtype in df.dtypes.items() if dtype == 'int64'
]
numerical_features

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [18]:
# Class balance of the target column.
df['y'].value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [19]:
# Encode the target as integers: 'yes' -> 1, everything else -> 0.
# The guard makes the cell safe to re-run: once `y` is numeric, comparing it
# with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype('int')
df['y']

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int32

Splitting data

In [21]:
# Splitting dataset
from sklearn.model_selection import train_test_split

In [22]:
# Two-stage split with a fixed seed: hold out 20% of the rows for testing,
# then take 25% of the remaining 80% (i.e. 20% of all rows) for validation.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [23]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original one.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [24]:
# Pull the target out of every split as a plain array.
y_full_train, y_train, y_val, y_test = (
    split['y'].values
    for split in (df_full_train, df_train, df_val, df_test)
)

In [25]:
# Remove the target column from the train/validation/test feature frames
# (in place, so the existing variables keep pointing at the same objects).
for _split in (df_train, df_val, df_test):
    _split.drop(columns='y', inplace=True)

In [26]:
# Number of missing values in the target column.
df['y'].isna().sum()

0

In [27]:
# Missing values per numerical column in the full-train split.
df_full_train[numerical_features].isna().sum()

age         0
balance     0
day         0
duration    0
campaign    0
pdays       0
previous    0
dtype: int64

In [28]:
# Number of distinct values in each numerical column of the full-train split.
df_full_train[numerical_features].nunique()

age           77
balance     6652
day           31
duration    1493
campaign      47
pdays        529
previous      40
dtype: int64

In [38]:
# Correlation matrix
correlation_matrix = df_full_train[numerical_features].corr()
abs_corr_matrix = correlation_matrix.abs()

In [44]:
# Display the signed correlation matrix.
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.098921,-0.007882,-0.007414,0.00316,-0.023231,0.002397
balance,0.098921,1.0,0.000455,0.02559,-0.018566,0.002122,0.015291
day,-0.007882,0.000455,1.0,-0.025719,0.160599,-0.094405,-0.053229
duration,-0.007414,0.02559,-0.025719,1.0,-0.086526,-0.001179,0.002557
campaign,0.00316,-0.018566,0.160599,-0.086526,1.0,-0.089317,-0.0333
pdays,-0.023231,0.002122,-0.094405,-0.001179,-0.089317,1.0,0.440662
previous,0.002397,0.015291,-0.053229,0.002557,-0.0333,0.440662,1.0


In [46]:
# Display the absolute-value correlation matrix.
abs_corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.098921,0.007882,0.007414,0.00316,0.023231,0.002397
balance,0.098921,1.0,0.000455,0.02559,0.018566,0.002122,0.015291
day,0.007882,0.000455,1.0,0.025719,0.160599,0.094405,0.053229
duration,0.007414,0.02559,0.025719,1.0,0.086526,0.001179,0.002557
campaign,0.00316,0.018566,0.160599,0.086526,1.0,0.089317,0.0333
pdays,0.023231,0.002122,0.094405,0.001179,0.089317,1.0,0.440662
previous,0.002397,0.015291,0.053229,0.002557,0.0333,0.440662,1.0


In [50]:
# Blank out the diagonal (each feature's correlation with itself is 1.0) so it
# cannot win the maximum. `.mask` builds a new frame instead of writing through
# `abs_corr_matrix.values`: under pandas Copy-on-Write that array is read-only,
# so an in-place `np.fill_diagonal` on it raises.
diagonal = np.eye(len(abs_corr_matrix), dtype=bool)
abs_corr_matrix = abs_corr_matrix.mask(diagonal)

# Largest remaining absolute correlation; NaN entries are skipped by `max`.
max_corr_value = abs_corr_matrix.max().max()
max_corr_value

0.4406621883723763

In [54]:
# Flatten the matrix to a Series keyed by (row feature, column feature) and
# take the key of its largest entry.
pairwise_abs_corr = abs_corr_matrix.stack()
max_corr_features = pairwise_abs_corr.idxmax()
max_corr_features

('pdays', 'previous')

## Question 3

Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.  
Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
- contact
- education
- housing
- poutcome


In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
# Names of all columns that are still object-typed at this point.
categorical_features = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]
categorical_features

In [None]:
# Preview the first rows of the categorical columns in the full-train split.
df_full_train[categorical_features].head()

In [None]:
# Number of distinct values in each categorical column of the full-train split.
df_full_train[categorical_features].nunique()

In [None]:
def mutual_score(series):
    """Return the mutual information between `series` and the global `y_train`.

    `series` is passed as the first argument of `mutual_info_score` and the
    module-level `y_train` array as the second.
    """
    score = mutual_info_score(series, y_train)
    return score

In [None]:
# Score every categorical column of the training split against the target.
train_categoricals = df_train[categorical_features]
mutual_info = train_categoricals.apply(mutual_score)

In [None]:
# Rank the scores from highest to lowest, then round to two decimals.
ranked_mutual_info = mutual_info.sort_values(ascending=False)
ranked_mutual_info.round(2)