In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("data/bank-full.csv", delimiter=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
df.shape

(45211, 17)

In [4]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [5]:
cols_of_interest = ['age', 'job', 'marital', 'education', 'balance', 'housing',
      'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y']
df = df[cols_of_interest]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:

df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [7]:
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
# -> No missing values

In [9]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [10]:
df.education.mode()

0    secondary
Name: education, dtype: object

```
A1:
Mode: Secondary
```

In [11]:
# Correlation

In [12]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [13]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [14]:
df[numerical].corr()
        

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


```
A2:
   pdays & previous
```

In [15]:
df['y'].value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [16]:
df['y'] = (df['y'] == 'yes').astype(int)

In [17]:
df['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

In [18]:
# Split data

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
df_full_train, df_test = train_test_split(df, test_size=.2, random_state=42)

In [21]:
df_full_train.shape, df_test.shape

((36168, 15), (9043, 15))

In [22]:
df_train, df_val = train_test_split(df_full_train, test_size=20/80, random_state=42)

In [23]:
df_train.shape, df_val.shape

((27126, 15), (9042, 15))

In [24]:
y_train = df_train.pop('y')
y_val = df_val.pop('y')
y_test = df_test.pop('y')
y_full_train = df_full_train.pop('y')
y_train.shape, y_val.shape, y_test.shape, y_full_train.shape

((27126,), (9042,), (9043,), (36168,))

In [25]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [26]:
y_full_train = y_full_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [27]:
len(df_test), len(y_test)

(9043, 9043)

In [28]:
# Q3


In [30]:
df_train.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
dtype: object

In [31]:
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [32]:
df_train[categorical].nunique()

job          12
marital       3
education     4
housing       2
contact       3
month        12
poutcome      4
dtype: int64

In [33]:
df_train[categorical].head()

Unnamed: 0,job,marital,education,housing,contact,month,poutcome
0,technician,single,tertiary,yes,cellular,aug,unknown
1,entrepreneur,married,secondary,yes,cellular,nov,unknown
2,blue-collar,married,secondary,yes,cellular,may,unknown
3,housemaid,married,primary,no,cellular,aug,unknown
4,self-employed,married,tertiary,no,cellular,aug,unknown


In [34]:
from sklearn.metrics import mutual_info_score
from IPython.display import display


In [35]:
for c in categorical:
    score = mutual_info_score(df_train[c], y_train)
    print(f"{c}: {score} => {round(score, 2)}")

job: 0.007316082778474635 => 0.01
marital: 0.0020495925927810216 => 0.0
education: 0.0026967549991295282 => 0.0
housing: 0.010343105891750026 => 0.01
contact: 0.013356062198247219 => 0.01
month: 0.025090033443650246 => 0.03
poutcome: 0.029532821290436224 => 0.03


```
A3:
    poutcome
```

In [36]:
# Q4

In [37]:
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [38]:
train_dicts = df_train.to_dict(orient='records')
train_dicts[:2]

[{'age': 32,
  'job': 'technician',
  'marital': 'single',
  'education': 'tertiary',
  'balance': 1100,
  'housing': 'yes',
  'contact': 'cellular',
  'day': 11,
  'month': 'aug',
  'duration': 67,
  'campaign': 1,
  'pdays': -1,
  'previous': 0,
  'poutcome': 'unknown'},
 {'age': 38,
  'job': 'entrepreneur',
  'marital': 'married',
  'education': 'secondary',
  'balance': 0,
  'housing': 'yes',
  'contact': 'cellular',
  'day': 17,
  'month': 'nov',
  'duration': 258,
  'campaign': 1,
  'pdays': -1,
  'previous': 0,
  'poutcome': 'unknown'}]

In [39]:
from sklearn.feature_extraction import DictVectorizer

In [40]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [41]:
X_train[1].round(2)

array([ 38.,   0.,   1.,   1.,   0.,   0.,  17., 258.,   0.,   1.,   0.,
         0.,   0.,   1.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,  -1.,   0.,   0.,
         0.,   1.,   0.])

In [42]:
# LR

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [45]:
model.fit(X_train, y_train)

In [46]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [47]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.01294119, 0.00972959, 0.15344079, ..., 0.05244611, 0.00909062,
       0.28165388])

In [48]:
td_decision = (y_pred >= 0.5)

In [49]:
accuracy = (y_val == td_decision).mean()
accuracy, round(accuracy, 2)

(np.float64(0.9007962840079629), np.float64(0.9))

```
A4:
  Accuracy: 0.9
```

In [52]:
# Q5

In [53]:
e_cols = cols_of_interest.copy()
e_cols.remove('y')
e_cols

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [54]:
my_train_dicts = df_full_train[e_cols].to_dict(orient='records')

my_dv = DictVectorizer(sparse=False)
X_tr = my_dv.fit_transform(my_train_dicts)

my_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
my_model.fit(X_tr, y_full_train)

my_test_dicts = df_test[e_cols].to_dict(orient='records')
X_te = my_dv.transform(my_test_dicts)
len(X_te)

my_y_pred = my_model.predict_proba(X_te)[:, 1]
len(my_y_pred)

my_td_decision = (my_y_pred >= 0.5)
original_accuracy = (y_test == my_td_decision).mean()
original_accuracy

np.float64(0.899037929890523)

In [55]:
len(y_test)

9043

In [56]:
len(df_test)

9043

In [57]:
for col in e_cols:
    my_cols = [c for c in e_cols if c != col]
    my_train_dicts = df_full_train[my_cols].to_dict(orient='records')
    
    my_dv = DictVectorizer(sparse=False)
    X_tr = my_dv.fit_transform(my_train_dicts)
    
    my_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    my_model.fit(X_tr, y_full_train)
    
    my_test_dicts = df_test[my_cols].to_dict(orient='records')
    X_te = my_dv.transform(my_test_dicts)
    
    my_y_pred = my_model.predict_proba(X_te)[:, 1]
    my_td_decision = (my_y_pred >= 0.5)
    my_accuracy = (y_test == my_td_decision).mean()
    
    print(f"Accuracy: {my_accuracy}, Rounded: {round(my_accuracy, 2)}")
    print(f"Col: {col}  Accuracy diff: {orig_accuracy - my_accuracy}")
    print()

Accuracy: 0.8987061815769103, Rounded: 0.9
Col: age  Accuracy diff: 0.002090102431052565

Accuracy: 0.8982638504920933, Rounded: 0.9
Col: job  Accuracy diff: 0.0025324335158695144

Accuracy: 0.8993696782041358, Rounded: 0.9
Col: marital  Accuracy diff: 0.0014266058038270302

Accuracy: 0.8995908437465443, Rounded: 0.9
Col: education  Accuracy diff: 0.0012054402614185555

Accuracy: 0.8993696782041358, Rounded: 0.9
Col: balance  Accuracy diff: 0.0014266058038270302

Accuracy: 0.8981532677208891, Rounded: 0.9
Col: housing  Accuracy diff: 0.0026430162870737517

Accuracy: 0.898595598805706, Rounded: 0.9
Col: contact  Accuracy diff: 0.0022006852022568024

Accuracy: 0.8991485126617274, Rounded: 0.9
Col: day  Accuracy diff: 0.0016477713462355048

Accuracy: 0.8988167643481145, Rounded: 0.9
Col: month  Accuracy diff: 0.001979519659848328

Accuracy: 0.8909653875926131, Rounded: 0.89
Col: duration  Accuracy diff: 0.009830896415349732

Accuracy: 0.898595598805706, Rounded: 0.9
Col: campaign  Accurac

In [565]:
cols = ['balance', 'marital']
df_full_train[cols].dtypes

balance     int64
marital    object
dtype: object

In [566]:
df_full_train[cols].head()

Unnamed: 0,balance,marital
0,849,married
1,1415,married
2,3842,married
3,-119,single
4,3498,married


In [567]:
df_full_train.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome'],
      dtype='object')

In [58]:
# Q6

In [59]:
regs = [.01, .1, 1, 10, 100]

In [60]:
for r in regs:
    my_train_dicts = df_full_train[e_cols].to_dict(orient='records')
    
    my_dv = DictVectorizer(sparse=False)
    X_tr = my_dv.fit_transform(my_train_dicts)
    
    my_model = LogisticRegression(solver='liblinear', C=r, max_iter=1000, random_state=42)
    my_model.fit(X_tr, y_full_train)
    
    my_test_dicts = df_test[e_cols].to_dict(orient='records')
    X_te = my_dv.transform(my_test_dicts)
    
    my_y_pred = my_model.predict_proba(X_te)[:, 1]
    my_td_decision = (my_y_pred >= 0.5)
    my_accuracy = (y_test == my_td_decision).mean()
    
    print(f"Reg: {r} => Accuracy: {my_accuracy}")
    print()

Reg: 0.01 => Accuracy: 0.8969368572376424

Reg: 0.1 => Accuracy: 0.8991485126617274

Reg: 1 => Accuracy: 0.899037929890523

Reg: 10 => Accuracy: 0.8988167643481145

Reg: 100 => Accuracy: 0.8989273471193188



```
Reg : 0.1
```