In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
bank =  pd.read_csv('bank.csv', sep=';')

In [5]:
len(bank)

4521

In [6]:
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [8]:
bank.shape

(4521, 17)

In [12]:
bank['y'].value_counts()

no     4000
yes     521
Name: y, dtype: int64

In [14]:
# get the percentage of yes(people that subscribed)
100 - (4000/4521)*100

11.523999115239988

In [20]:
# check if there are any missing values(NaN)
pd.isnull(bank).sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [52]:
newbank = bank.copy()

In [53]:
def change(data, column):
    new = []
    for value in data[column]:
        if value == 'yes':
            new.append(1)
        else:
            new.append(0)
    return new

In [54]:
# change 'yes, no' to '1, 0'
newbank['y'] = change(newbank, 'y')

In [55]:
newbank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,0


In [56]:
newbank.corr()['y']

age         0.045092
balance     0.017905
day        -0.011244
duration    0.401118
campaign   -0.061147
pdays       0.104087
previous    0.116714
y           1.000000
Name: y, dtype: float64

In [57]:
def oneHotEncoding(data, col):
    return pd.get_dummies(data, columns=col)

In [58]:
newbank = oneHotEncoding(newbank, ['job', 'marital', 'education', 'contact', 'month', 'poutcome', 'month'])

In [59]:
newbank['default'] = change(newbank, 'default')
newbank['housing'] = change(newbank, 'housing')
newbank['loan'] = change(newbank, 'loan')

In [61]:
newbank.shape

(4521, 61)

In [63]:
newbank.corr()['y']

age                    0.045092
default                0.001303
balance                0.017905
housing               -0.104683
loan                  -0.070517
day                   -0.011244
duration               0.401118
campaign              -0.061147
pdays                  0.104087
previous               0.116714
y                      1.000000
job_admin.             0.006568
job_blue-collar       -0.068147
job_entrepreneur      -0.015968
job_housemaid          0.004872
job_management         0.032634
job_retired            0.086675
job_self-employed     -0.003827
job_services          -0.024071
job_student            0.047809
job_technician        -0.010154
job_unemployed        -0.007312
job_unknown            0.019886
marital_divorced       0.034840
marital_married       -0.064643
marital_single         0.045815
education_primary     -0.027420
education_secondary   -0.028744
education_tertiary     0.056649
education_unknown     -0.008870
                         ...   
contact_

In [64]:
bank1 = newbank.copy()

In [65]:
newbank = newbank.drop('job_unknown', axis=1)
newbank = newbank.drop('education_unknown', axis=1)
newbank = newbank.drop('contact_unknown', axis=1)
newbank = newbank.drop('poutcome_unknown', axis=1)

In [67]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

###### split the data in a stratified manner cos of the target imbalance

In [70]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(newbank, newbank['y']):
    realtrain = newbank.loc[train_index]
    realtest = newbank.loc[test_index]



In [71]:
realtrain.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
2236,32,1,-18,1,0,12,155,3,-1,0,...,0,0,0,0,0,0,1,0,0,0
858,34,0,144,1,1,14,104,1,85,19,...,0,0,0,0,0,0,0,0,1,0
3531,44,0,0,0,0,22,54,2,-1,0,...,0,0,0,0,0,0,0,0,0,0
2737,34,0,8309,1,1,19,50,1,-1,0,...,0,0,0,0,0,0,0,1,0,0
1257,51,0,5050,0,1,16,75,7,-1,0,...,0,0,0,0,1,0,0,0,0,0


In [84]:
realtrain.shape

(3616, 57)

In [85]:
realtrain['y'].value_counts()

0    3199
1     417
Name: y, dtype: int64

In [72]:
X = realtrain.drop('y', axis=1)
y = realtrain['y']

In [97]:
col_name = X.columns

In [75]:
scaler = StandardScaler()
train_predictor = scaler.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


# BASELINE MODEL

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [101]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score

In [77]:
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
knn = KNeighborsClassifier(n_neighbors=3)
log_reg = LogisticRegression()

In [78]:
y_pred_forest = cross_val_predict(rfc, X, y, cv=3)
y_pred_tree = cross_val_predict(dtc, X, y, cv=3)

y_pred_reg = cross_val_predict(log_reg, X, y, cv=3)
y_pred_knn = cross_val_predict(knn, X, y, cv=3)



In [102]:
def my_model(ytrue, ypred):
    print(confusion_matrix(ytrue, ypred))
    print('precision: ', precision_score(ytrue, ypred))
    print("recall:", recall_score(ytrue, ypred))
    print("f1 score:", f1_score(ytrue, ypred))
    

In [103]:
my_model(y, y_pred_forest)

MATRIX:  [[3135   64]
 [ 317  100]]
precision:  0.6097560975609756
recall: 0.23980815347721823
f1 score: 0.34423407917383825


In [104]:
my_model(y, y_pred_tree)

MATRIX:  [[2942  257]
 [ 236  181]]
precision:  0.4132420091324201
recall: 0.434052757793765
f1 score: 0.42339181286549704


In [105]:
my_model(y, y_pred_reg)

MATRIX:  [[3121   78]
 [ 279  138]]
precision:  0.6388888888888888
recall: 0.33093525179856115
f1 score: 0.43601895734597157


In [106]:
my_model(y, y_pred_knn)

MATRIX:  [[3046  153]
 [ 337   80]]
precision:  0.34334763948497854
recall: 0.19184652278177458
f1 score: 0.24615384615384617


# GET NICE ATTRIBUTES

In [93]:
dtc.fit(X, y)
rfc.fit(X, y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [100]:
sorted(zip(col_name, dtc.feature_importances_), reverse=True)

[('previous', 0.025153493243931856),
 ('poutcome_success', 0.08642904775928416),
 ('poutcome_other', 0.0024953066818524322),
 ('poutcome_failure', 0.0),
 ('pdays', 0.04971547378254997),
 ('month_sep', 0.00480388759075785),
 ('month_sep', 0.0),
 ('month_oct', 0.01585860501773397),
 ('month_oct', 0.006727854490896258),
 ('month_nov', 0.0034851589991348143),
 ('month_nov', 0.0016069728583306554),
 ('month_may', 0.0038522343587131824),
 ('month_may', 0.002478335288273647),
 ('month_mar', 0.01312161908184366),
 ('month_mar', 0.0),
 ('month_jun', 0.005587047942293672),
 ('month_jun', 0.0),
 ('month_jul', 0.005679157306127066),
 ('month_jul', 0.0),
 ('month_jan', 0.0024155710250626174),
 ('month_jan', 0.0020330094161619757),
 ('month_feb', 0.01425921978275247),
 ('month_feb', 0.005683861698760702),
 ('month_dec', 0.0),
 ('month_dec', 0.0),
 ('month_aug', 0.0032390830351976324),
 ('month_aug', 0.0008100880432216282),
 ('month_apr', 0.0032758530285159813),
 ('month_apr', 0.002709695259546652),


In [99]:
sorted(zip(col_name, rfc.feature_importances_), reverse=True)

[('previous', 0.023734660225585947),
 ('poutcome_success', 0.04886757065477717),
 ('poutcome_other', 0.004763296338170484),
 ('poutcome_failure', 0.008612556804342536),
 ('pdays', 0.037164035832040365),
 ('month_sep', 0.004166109332202513),
 ('month_sep', 0.0038152649024357243),
 ('month_oct', 0.01054411904179867),
 ('month_oct', 0.009859989143790814),
 ('month_nov', 0.00696765759095207),
 ('month_nov', 0.006542764159213311),
 ('month_may', 0.008873057291518778),
 ('month_may', 0.006064911903408782),
 ('month_mar', 0.009608749676825363),
 ('month_mar', 0.005650546004779609),
 ('month_jun', 0.007723601227012418),
 ('month_jun', 0.0034183101901524154),
 ('month_jul', 0.00665499365328404),
 ('month_jul', 0.004241538046198751),
 ('month_jan', 0.002330944931820771),
 ('month_jan', 0.0009668662510669467),
 ('month_feb', 0.006413935320280331),
 ('month_feb', 0.0038313348371586294),
 ('month_dec', 0.0018554896709968131),
 ('month_dec', 0.000952710813544856),
 ('month_aug', 0.010916436974775701

In [None]:
def change(data, column):
    new = []
    for value in data[column]:
        if value == 'yes':
            new.append(1)
        else:
            new.append(0)
    return new