In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [35]:
data = pd.read_csv('bank-additional-full.csv', sep=';', na_values=['nonexistent', 'unknown', 999])

In [36]:
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,,no,no,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no


In [37]:
data.isna().sum()

age                   0
job                 330
marital              80
education          1731
default            8597
housing             990
loan                990
contact               0
month                 0
day_of_week           0
duration              2
campaign              0
pdays             39673
previous              0
poutcome          35563
emp.var.rate          0
cons.price.idx        0
cons.conf.idx         0
euribor3m             0
nr.employed           0
y                     0
dtype: int64

In [38]:
data.drop(['pdays', 'poutcome'], axis=1, inplace=True)

In [39]:
impute = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [40]:
categoricos = ['job', 'marital', 'education', 'default', 'housing', 'loan']

In [41]:
data[categoricos] = impute.fit_transform(data[categoricos])

In [42]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [44]:
data['duration'] = imp_mean.fit_transform(data[['duration']])

In [45]:
data.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
previous          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [46]:
data.education.unique()

array(['basic.4y', 'high.school', 'basic.6y', 'basic.9y',
       'professional.course', 'university.degree', 'illiterate'],
      dtype=object)

In [47]:
edu_map = {'illiterate':0, 'basic.4y':1, 'basic.6y':2, 'basic.9y':3, 'high.school':4, 'professional.course':5, 'university.degree':6}
data['education'] = data['education'].map(edu_map)

In [49]:
data = pd.get_dummies(data, drop_first=True)

In [51]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)

In [53]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((24712, 39), (16476, 39), (24712,), (16476,))

# Gradient Boosting

In [54]:
model_gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, random_state=42)
model_gb.fit(X_train, y_train)
model_gb.score(X_test, y_test)

0.9156348628307842

# Random Forest

In [55]:
model_rf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)

0.9068948773974266

# Decision Tree

In [56]:
model_dt = DecisionTreeClassifier(max_depth=7, random_state=42)
model_dt.fit(X_train, y_train)
model_dt.score(X_test, y_test)

0.9090191794124788

In [57]:
scale = StandardScaler()
X_train_std = scale.fit_transform(X_train)
X_test_std = scale.transform(X_test)

# Gradient Boosting STD

In [58]:
model_gb_std = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, random_state=42)
model_gb_std.fit(X_train_std, y_train)
model_gb_std.score(X_test_std, y_test)

0.9156955571740714

# Random Forest STD

In [59]:
model_rf_std = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
model_rf_std.fit(X_train_std, y_train)
model_rf_std.score(X_test_std, y_test)

0.9068948773974266

# Decision Tree STD

In [60]:
model_dt_std = DecisionTreeClassifier(max_depth=7, random_state=42)
model_dt_std.fit(X_train_std, y_train)
model_dt_std.score(X_test_std, y_test)

0.9091405680990532