In [82]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [6]:
data = pd.read_csv('bank-additional-full.csv', sep=';', na_values=['nonexistent', 'unknown', 999])

In [7]:
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,,no,no,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,,0,,1.1,93.994,-36.4,4.857,5191.0,no


In [8]:
data.isna().sum()

age                   0
job                 330
marital              80
education          1731
default            8597
housing             990
loan                990
contact               0
month                 0
day_of_week           0
duration              2
campaign              0
pdays             39673
previous              0
poutcome          35563
emp.var.rate          0
cons.price.idx        0
cons.conf.idx         0
euribor3m             0
nr.employed           0
y                     0
dtype: int64

In [9]:
data.drop(['pdays', 'poutcome'], axis=1, inplace=True)

In [13]:
data.dropna(subset=['job', 'marital', 'education', 'default', 'housing', 'loan', 'duration'], inplace=True)

In [20]:
data = pd.get_dummies(data, drop_first=True)

In [21]:
data.describe()

Unnamed: 0,age,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_blue-collar,...,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,y_yes
count,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,...,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0,30486.0
mean,39.030112,259.435577,2.521452,0.194286,-0.071597,93.523283,-40.602332,3.459843,5160.810211,0.186118,...,0.015811,0.319228,0.114676,0.021059,0.016237,0.205931,0.209768,0.195303,0.200912,0.12655
std,10.333765,261.65429,2.720237,0.522803,1.610416,0.585383,4.789331,1.777251,75.159343,0.389208,...,0.124744,0.466185,0.318635,0.143583,0.126388,0.404387,0.40715,0.39644,0.400689,0.332474
min,17.0,0.0,1.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,31.0,103.0,1.0,0.0,-1.8,93.075,-42.7,1.313,5099.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37.0,180.5,2.0,0.0,1.1,93.444,-41.8,4.856,5191.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,45.0,321.0,3.0,0.0,1.4,93.994,-36.4,4.961,5228.1,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,95.0,4918.0,43.0,7.0,1.4,94.767,-26.9,5.045,5228.1,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)

In [37]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18291, 44), (12195, 44), (18291,), (12195,))

# Gradient Boosting

In [81]:
model_gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, random_state=42)
model_gb.fit(X_train, y_train)
model_gb.score(X_test, y_test)

0.9043050430504305

# Random Forest

In [80]:
model_rf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)

0.8934809348093481

# Decision Tree

In [91]:
model_dt = DecisionTreeClassifier(max_depth=7, random_state=42)
model_dt.fit(X_train, y_train)
model_dt.score(X_test, y_test)

0.8994669946699467

# Standardização

In [93]:
scale = StandardScaler()
X_train_std = scale.fit_transform(X_train)
X_test_std = scale.transform(X_test)

# Gradient Boosting STD

In [107]:
model_gb_std = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, random_state=42)
model_gb_std.fit(X_train_std, y_train)
model_gb_std.score(X_test_std, y_test)

0.9043050430504305

# Random Forest STD

In [101]:
model_rf_std = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
model_rf_std.fit(X_train_std, y_train)
model_rf_std.score(X_test_std, y_test)

0.8935629356293563

# Decision Tree STD

In [99]:
model_dt_std = DecisionTreeClassifier(max_depth=7, random_state=42)
model_dt_std.fit(X_train_std, y_train)
model_dt_std.score(X_test_std, y_test)

0.8994669946699467