In [3]:
import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
from copy import deepcopy
from collections import OrderedDict

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import scikitplot as skplt
%matplotlib inline

# Notebook-wide pandas display settings.
# Show up to 1000 columns/rows before pandas truncates the rendered frame.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
# Use the fully-qualified key: the bare 'precision' pattern is ambiguous on
# pandas versions that also define 'styler.format.precision', and set_option
# raises OptionError when a pattern matches more than one option.
pd.set_option('display.precision', 2)
# Render floats with exactly three decimals in displayed frames.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline

In [4]:
lst = ['marital', 'education', 'job', 'contact', 'poutcome']


def make_dummies(df, lst):
    """One-hot encode every text column of ``df`` and split off the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Must contain a column named ``'y'`` (the target).
        The frame itself is not modified.
    lst : list
        Kept only so existing callers keep working. It is NOT used to choose
        the columns to encode: every object/string-typed feature column is
        encoded, exactly as before.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame (non-text columns in their original order, followed
        by the dummy columns of each text column in original column order)
        and the ``'y'`` column.

    Raises
    ------
    KeyError
        If ``df`` has no ``'y'`` column.

    Notes
    -----
    For each encoded column the last dummy level is dropped, so a column with
    ``k`` distinct values contributes ``k - 1`` indicator columns.
    """
    Y = df['y']
    # drop() returns a new frame, so the caller's data is never mutated.
    features = df.drop(columns=['y'])

    # Decide up front which columns to encode instead of dropping columns from
    # the frame while iterating over it. String-dtype columns are included
    # alongside object-dtype ones so text columns are encoded regardless of
    # how pandas inferred them.
    text_cols = [
        col for col in features.columns
        if pd.api.types.is_object_dtype(features[col])
        or pd.api.types.is_string_dtype(features[col])
    ]

    encoded = []
    for col in text_cols:
        dummies = pd.get_dummies(features[col], prefix=col)
        # Drop the last level; it is implied by the remaining indicators.
        encoded.append(dummies.iloc[:, :-1])

    # Single concat: non-text columns first, then each column's dummies.
    result = pd.concat([features.drop(columns=text_cols)] + encoded, axis=1)
    return result, Y

In [5]:
# Load both semicolon-delimited CSV files into their own frames.
df_bankFull, df_bank = (
    pd.read_csv(csv_path, delimiter=";")
    for csv_path in ("bank-full.csv", "bank.csv")
)

#### Standardizing Numerical Variables

- Rescaling the numeric features to a common range (the cell below uses min-max scaling to [0, 1]) is essential for clustering algorithms and dimension reduction.
- It is not necessary for tree-based algorithms.

In [6]:
# Work on a copy so the frame loaded from disk keeps its raw values.
dfBankFull = deepcopy(df_bankFull)
cols_to_norm = ['age', 'balance', 'duration', 'day','campaign', 'pdays','previous']

# Min-max scale each listed column: (value - column min) / (column max - column min).
_col_min = dfBankFull[cols_to_norm].min()
_col_span = dfBankFull[cols_to_norm].max() - _col_min
dfBankFull[cols_to_norm] = (dfBankFull[cols_to_norm] - _col_min) / _col_span

In [7]:
# Encode the scaled frame into a feature matrix and pull out the target column.
mainDF, y = make_dummies(df=dfBankFull, lst=lst)

#### Downsampling Majority Class:

- The cell directly below shows the large imbalance between the two classes we are trying to predict. We need to correct this so that the models we train are not biased toward the majority class.

In [8]:
# Count the rows in each target class before any rebalancing is applied.
y.value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [9]:
from sklearn.utils import resample

In [10]:
# Re-attach the target so features and labels are filtered together.
frames = [mainDF, y]
df = pd.concat(frames, axis=1)
df_maj = df[df.y == 'no']
df_min = df[df.y == 'yes']

# Draw, without replacement, exactly as many 'no' rows as there are 'yes' rows.
# The sample size is derived from the data rather than hard-coded, so the
# classes stay balanced if the input file or upstream filtering changes.
df_majority_downsampled = resample(df_maj, replace=False,
                                   n_samples=len(df_min),
                                   random_state=101)

# Stack the downsampled 'no' rows on top of all 'yes' rows.
frames_resampled = [df_majority_downsampled, df_min]
df_resampled = pd.concat(frames_resampled)

In [11]:
# Show the per-class row counts of the resampled frame.
df_resampled['y'].value_counts()

no     5289
yes    5289
Name: y, dtype: int64

In [12]:
# Hold out 30% of the balanced data for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    df_resampled.drop(columns=['y']),
    df_resampled['y'],
    test_size=.3,
    random_state=101,
)

# XGBoost

In [39]:
# Fit an XGBoost classifier, constructed with no arguments, on the training split.
# NOTE(review): y_train carries the string labels 'no'/'yes'. Newer XGBoost
# releases may require integer-encoded class labels — confirm against the
# installed version. LabelEncoder is imported at the top of the notebook but
# is not applied anywhere in the cells shown here.
model = XGBClassifier()
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eta=0.001, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [40]:
# Predict labels for the held-out split, then report the fraction predicted correctly.
y_pred = model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(test_accuracy)

0.8509766855702583


  if diff:
