### Packages and dataset

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import skew,boxcox,yeojohnson
import sklearn
from sklearn.preprocessing import LabelEncoder
import category_encoders
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split,cross_val_score,KFold,GridSearchCV,cross_validate
from sklearn.preprocessing import StandardScaler
import time
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier


In [4]:
#Reading csv
df = pd.read_csv("bank_marketing_weka_dataset.csv")

#Separate features
X = df.drop(['y'], axis=1)

#Target variable
y = df['y']

#Transform 'y' to binary
y = y.map(dict(yes=1, no=0))

In [5]:
#Features initial state
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79.0,1,-1.0,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220.0,1,339.0,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185.0,1,330.0,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199.0,4,-1.0,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226.0,1,-1.0,0,unknown


## Target variable balance

In [6]:
y.value_counts()

0    4000
1     521
Name: y, dtype: int64

# Features engineering

## Continuous variables

In [7]:
conti = X.select_dtypes(include=np.number)
skewed = []
for column in conti.columns:
    if conti[column].skew()>2:
        skewed.append(column)
    print(f'{column.capitalize()} skew = {conti[column].skew()}')

Age skew = 0.6995014533559305
Balance skew = 6.596430546295735
Day skew = 0.09462690455175846
Duration skew = 2.7724197169136455
Campaign skew = 4.74391446894159
Pdays skew = 2.71707136656418
Previous skew = 5.875258677896706


It is known that ML algorithms tend to perform worse if they have to deal with skewed data. Here we arbitrally have chosen columns with skewness above 2 and transform them, to reduce that parameter.

In [8]:
for column in skewed:
    if X[column].min()>0:
        print(pd.Series(boxcox(X[column])[0]).skew(),f'{column} boxcox')
        X[column] = pd.Series(boxcox(X[column])[0])
    else:
        print(pd.Series(yeojohnson(X[column])[0]).skew(),f'{column} yeojohnson')
        X[column] = pd.Series(yeojohnson(X[column])[0])

1.580521502422124 balance yeojohnson
0.015631020933611555 duration boxcox
0.16509037675445096 campaign boxcox
1.6651521750550364 pdays yeojohnson
1.664879334841969 previous yeojohnson


Ideally we would like to use boxcox transormation for all our chosen data, but this type of transormation does not work with negative values. Instead of trying to add some constant to these columns with non-positive values we decided to use yeojohnson transormation which is capable of dealing with negative data. As we can see in all of our columns we have managed to reduce skewness massively.

## Categorical variables

In [9]:
#List of categorical variables
X.select_dtypes(include='object').columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

### Binary values
Variables: 'default', 'housing', 'loan'\
Values: 'yes', 'no'\
Method: map to yes=1, no=0

In [10]:
X = X.replace('yes', 1)
X = X.replace('no', 0)

### Month variable
Values: string abbreviations\
Method: transform to numerical

In [11]:
X.month = X.month.map(dict(
jan=1, feb=2, mar=3, apr=4, may=5, jun=6,
jul=7, aug=8, sep=9, oct=10, nov=11, dec=12))

### Variables: 'job', 'marital', 'education', 'poutcom', 'contact'

Thanks to EDA we know that:

- 'unknown' values are present
- 'job', 'education', 'poutcom' seem to have impact on target variable
- values distributions have no dangerous disparities (in terms of using the target encoding method)

Hence, we decide to:

 - leave the 'unknown' values, as it represents best the idea of the missing information in our case
 - apply target encoding method to variables related with target variable
 - apply one-hot encoding to other variables
 
Expecting target encoding to reflect the relationship of variables with target variable - where it is present according to EDA - and one-hot method to neutrally pass on values of other variables.\
\
For second approach to compere results we decide to apply target encoding for all of these features.

In [12]:
#Approach 1
X1 = X.copy()

#Target encoding
te = TargetEncoder()
X1.loc[:, ['job', 'education', 'poutcome']] = te.fit_transform(X1.loc[:, ['job', 'education', 'poutcome']], y)

#One-hot encoding
X1 = pd.get_dummies(X1, columns=['marital', 'contact'])
#cols = X.columns

In [13]:
#Approach 2
X2 = X.copy()

#Target encoding for all variables
te2 = TargetEncoder()
cols_to_te = ['job', 'education', 'poutcome', 'marital', 'contact']
X2.loc[:, cols_to_te] = te2.fit_transform(X2.loc[:, cols_to_te], y)

Scaling the data just for sake we might use alghoritm that require standarized input.

In [14]:
scaler = StandardScaler()
scaler2 = StandardScaler()
X1 = scaler.fit_transform(X1)
X2 = scaler2.fit_transform(X2)

# Modeling

In [17]:
def modeling(X,y):
    
    #Train-test split
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    #cv folds
    kfold = KFold(n_splits=5, random_state=42)
    
    #cv scores
    scoring = {'accuracy': 'accuracy',
               'recall': 'recall_macro',
               'auc': 'roc_auc',}
    
    #Logistic regression
    logistic = LogisticRegressionCV()
    #cross_val_score(logistic,X_train,y_train,scoring='roc_auc',cv=kfold).mean()
    r_log = cross_validate(logistic,X_train,y_train,scoring=scoring, cv=kfold)
    
    #xgb
    params = {'colsample_bytree': 0.8, 'gamma': 1.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8,'n_estimators':1000}
    xg = xgb.XGBClassifier(objective='binary:logistic',nthread=4,n_jobs=4,params=params)
    #kfold = KFold(n_splits=5, random_state=42)
    #cross_val_score(xg, X_train, y_train, cv=kfold,scoring='roc_auc').mean()
    r_xgb = cross_validate(xg, X_train, y_train, cv=kfold, scoring=scoring)
    
    #random forest
    params = {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 2000}
    tree = RandomForestClassifier(**params)
    #kfold = KFold(n_splits=5, random_state=42)
    r_ranforest = cross_validate(tree,X_train,y_train,scoring=scoring,cv=kfold)
    
    return([r_log, r_ranforest, r_xgb])

In [21]:
result = modeling(X1,y)



In [22]:
result2 = modeling(X2,y)



# Scores

Scores means from cross-validation are presented below for three tested algorithms and two features engineering approaches. We observe satisfying accuracy and auc, but recall needs to be improved in futer. Its low score is propably caused by imbalance in target variable.\
Algorithms results are suprisingly very similar. XGBoost performed best in recall score.

In [23]:
r1 = pd.DataFrame(result[0]).mean()
r2 = pd.DataFrame(result2[0]).mean()
r = pd.DataFrame({'X1': r1.values, 'X2': r2.values}, index=r1.axes)
print("Logistic regression:")
r

Logistic regression:


Unnamed: 0,X1,X2
fit_time,0.202935,0.18991
score_time,0.003106,0.002705
test_accuracy,0.897949,0.897672
test_recall,0.640133,0.63461
test_auc,0.880112,0.881423


In [24]:
r1 = pd.DataFrame(result[1]).mean()
r2 = pd.DataFrame(result2[1]).mean()
r = pd.DataFrame({'X1': r1.values, 'X2': r2.values}, index=r1.axes)
print("Random forest:")
r

Random forest:


Unnamed: 0,X1,X2
fit_time,5.472812,5.838744
score_time,0.559627,0.576394
test_accuracy,0.893523,0.894075
test_recall,0.602834,0.62031
test_auc,0.902316,0.902169


In [25]:
r1 = pd.DataFrame(result[2]).mean()
r2 = pd.DataFrame(result2[2]).mean()
r = pd.DataFrame({'X1': r1.values, 'X2': r2.values}, index=r1.axes)
print("XGBoost:")
r

XGBoost:


Unnamed: 0,X1,X2
fit_time,0.103365,0.093803
score_time,0.00591,0.005651
test_accuracy,0.892691,0.892692
test_recall,0.66344,0.668237
test_auc,0.89463,0.892627


## Features importances

In [26]:
model.fit(X_train,y_train)
feat_importances = pd.Series(model.feature_importances_, index=cols)
feat_importances.nlargest(20).plot(kind='barh')

NameError: name 'model' is not defined

In [None]:
########################
# zostawiłem na razie to co niżej wstaw gdzie uważasz

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 42)

In [None]:
#params = {'colsample_bytree': 0.8, 'gamma': 1.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8,'n_estimators':1000}
#xg = xgb.XGBClassifier(objective='binary:logistic',nthread=4,n_jobs=4,params=params)
#kfold = KFold(n_splits=5, random_state=42)
#cross_val_score(xg, X_train, y_train, cv=kfold,scoring='roc_auc').mean()

In [None]:
#xg.fit(X_train,y_train)
#feat_importances = pd.Series(xg.feature_importances_, index=cols)
#feat_importances.nlargest(20).plot(kind='barh')
#Tak jak sie mozna bylo spodziewac poutcome ma najwiekszy wplyw na klasyfikator

In [None]:
params = {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500,'max_features':'auto'}
tree = RandomForestClassifier(**params)
kfold = KFold(n_splits=10, random_state=42)
cross_val_score(tree,X_train,y_train,scoring='roc_auc',cv=kfold).mean()

In [None]:
logistic = LogisticRegression(C=0.001,solver='saga')
kfold = KFold(n_splits=5, random_state=42)
cross_val_score(logistic,X_train,y_train,scoring='roc_auc',cv=kfold).mean()

In [None]:
logistic.fit(X_train,y_train)
pred_logistic = logistic.predict(X_test)
score_logistic = roc_auc_score(pred_logistic,y_test)
score_logistic
#? xD