# Importing

In [150]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score


In [2]:
df= pd.read_csv('cleaned_bank.csv')

In [3]:
try:
    del df['Unnamed: 0']
except:
    pass

In [4]:
df.head()

Unnamed: 0,age,marital,education,default,housing,loan,contact,month,campaign,pdays,...,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,y,pcall,age2,young_student,rejecter,old_retired
0,56,1,0,0,0,0,1,9.529957,1,0,...,0,1.1,93.994,-36.4,0,0,256,0,0,0
1,57,1,1,1,0,0,1,9.529957,1,0,...,0,1.1,93.994,-36.4,0,0,289,0,1,0
2,37,1,1,0,1,0,1,9.529957,1,0,...,0,1.1,93.994,-36.4,0,0,9,0,1,0
3,40,1,0,0,0,0,1,9.529957,1,0,...,0,1.1,93.994,-36.4,0,0,0,0,0,0
4,56,1,1,0,0,1,1,9.529957,1,0,...,0,1.1,93.994,-36.4,0,0,256,0,1,0


# Split + Scale + Fixing Class Imbalance

In [5]:
X=df.drop('y', axis=1)
y=pd.DataFrame(df.y)

In [6]:
cols=X.columns

In [7]:
X_train, X_test1, y_train, y_test1=train_test_split(X,y,test_size=.3)

In [8]:
mms= MinMaxScaler()

In [9]:
X_train=pd.DataFrame(mms.fit_transform(X_train), columns=cols)

In [10]:
X_test1=pd.DataFrame(mms.transform(X_test1), columns=cols)

In [11]:
ys=np.array(y_train['y'])

In [12]:
X_train['y']=ys

In [13]:
X_train1=X_train[X_train.y>.5]

In [14]:
X_train2= X_train[X_train.y<.5].sample(len(X_train1))

In [15]:
X_train=pd.concat([X_train1, X_train2])

In [16]:
y_train=X_train['y']

In [17]:
del X_train['y']

# Lasso Logistic

In [18]:
logistic = LogisticRegression()

In [19]:
penalty = ['l1']

# Create regularization hyperparameter space
#After trying many values for C, the search seems to prefer 10**-0.5
C = C_param_range = [10**-2, 10**-1.5, 10**-1, 10**-.5, 10**0]

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [20]:
clf_gs = GridSearchCV(logistic, hyperparameters, cv=20, scoring='f1') 

In [21]:
logreg_model = clf_gs.fit(X_train, y_train)

In [22]:
print('Best Penalty:', logreg_model.best_estimator_.get_params()['penalty'])
print('Best C:', logreg_model.best_estimator_.get_params()['C'])

Best Penalty: l1
Best C: 1


In [23]:
y_pred_train = logreg_model.best_estimator_.predict(X_train)
f1_score(y_train, y_pred_train)

0.7087276550998949

In [24]:
logreg_model_params = list(zip(cols, logreg_model.best_estimator_.coef_[0]))

logreg_model_params.sort(key = lambda x: abs(x[1]), reverse=True)
logreg_model_params

[('emp.var.rate', -2.9785691941494945),
 ('cons.price.idx', 2.5166730984241616),
 ('age2', 1.4527915696814693),
 ('pdays', 1.4172547136957607),
 ('month', -1.2557307705696443),
 ('cons.conf.idx', 0.7998349775777995),
 ('contact', -0.7542709895472122),
 ('pcall', -0.4991464232690159),
 ('campaign', -0.48927285384772945),
 ('poutcome', 0.3630240468608659),
 ('education', 0.30381611331211084),
 ('young_student', 0.2475300786719836),
 ('previous', -0.2287770824517244),
 ('old_retired', 0.15148022297054367),
 ('rejecter', -0.08591224622379912),
 ('marital', -0.0813119629821231),
 ('default', -0.049051020642534575),
 ('housing', -0.04245560339102883),
 ('loan', 0.016084982301390424),
 ('age', 0.0)]

# Ridge Logistic

In [25]:
logistic2 = LogisticRegression()

In [26]:
penalty = ['l2']


C = C_param_range = [10**-1, 10**-.5, 10**0, 10**.5, 10**1]

hyperparameters = dict(C=C, penalty=penalty)

In [27]:
#Scaling down correlated columns
corr_2= X_train.corr()**2
for col in corr_2:
    X_train[col]=X_train[col]/(X_train[col].max()*((corr_2[col].sum())**.5))
    X_test1[col]= X_test1[col]/(X_test1[col].max()*((corr_2[col].sum())**.5))

In [28]:
clf_gs2 = GridSearchCV(logistic2, hyperparameters, cv=20, scoring='f1') 

In [29]:
logreg_model2 = clf_gs2.fit(X_train, y_train)

In [30]:
print('Best Penalty:', logreg_model2.best_estimator_.get_params()['penalty'])
print('Best C:', logreg_model2.best_estimator_.get_params()['C'])

Best Penalty: l2
Best C: 3.1622776601683795


In [31]:
y_pred_train2 = logreg_model2.best_estimator_.predict(X_train)
f1_score(y_train, y_pred_train2)

0.7093717093717093

In [32]:
logreg_model_params2 = list(zip(cols, logreg_model2.best_estimator_.coef_[0]))

logreg_model_params2.sort(key = lambda x: abs(x[1]), reverse=True)
logreg_model_params2

[('emp.var.rate', -4.808680972449352),
 ('cons.price.idx', 3.432781278825641),
 ('pdays', 2.339944492183521),
 ('age2', 2.118275910538086),
 ('month', -1.6327901260440336),
 ('contact', -0.9851620959313085),
 ('cons.conf.idx', 0.8813142305448045),
 ('poutcome', 0.843124647388883),
 ('pcall', -0.775869567196003),
 ('previous', -0.7702899880080383),
 ('campaign', -0.5643047537140631),
 ('education', 0.35686404714031555),
 ('young_student', 0.301905316783956),
 ('old_retired', 0.2731414491408258),
 ('age', -0.10669540153486812),
 ('rejecter', -0.10624039203520794),
 ('marital', -0.09309695481980591),
 ('default', -0.06225434779372901),
 ('housing', -0.04805654298467595),
 ('loan', 0.02249563801811785)]

# KNN

In [33]:
knn=KNeighborsClassifier()

In [34]:
X_test1_knn=X_test1.copy()


In [35]:
for l in logreg_model_params2:
    X_train[l[0]]=X_train[l[0]]*((abs(l[1]))**.5)
    X_test1_knn[l[0]]=X_test1_knn[l[0]]*((abs(l[1]))**.5)

In [36]:
for col in corr_2:
    X_train[col]=X_train[col]/(((corr_2[col].sum())**.25))
    X_test1_knn[col]=X_test1_knn[col]/(((corr_2[col].sum())**.25))

In [37]:
neighbors= [5,7,9]
grid_params_KNN = [{'n_neighbors':neighbors, 'weights': [ 'uniform']}] 


In [38]:
gs_KNN_poly = GridSearchCV(knn, grid_params_KNN, cv=5, scoring='f1') 

In [39]:
gs_KNN_poly.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_neighbors': [5, 7, 9], 'weights': ['uniform']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [40]:
# Best params
print('Best params: %s' % gs_KNN_poly.best_params_)
# Best training data accuracy
# Predict on test data with best params
y_pred_KNN = gs_KNN_poly.predict(X_train)
# Test data accuracy of model with best par
print('Train set F1 score for best params: %.3f ' % f1_score(y_train, y_pred_KNN))

Best params: {'n_neighbors': 9, 'weights': 'uniform'}
Train set F1 score for best params: 0.750 


# Decision Trees

In [41]:
dt= DecisionTreeClassifier()


In [42]:
#Bringing the data back to regular scale
for col in corr_2:
    X_train[col]=X_train[col]/(X_train[col].max()*((corr_2[col].sum())**.5))
    X_test1[col]= X_test1[col]/(X_test1[col].max()*((corr_2[col].sum())**.5))

In [43]:
parameters={ 'max_depth': [9,12, 15], 'criterion': ['entropy', 'gini'],
            'min_samples_leaf':[15,20,25,35],'min_impurity_decrease':[.001,.0005,0.0003] }

In [44]:
clf_tree=DecisionTreeClassifier()
gs_dt = GridSearchCV(DecisionTreeClassifier(),
                  param_grid= parameters,
                  scoring='f1', cv=5, return_train_score=True)
gs_dt.fit(X_train, y_train)
results = gs_dt.cv_results_

In [45]:
print(gs_dt.best_params_)


{'criterion': 'entropy', 'max_depth': 9, 'min_impurity_decrease': 0.001, 'min_samples_leaf': 15}


In [46]:
#Decision Tree has highest f1 score
y_pred_dt = gs_dt.best_estimator_.predict(X_train)

print("f1:",f1_score(y_train, y_pred_dt))

f1: 0.7088156723063223


# Random Forest

In [47]:
rfc=RandomForestClassifier()

In [48]:
param_grid = { 
    'n_estimators': [500],
    'max_features': [0.4, 0.5, .6],
    'max_depth' : [10, 12, 14],
    'min_samples_leaf': [0.04,0.05],
    'criterion': [ 'entropy']
}

In [49]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,n_jobs=-1)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [500], 'max_features': [0.4, 0.5, 0.6], 'max_depth': [10, 12, 14], 'min_samples_leaf': [0.04, 0.05], 'criterion': ['entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [50]:
CV_rfc.best_params_

{'criterion': 'entropy',
 'max_depth': 12,
 'max_features': 0.6,
 'min_samples_leaf': 0.05,
 'n_estimators': 500}

In [51]:
rfc_pred = CV_rfc.predict(X_train)


In [52]:
print('Test F1 score: ', f1_score(y_train, rfc_pred))

Test F1 score:  0.6912242686890575


# XG Boost

In [53]:
clf= xgb.XGBClassifier()

In [113]:
param_grid = {
    "learning_rate": [0.1],
    'max_depth': [3],
    'min_child_weight': [40],
    'subsample': [.8],
    'n_estimators': [500]
}

In [117]:
grid_clf = GridSearchCV(clf, param_grid, scoring='f1', cv=None, n_jobs=1)
grid_clf.fit(X_train, y_train)


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.1], 'max_depth': [3], 'min_child_weight': [40], 'subsample': [0.8], 'n_estimators': [500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [118]:
best_parameters_xg = grid_clf.best_params_
print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters_xg.keys()):
    print("%s: %r" % (param_name, best_parameters_xg[param_name]))

training_preds = grid_clf.predict(X_train)

Grid Search found the following optimal parameters: 
learning_rate: 0.1
max_depth: 3
min_child_weight: 40
n_estimators: 500
subsample: 0.8


  if diff:


In [119]:
training_accuracy = f1_score(y_train, training_preds)
training_accuracy

0.7292798317855266

# Val-Train Split

In [121]:
X_val, X_test, y_val, y_test= train_test_split(X_test1, y_test1, test_size=.5)

In [122]:
X_val_k=X_val.copy()
X_test_k=X_test.copy()

In [123]:
for l in logreg_model_params2:
    X_val_k[l[0]]=X_val_k[l[0]]*((abs(l[1]))**.5)
    X_test_k[l[0]]=X_test_k[l[0]]*((abs(l[1]))**.5)

In [124]:
for col in corr_2:
    X_val_k[col]=X_val_k[col]/(((corr_2[col].sum())**.25))
    X_test_k[col]=X_test_k[col]/(((corr_2[col].sum())**.25))

# Validation

In [128]:
y_pred_val_2 = logreg_model2.best_estimator_.predict(X_val)
f1_score(y_val, y_pred_val_2)

0.4452759588400374

In [129]:
y_val_KNN = gs_KNN_poly.predict(X_val)

In [130]:
f1_score(y_val, y_val_KNN)

0.3228498074454429

In [131]:
y_val_dt = gs_dt.best_estimator_.predict(X_val)
f1_score(y_val, y_val_dt)

0.45503159941662624

In [134]:
y_val_rf= rfc_pred = CV_rfc.predict(X_val)

f1_score(y_val, y_val_rf)

0.46169354838709675

In [136]:
y_val_xg = grid_clf.predict(X_val)

  if diff:


In [137]:
f1_score(y_val, y_val_xg)

0.4436054742803209

In [148]:
ensemble_predict= (np.sign((y_pred_val_2+ y_val_KNN  + y_val_dt + y_val_rf + y_val_xg)-2.5)+1)/2

In [149]:
f1_score(y_val, ensemble_predict)

0.459048553212359