In [65]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("bank_altered.csv").drop("Unnamed: 0", axis=1)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,y
0,58,management,married,tertiary,0,2143,1,0,unknown,5,5,261,1,-1,0,0
1,44,technician,single,secondary,0,29,1,0,unknown,5,5,151,1,-1,0,0
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5,5,76,1,-1,0,0
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5,5,92,1,-1,0,0
4,35,management,married,tertiary,0,231,1,0,unknown,5,5,139,1,-1,0,0


## The contact column of the above dataframe contains an unacceptable number of unknown values.  I will drop this column.

In [3]:
df = df.drop(['contact'], axis=1)

## I plan to first fit a multiple logistic regression model to the data, but a number of the attributes are categorical with multiple levels.  I will break these categorical attributes into binary dummy variables to facilitate fitting the logistic regression model.

In [4]:
df = df.drop(['job', 'marital', 'education'], axis=1).join(pd.get_dummies(df[['job', 'marital', 'education']]))
df.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown
0,58,0,2143,1,0,5,5,261,1,-1,...,0,0,0,0,1,0,0,0,1,0
1,44,0,29,1,0,5,5,151,1,-1,...,0,1,0,0,0,1,0,1,0,0
2,33,0,2,1,1,5,5,76,1,-1,...,0,0,0,0,1,0,0,1,0,0
3,47,0,1506,1,0,5,5,92,1,-1,...,0,0,0,0,1,0,0,0,0,1
4,35,0,231,1,0,5,5,139,1,-1,...,0,0,0,0,1,0,0,0,1,0


## I will now store the predictor and target variables as X and y, respectively.

In [5]:
X = df.drop('y', axis=1).values
y = df['y'].values

## Now I will fit and validate the model using KFold validation.

In [7]:
skf = StratifiedKFold(n_splits = 5, random_state = 3)

In [48]:
def f1_func(model, X, y):
    score = 0
    for train, test in StratifiedKFold(n_splits=5, random_state=3).split(X, y):
        model.fit(X[train], y[train])
        score += f1_score(model.predict(X[test]), y[test])
    return score/5

In [70]:
f1_func(lm.LogisticRegression(class_weight='balanced'), X, y)

0.41637356277314347

In [66]:
score_list = []
Cs = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 15, 20, 50]
for see in Cs:
    score_list.append(f1_func(lm.LogisticRegression(C=see, class_weight='balanced'), X, y))

In [67]:
np.max(score_list)

0.42555965928536094

In [52]:
best_C = Cs[score_list.index(np.max(score_list))]

In [58]:
blarg = []
for train, test in StratifiedKFold(n_splits=5, random_state=3).split(X,y):
    blarg.append([train, test])

In [68]:
log_reg = lm.LogisticRegression(C=best_C, class_weight='balanced').fit(X, y)

In [69]:
print(classification_report(log_reg.predict(X), y))

             precision    recall  f1-score   support

          0       0.81      0.97      0.88     33235
          1       0.79      0.35      0.49     11685

avg / total       0.80      0.81      0.78     44920

