In [1]:
import pandas as pd
from warnings import filterwarnings
# Silence every warning category for the rest of the session. Note that this
# also hides any deprecation or convergence warnings raised by later cells.
filterwarnings('ignore')

## Helper functions

In [2]:
def dummy_data(data, categorical):
    """One-hot encode the given columns of a DataFrame.

    Each column named in ``categorical`` is replaced by indicator columns
    produced by ``pd.get_dummies`` and named ``<column>_<value>``; the new
    columns are appended to the right of the frame. Missing values get no
    indicator column of their own (``dummy_na=False``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    categorical : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Frame with every listed column replaced by its indicator columns.
        If ``categorical`` is empty, ``data`` itself is returned.
    """
    for x in categorical:
        dummies = pd.get_dummies(data[x], prefix=x, dummy_na=False)
        # Use the `columns=` keyword: the positional form `drop(x, 1)` was
        # deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
        data = data.drop(columns=x)
        data = pd.concat([data, dummies], axis=1)
    return data

In [3]:
# Read the dataset from a CSV file expected in the notebook's working directory.
data = pd.read_csv('bank-full.csv')

In [4]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# Encode the target as 0/1. The mapping also accepts values that are already
# 0/1, so re-running this cell leaves the column intact instead of turning
# every entry into NaN (Series.map yields NaN for keys missing from the dict).
data['y'] = data['y'].map({'no': 0, 'yes': 1, 0: 0, 1: 1})

In [6]:
# Check that the target column `y` now holds 0/1 instead of 'no'/'yes'.
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [7]:
# Names of the columns that will be one-hot encoded.
cat = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
]

In [8]:
# Replace each column listed in `cat` with its indicator columns.
# NOTE: `data` is overwritten here, so re-running this cell fails with a
# KeyError because the columns listed in `cat` no longer exist in the frame.
data = dummy_data(data,cat)

In [9]:
# (rows, columns) of the frame after one-hot encoding.
data.shape

(45211, 52)

In [10]:
# Summary statistics of the `age` column.
data['age'].describe()

count    45211.000000
mean        40.936210
std         10.618762
min         18.000000
25%         33.000000
50%         39.000000
75%         48.000000
max         95.000000
Name: age, dtype: float64

In [11]:
# Inspect the encoded frame: the categorical columns are replaced by
# prefixed indicator columns appended on the right.
data.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [13]:
# Every column except the target `y` is used as a predictor.
feat = [name for name in data.columns if name != 'y']
X = data[feat]
y = data['y']

# Hold out a quarter of the rows; stratifying on y keeps the class ratio
# the same in the train and test parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [20]:
# Fit a logistic regression with scikit-learn's default settings
# (fit returns the estimator itself, so the calls can be chained).
model = LogisticRegression().fit(X_train, y_train)
# Keep only the second predict_proba column, i.e. the probability of class 1.
predict = model.predict_proba(X_test)[:, 1]

In [21]:
# Predicted class-1 probabilities for the test rows.
predict

array([0.00758615, 0.02350407, 0.03086616, ..., 0.03269216, 0.07261965,
       0.06263853])

In [37]:
# ROC AUC of the predicted probabilities against the held-out test labels.
roc_auc_score(y_test,predict)

0.9074125861830368

In [15]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone `joblib` package (a scikit-learn dependency) is the
# supported replacement. Fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted logistic regression; joblib.dump returns the list of
# file names it wrote.
file = joblib.dump(model,'logistic_regression_v1.pk')

print('done!!')

done!!


In [56]:
# accuracy_score was never imported by any earlier cell, so this cell raised
# NameError on a fresh kernel.
from sklearn.metrics import accuracy_score

# Reload the persisted model and score it on the test split to confirm the
# save/load round trip works.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that you created or trust.
log = joblib.load('logistic_regression_v1.pk')
prediction = log.predict(X_test)
accuracy_score(y_test, prediction)

0.9021498717154738

In [None]:
from sklearn.svm import SVC

# predict_proba is only available when the SVC is built with probability=True
# (the default is False, which makes the call below fail). The probability
# estimates come from an internal cross-validation, so the seed is fixed to
# keep them repeatable.
l = SVC(probability=True, random_state=0)
l.fit(X_train,y_train)
# Full probability matrix: one row per test sample, one column per class.
predict = l.predict_proba(X_test)