In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
engine = create_engine('postgresql://ubuntu@18.218.203.43:5432/metis-project-3')
df = pd.read_sql_table('bank', con=engine)

# Data Exploration
---

### Checking for null values

In [3]:
df.isnull().any()

age          False
job          False
marital      False
education    False
default      False
balance      False
housing      False
loan         False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

In [4]:
df.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
3800,32,technician,married,secondary,no,3333,no,no,cellular,8,aug,50,4,-1,0,unknown,no
69,32,technician,single,tertiary,no,360,no,no,cellular,19,nov,164,2,-1,0,unknown,no
4447,49,blue-collar,single,primary,no,2146,yes,no,cellular,7,may,1516,2,353,2,other,yes
2591,34,management,married,tertiary,no,2892,yes,yes,cellular,30,jul,1165,7,-1,0,unknown,no
240,34,admin.,single,secondary,no,1,yes,no,cellular,22,jul,483,7,-1,0,unknown,no


In [5]:
for col in df:
    print(col,sorted(df[col].unique()),'\n')

age [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 83, 84, 86, 87] 

job ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown'] 

marital ['divorced', 'married', 'single'] 

education ['primary', 'secondary', 'tertiary', 'unknown'] 

default ['no', 'yes'] 

balance [-3313, -2082, -1746, -1680, -1400, -1313, -1310, -1224, -1212, -1206, -1202, -1164, -1148, -1053, -988, -970, -967, -966, -938, -932, -921, -905, -892, -888, -887, -872, -852, -849, -839, -824, -770, -762, -759, -735, -715, -710, -703, -701, -679, -674, -673, -665, -650, -635, -632, -630, -626, -617, -614, -612, -606, -589, -588, -587, -584, -583, -568, -566, -563, -559, -558, -552, -551, -547, -540, -535, -522, -518, -

In [6]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


### Checking for imbalance

In [7]:
y_count = df.y.value_counts()

In [8]:
print(f'Class 0: {y_count[0]}')
print(f'Class 1: {y_count[1]}')
print(f'Proportion: {round(y_count[0] / y_count[1], 2)} : 1')
print(f'Percentage of Majority Class: {round(y_count[0] / sum(y_count), 4)*100}')

Class 0: 4000
Class 1: 521
Proportion: 7.68 : 1
Percentage of Majority Class: 88.48


In [9]:
y_count.plot(kind='bar', title='Class Count', rot=0)

<matplotlib.axes._subplots.AxesSubplot at 0x117728d68>

### Data split into train, validation and test

In [10]:
from sklearn import datasets, model_selection

In [11]:
y = df['y'].values
X = pd.get_dummies(df.drop('y', 1))

In [12]:
X_train, X_testval, y_train, y_testval = model_selection.train_test_split(
    X,
    y,
    test_size=.5,
    stratify=y,
)

In [13]:
X_test, X_val, y_test, y_val = model_selection.train_test_split(
    X_testval,
    y_testval,
    test_size=.5,
    stratify=y_testval,
)

### Logistic Regression

In [14]:
from sklearn import linear_model, metrics

logreg = linear_model.LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)


print(metrics.accuracy_score(y_val, y_pred))

0.900972590627763




In [15]:
from sklearn import preprocessing, pipeline

logreg = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler()),
    ("logistic", linear_model.LogisticRegression())
])
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)


print(metrics.accuracy_score(y_val, y_pred))

0.9027409372236959


