In [51]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import shuffle

In [2]:
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter13/Dataset/bank-full.csv'
bankData = pd.read_csv(filename,sep=";")
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
rob_scaler = RobustScaler()

In [5]:
bankData['ageScaled'] = rob_scaler.fit_transform(bankData['age'].values.reshape(-1, 1))
bankData['balScaled'] = rob_scaler.fit_transform(bankData['balance'].values.reshape(-1, 1))
bankData['durScaled'] = rob_scaler.fit_transform(bankData['duration'].values.reshape(-1, 1))

In [6]:
bankData.drop(['age', 'balance', 'duration'], 1, inplace=True)

In [7]:
bankData.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,ageScaled,balScaled,durScaled
0,management,married,tertiary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,1.266667,1.25,0.375
1,technician,single,secondary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.333333,-0.308997,-0.134259
2,entrepreneur,married,secondary,no,yes,yes,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.328909,-0.481481
3,blue-collar,married,unknown,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.533333,0.780236,-0.407407
4,unknown,single,unknown,no,no,no,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.329646,0.083333


In [8]:
bankCat = pd.get_dummies(bankData[['job','marital','education','default','housing','loan','contact','month','poutcome']])

In [9]:
bankNum = bankData[['ageScaled','balScaled','day','durScaled','campaign','pdays','previous']]

In [15]:
X = pd.concat([bankCat, bankNum], 1)
y = bankData['y']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [20]:
bankModel = LogisticRegression(max_iter=1000)

In [21]:
bankModel.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
preds = bankModel.predict(X_test)
print(bankModel.score(X_test, y_test))

0.9004718372161604


In [26]:
print(confusion_matrix(preds, y_test))
print('\n', classification_report(preds, y_test))

[[11706  1058]
 [  292   508]]

               precision    recall  f1-score   support

          no       0.98      0.92      0.95     12764
         yes       0.32      0.64      0.43       800

    accuracy                           0.90     13564
   macro avg       0.65      0.78      0.69     13564
weighted avg       0.94      0.90      0.92     13564



In [29]:
print('Percentage of negative class :',(y_train[y_train=='yes'].value_counts()/len(y_train) ) * 100)
print('Percentage of positive class :',(y_train[y_train=='no'].value_counts()/len(y_train) ) * 100)

Percentage of negative class : yes    11.764148
Name: y, dtype: float64
Percentage of positive class : no    88.235852
Name: y, dtype: float64


## Undersampling and Classification on Our Banking Dataset to Find the Optimal Result

In [30]:
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter13/Dataset/bank-full.csv'
bankData = pd.read_csv(filename,sep=";")
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [31]:
rob_scaler = RobustScaler()

In [33]:
bankData['ageScaled'] = rob_scaler.fit_transform(bankData['age'].values.reshape(-1, 1))
bankData['balScaled'] = rob_scaler.fit_transform(bankData['balance'].values.reshape(-1, 1))
bankData['durScaled'] = rob_scaler.fit_transform(bankData['duration'].values.reshape(-1, 1))

In [34]:
bankData.drop(['age', 'balance', 'duration'], 1, inplace=True)

In [35]:
bankCat = pd.get_dummies(bankData[['job','marital','education','default','housing','loan','contact','month','poutcome']])

In [36]:
bankNum = bankData[['ageScaled','balScaled','day','durScaled','campaign','pdays','previous']]

In [37]:
X = pd.concat([bankCat, bankNum], 1)
y = bankData['y']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [39]:
trainData = pd.concat([X_train, y_train], 1)

In [40]:
trainData.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,y
19100,1,0,0,0,0,0,0,0,0,0,...,0,1,0.8,-0.162979,5,0.236111,1,-1,0,no
37958,1,0,0,0,0,0,0,0,0,0,...,0,0,0.733333,-0.238938,14,0.865741,2,289,19,no
12451,0,1,0,0,0,0,0,0,0,0,...,0,1,0.0,0.385693,1,1.347222,3,-1,0,no
18263,0,0,0,0,1,0,0,0,0,0,...,0,1,1.333333,-0.330383,31,-0.592593,8,-1,0,no
5128,0,0,0,0,0,0,0,1,0,0,...,0,1,-0.466667,-0.14233,21,-0.435185,2,-1,0,no


In [42]:
ind = trainData[trainData['y'] == 'yes'].index
minData = trainData.loc[ind]
minData.shape

(3723, 52)

In [47]:
ind1 = trainData[trainData['y'] == 'no'].index
majData = trainData.loc[ind1]
majData.shape

(27924, 52)

In [49]:
majSample = majData.sample(n=len(ind), random_state=123)

In [50]:
balancedData = pd.concat([minData, majSample], 0)

In [52]:
balancedData = shuffle(balancedData)
balancedData.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,y
37047,1,0,0,0,0,0,0,0,0,0,...,0,1,-0.266667,-0.330383,13,4.134259,1,-1,0,yes
21632,0,0,0,0,1,0,0,0,0,0,...,0,1,-0.533333,-0.330383,19,-0.115741,4,-1,0,no
11558,0,0,0,1,0,0,0,0,0,0,...,0,1,0.133333,0.315634,19,-0.472222,1,-1,0,no
28037,0,0,0,0,0,0,0,0,0,1,...,0,1,0.2,-0.330383,28,3.527778,2,-1,0,no
39439,1,0,0,0,0,0,0,0,0,0,...,1,0,0.266667,0.210177,25,0.472222,1,187,3,yes


In [53]:
X_train_new = balancedData.drop('y', 1)
y_train_new = balancedData.y

In [54]:
y_train_new.head()

37047    yes
21632     no
11558     no
28037     no
39439    yes
Name: y, dtype: object

In [55]:
bankModel1 = LogisticRegression(max_iter=1000)
bankModel1.fit(X_train_new, y_train_new)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
preds = bankModel1.predict(X_test)

In [57]:
bankModel1.score(X_test, y_test)

0.8464317310527868

In [58]:
print(confusion_matrix(preds, y_test))
print('\n', classification_report(preds, y_test))

[[10203   288]
 [ 1795  1278]]

               precision    recall  f1-score   support

          no       0.85      0.97      0.91     10491
         yes       0.82      0.42      0.55      3073

    accuracy                           0.85     13564
   macro avg       0.83      0.69      0.73     13564
weighted avg       0.84      0.85      0.83     13564



## SMOTE bank dataset

In [59]:
print("Before OverSampling count of yes: {}".format(sum(y_train=='yes')))
print("Before OverSampling count of no: {} \n".format(sum(y_train=='no')))

Before OverSampling count of yes: 3723
Before OverSampling count of no: 27924 

