In [1]:
# import all modules used in this study

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
# load both the train and test csv data files into respective variables

train_data = pd.read_csv('train.csv', sep=';')
test_data = pd.read_csv('test.csv', sep=';')

In [3]:
# display size and content of the train datasheet

train_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [4]:
# display size and content of the test datasheet which is ~10% the size of the train dataset

test_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [5]:
train_data.education[train_data['education'] == 'unknown'].count()

1857

In [6]:
test_data.education[train_data['education'] == 'unknown'].count()

251

In [7]:
'''
Proceed with minor cleanup of the datasets by removing rows in
both the train and test datasets with unknown listed as the value
for education representing 1857 / 45211 = ~4.1% of the train csv file
and 251 / 4521 = ~5.6% of the test csv file.
'''

train_data = train_data[train_data['education'] != 'unknown']
test_data = test_data[test_data['education'] != 'unknown']

In [8]:
'''
Define function which will convert default, housing, loan, and y columns
into categorical values of 0 and 1 corresponding to no and yes.

Subsequently convert primary, seconday, and tertiary values under education,
and single, married, divorced values under to marital to categorical 0, 1, and 2.
'''

def data_categorical(data):
    data.y = pd.Categorical(data.y).codes
    data.default = pd.Categorical(data.default).codes
    data.housing = pd.Categorical(data.housing).codes
    data.loan = pd.Categorical(data.loan).codes

    data.education.replace(['primary', 'secondary', 'tertiary'], [0, 1, 2], inplace=True)
    data.marital.replace(['single', 'married', 'divorced'], [0, 1, 2], inplace=True)

In [9]:
# Proceed with the categorical conversion of the datasets as defined in the above function

data_categorical(train_data)
data_categorical(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [10]:
# Set the X_train, y_train, X_test, y_test variables to corresponding independent and dependent variables

X_train = train_data[['age', 'marital', 'education', 'default', 'balance', 'housing', 'loan']]
y_train = train_data['y']
X_test = test_data[['age', 'marital', 'education', 'default', 'balance', 'housing', 'loan']]
y_test = test_data['y']

In [11]:
# define logistic regression modeling and test prediction accuracy

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

print(metrics.accuracy_score(y_test, predictions))

0.8841716658975542


In [12]:
# define k nearest neighbor modeling and test prediction accuracy

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

print(metrics.accuracy_score(y_test, predictions))

0.9155514536225197


In [13]:
# define decision tree modeling and test prediction accuracy

DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
predictions = DT.predict(X_test)

print(metrics.accuracy_score(y_test, predictions))

0.9803876326718967


In [14]:
# define random forest modeling and test prediction accuracy

rfc = RandomForestClassifier(n_estimators=600)
rfc.fit(X_train,y_train)
predictions = rfc.predict(X_test)

print(metrics.accuracy_score(y_test, predictions))

0.9808491001384403


The random forest algorithm yielded the highest prediction accuracy on the supplied test dataset.