## Preprocessing of data

### Loading Libraries

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

### Taking input data

In [2]:
df_bank = pd.read_csv("bank-full.csv")
df_bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
df_bank.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


### Transforming and cleaning data

In [4]:
# Label Encoding

df_bank['marital'] = df_bank['marital'].map( {'single': 0, 'married': 1, 'divorced': 2} ).astype(int)
df_bank['education'] = df_bank['education'].map( {'unknown': 0, 'primary': 1, 'secondary': 2,'tertiary': 3} ).astype(int)
df_bank['default'] = df_bank['default'].map( {'yes': 1, 'no': 0}).astype(int)
df_bank['housing'] = df_bank['housing'].map( {'yes': 1, 'no': 0}).astype(int)
df_bank['loan'] = df_bank['loan'].map( {'yes': 1, 'no': 0}).astype(int)
df_bank['contact'] = df_bank['contact'].map( {'unknown': 0, 'cellular': 1, 'telephone': 2} ).astype(int)
df_bank['poutcome'] = df_bank['poutcome'].map( {'unknown': 0, 'failure': 1, 'success': 2,'other': 3} ).astype(int)
df_bank['y'] = df_bank['y'].map({'yes':1,'no':0}).astype(int)

In [5]:
# Making job profiles and dropping unnecessary columns

def job_pro(x):
    if x["job"] == "retired" or x['job'] == 'student':
        return "B"
    if x["job"] == "admin." or x['job'] == 'services':
        return "D"
    if x["job"] == "blue-collar" or x['job'] == 'management' or x['job'] == 'technician':
        return "C"
    return "A"
df_bank.apply(lambda x:job_pro(x), axis = 1)
df_bank["job_profile"] = df_bank.apply(lambda x:job_pro(x), axis = 1)
def jobA(x):
    if x["job_profile"] == "A":
        return 1
    return 0
df_bank["job_A"] = df_bank.apply(lambda x:jobA(x), axis = 1)
def jobB(x):
    if x["job_profile"] == "B":
        return 1
    return 0
df_bank["job_B"] = df_bank.apply(lambda x:jobB(x), axis = 1)
def jobC(x):
    if x["job_profile"] == "C":
        return 1
    return 0
df_bank["job_C"] = df_bank.apply(lambda x:jobC(x), axis = 1)
def jobD(x):
    if x["job_profile"] == "D":
        return 1
    return 0
df_bank["job_D"] = df_bank.apply(lambda x:jobD(x), axis = 1)
df_bank = df_bank.drop(['job','month','job_profile'],axis=1)

In [6]:
# Standardising the columns balance and duration

from sklearn import preprocessing
X = df_bank[['balance','duration']]
std_scale = preprocessing.StandardScaler().fit(X)
X_std = std_scale.transform(X)
x_stds = pd.DataFrame(data = X_std)
df_bank = pd.concat([df_bank,x_stds],axis=1)
df_bank.rename(columns={0:"Balance",1:"Duration"},inplace=True)
df_bank = df_bank.drop(['balance','duration'],axis=1)

## After pre-processing of data is finished, we can now proceed with training models on the data

### Splitting Data into training and testing dataset

In [7]:
# Train Test splitting

from sklearn.model_selection import train_test_split
X_train = df_bank.drop('y',axis=1)
Y_train = df_bank['y']
X_Train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.30, random_state=42)

### Running different models and checking accuracies on training and test datasets

In [8]:
#LogisticRegression
logReg = LogisticRegression()
logReg.fit(X_Train,y_train)
acc_log = round(logReg.score(X_Train,y_train) * 100, 2)
acc_log_test = round(logReg.score(X_test,y_test) * 100, 2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_Train, y_train)
Y_pred = decision_tree.predict(X_Train)
acc_decision_tree = round(decision_tree.score(X_Train, y_train) * 100, 2)
acc_decision_tree_test = round(decision_tree.score(X_test, y_test) * 100, 2)

In [10]:
# Random Forest
random_forestTest = RandomForestClassifier(n_estimators=100)
random_forestTest.fit(X_Train, y_train)
Y_predTest = random_forestTest.predict(X_Train)
acc_random_forest = round(random_forestTest.score(X_Train, y_train) * 100, 2)
acc_random_forest_test = round(random_forestTest.score(X_test, y_test) * 100, 2)

In [11]:
# k- Nearest Neighbor
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_Train, y_train)
Y_pred = knn.predict(X_Train)
acc_knn = round(knn.score(X_Train, y_train) * 100, 2)
acc_knn_test = round(knn.score(X_test, y_test) * 100, 2)

In [12]:
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_Train, y_train)
Y_pred = gaussian.predict(X_Train)
acc_gaussian = round(gaussian.score(X_Train, y_train) * 100, 2)
acc_gaussian_test = round(gaussian.score(X_test, y_test) * 100, 2)

In [13]:
# Support Vector Machines
svc = SVC()
svc.fit(X_Train, y_train)
Y_pred = svc.predict(X_Train)
acc_svc = round(svc.score(X_Train, y_train) * 100, 2)
acc_svc_test = round(svc.score(X_test, y_test) * 100, 2)

In [14]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes','Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_decision_tree],
    'TestScore': [acc_svc_test,acc_knn_test,acc_log_test,
                  acc_random_forest_test,acc_gaussian_test,acc_decision_tree_test]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score,TestScore
3,Random Forest,100.0,89.96
5,Decision Tree,100.0,86.11
1,KNN,92.64,88.3
2,Logistic Regression,89.23,88.93
0,Support Vector Machines,88.34,88.22
4,Naive Bayes,82.87,83.01


### Since Random Forest gave the highest accuracies for both training and testing datasets, proceeding with the same

In [15]:
random_forestTest = RandomForestClassifier(random_state=10)
random_forestTest.fit(X_train, Y_train)
Y_predTest = random_forestTest.predict(X_train)
acc_random_forest = round(random_forestTest.score(X_train, Y_train) * 100, 2)
acc_random_forest_test = round(random_forestTest.score(X_train, Y_train) * 100, 2)
print("Training Score =",acc_random_forest)
print("Test Score =",acc_random_forest_test)

Training Score = 100.0
Test Score = 100.0


### Adding a column of predicted probabilities to the dataset against each customer and taking a csv file as output for the same

In [16]:
Y_probTest = random_forestTest.predict_proba(X_train)
yy = pd.DataFrame(Y_probTest)
yy.rename(columns={0:"prob Y=0",1:"prob Y=1"},inplace=True)
Y_predTest = random_forestTest.predict(X_train)
yyy=pd.DataFrame(Y_predTest)
yyy.rename(columns={0:"pred Y"},inplace=True)
df_bank = pd.concat([df_bank,yyy,yy],axis=1)
df_bank.to_csv("bank_predt_RF.csv",index=False)

### After this, we sorted the predicted probabilities column and got the list of the customers with highest chances of buying the term policy