In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pk
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Parallel accumulators: entry i of each list describes the same model.
# `models` holds display names; the other four hold test-set scores.
models, acc, precision, recall, f1 = [], [], [], [], []

In [3]:
# Load the ads dataset; the first CSV column becomes the row index.
dataset = pd.read_csv(
    'Social_Network_Ads.csv',
    index_col=0,
)
dataset.head(n=5)

Unnamed: 0_level_0,Gender,Age,EstimatedSalary,Purchased
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15624510,Male,19,19000,0
15810944,Male,35,20000,0
15668575,Female,26,43000,0
15603246,Female,27,57000,0
15804002,Male,19,76000,0


In [4]:
# Column positions are counted after the index column has been removed,
# so position 0 is Gender, 1 is Age, 2 is EstimatedSalary and 3 is Purchased.
predictors = dataset.iloc[:, [1, 2]]   # Age, EstimatedSalary
target = dataset.iloc[:, 3]            # Purchased
X, y = predictors.values, target.values

In [5]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0,
)

In [6]:
# Standardize the features. The scaler learns its statistics from the
# training split only and the same transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## Logistic Regression

In [7]:
# Train a logistic regression classifier on the scaled training data and
# register its display name for the comparison table.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
models.append('Logistic Regression')

In [8]:
# Save the fitted model to disk. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
with open('logistic_regression.sav', 'wb') as model_file:
    pk.dump(lr, model_file)

In [9]:
# Fitted coefficients of the logistic regression, one per input feature.
lr.coef_

array([[2.06169269, 1.10338288]])

In [10]:
# Fitted intercept (bias) term of the logistic regression.
lr.intercept_

array([-0.92421803])

In [11]:
# Number of iterations the solver ran while fitting.
lr.n_iter_

array([5], dtype=int32)

In [12]:
# Predicted class labels for the scaled test set.
lr.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [13]:
# Per-class probabilities for the test set, one column per class.
probs = lr.predict_proba(X_test)
# Show the first column only, i.e. the probability of the first class in lr.classes_.
probs[:, 0]

array([0.8835313 , 0.82871699, 0.8004837 , 0.90739447, 0.89926493,
       0.99169199, 0.98410063, 0.26459889, 0.99384437, 0.4907332 ,
       0.96238257, 0.96947917, 0.83490307, 0.62448827, 0.98369807,
       0.65886488, 0.7144286 , 0.98708287, 0.01150228, 0.95626217,
       0.90763174, 0.03668135, 0.71749074, 0.11706887, 0.99524108,
       0.02710673, 0.91943183, 0.92088349, 0.80168341, 0.83888382,
       0.97879075, 0.70119816, 0.06769477, 0.84353432, 0.98491826,
       0.99630128, 0.97919709, 0.93195984, 0.97118522, 0.4448538 ,
       0.93130023, 0.71096146, 0.93904903, 0.95886141, 0.19345382,
       0.9725232 , 0.69445268, 0.07165789, 0.99068214, 0.1393124 ,
       0.00989222, 0.96057081, 0.87708921, 0.57783027, 0.01932703,
       0.67330527, 0.91360278, 0.95550802, 0.50860878, 0.99631168,
       0.97865378, 0.06541544, 0.99028301, 0.60285451, 0.99802191,
       0.01850645, 0.96035612, 0.9725232 , 0.77974568, 0.5299776 ,
       0.40192565, 0.79006863, 0.98819632, 0.73944348, 0.92922

In [14]:
# Load the previously saved model back from disk. The context manager closes
# the file handle deterministically instead of leaking it.
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# wrote itself, never a file from an untrusted source.
with open('logistic_regression.sav', 'rb') as model_file:
    loaded_model_lr = pk.load(model_file)

## Decision Trees

In [15]:
# Train an entropy-based decision tree on the training data and register its
# display name for the comparison table.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt = dt.fit(X_train, y_train)
models.append('Decision Trees')

## Support Vector Machines

In [16]:
# Train an RBF-kernel support vector classifier and register its display name.
from sklearn.svm import SVC
svc = SVC(
    kernel='rbf',
    random_state=0,
).fit(X_train, y_train)
models.append('SVM')

## Random Forest Classifier

In [17]:
# Train a 350-tree random forest (entropy split criterion) and register its
# display name.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators=350,
    criterion='entropy',
    random_state=0,
)
rf = rf.fit(X_train, y_train)
models.append('Random Forest')

## AdaBoost Classifier

In [18]:
# Fit AdaBoost on the training set, boosting copies of the decision tree `dt`.
#
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, but it has
# stayed the first positional parameter, so this form works on both.
# `algorithm` is left at its default: 'SAMME.R' was the default on the
# scikit-learn version this notebook ran on, and newer releases deprecate and
# then reject that value when it is passed explicitly.
from sklearn.ensemble import AdaBoostClassifier
adb = AdaBoostClassifier(dt, n_estimators=50, random_state=40)
adb.fit(X_train, y_train)
models.append('AdaBoost')

## Average Ensemble

In [19]:
# Majority-vote ('hard') ensemble over the five base classifiers. It is
# registered under the display name 'Average Ensemble'.
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(
    estimators=[
        ('Logistic Regression', lr),
        ('SVM', svc),
        ('Decision Tree', dt),
        ('Random Forest', rf),
        ('AdaBoost', adb),
    ],
    voting='hard',
).fit(X_train, y_train)
models.append('Average Ensemble')

In [20]:
# Weighted soft-voting ensemble: combines the members' predicted class
# probabilities using the given per-model weights.
#
# Soft voting calls predict_proba on every member. The standalone `svc` was
# built without probability=True, so reusing it here would let fit() succeed
# but make vc2.predict / vc2.predict_proba fail. The ensemble therefore gets
# its own probability-enabled SVC with otherwise identical settings; the
# standalone `svc` and its reported scores are left untouched.
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
vc2 = VotingClassifier(estimators=[('Logistic Regression', lr),
                                   ('SVM', SVC(kernel='rbf', probability=True,
                                               random_state=0)),
                                   ('Decision Tree', dt),
                                   ('Random Forest', rf),
                                   ('AdaBoost', adb)],
                       voting='soft',
                       flatten_transform=True,
                       weights=[1, 5, 2, 4, 3])
vc2.fit(X_train, y_train)

VotingClassifier(estimators=[('Logistic Regression',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0, solver='warn',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('SVM',
                              SVC(C=1.0, cache_size=200, class_weight=None,
                                  coef0=0.0, decisi...
                                                                                       criterion='entropy',
 

### Evaluation through Confusion Matrix

In [21]:
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score)

In [22]:
# Evaluate logistic regression on the held-out test set.
# Predict once and compute each metric once, then print it and record it in
# the result list used by the model comparison table.
lr_pred = lr.predict(X_test)
print('Confusion Matrix for LR: \n', confusion_matrix(y_test, lr_pred))

lr_accuracy = accuracy_score(y_test, lr_pred)
print('Accuracy for LR: \n', lr_accuracy)
acc.append(lr_accuracy)

lr_precision = precision_score(y_test, lr_pred)
print('Precision for LR: \n', lr_precision)
precision.append(lr_precision)

lr_recall = recall_score(y_test, lr_pred)
print('Recall for LR: \n', lr_recall)
recall.append(lr_recall)

lr_f1 = f1_score(y_test, lr_pred)
print('f1_score for LR: \n', lr_f1)
f1.append(lr_f1)

Confusion Matrix for LR: 
 [[65  3]
 [ 8 24]]
Accuracy for LR: 
 0.89
Precision for LR: 
 0.8888888888888888
Recall for LR: 
 0.75
f1_score for LR: 
 0.8135593220338982


In [23]:
# Evaluate the decision tree on the held-out test set.
# Predict once and compute each metric once, then print it and record it in
# the result list used by the model comparison table.
dt_pred = dt.predict(X_test)
print('Confusion Matrix for DTrees: \n', confusion_matrix(y_test, dt_pred))

dt_accuracy = accuracy_score(y_test, dt_pred)
print('Accuracy for DTrees: \n', dt_accuracy)
acc.append(dt_accuracy)

dt_precision = precision_score(y_test, dt_pred)
print('Precision for DTrees: \n', dt_precision)
precision.append(dt_precision)

dt_recall = recall_score(y_test, dt_pred)
print('Recall for DTrees: \n', dt_recall)
recall.append(dt_recall)

dt_f1 = f1_score(y_test, dt_pred)
print('f1_score for DTrees: \n', dt_f1)
f1.append(dt_f1)

Confusion Matrix for DTrees: 
 [[62  6]
 [ 3 29]]
Accuracy for DTrees: 
 0.91
Precision for DTrees: 
 0.8285714285714286
Recall for DTrees: 
 0.90625
f1_score for DTrees: 
 0.8656716417910447


In [24]:
# Evaluate the SVM on the held-out test set.
# Predict once and compute each metric once, then print it and record it in
# the result list used by the model comparison table.
svc_pred = svc.predict(X_test)
print('Confusion Matrix for SVM: \n', confusion_matrix(y_test, svc_pred))

svc_accuracy = accuracy_score(y_test, svc_pred)
print('Accuracy for SVM: \n', svc_accuracy)
acc.append(svc_accuracy)

svc_precision = precision_score(y_test, svc_pred)
print('Precision for SVM: \n', svc_precision)
precision.append(svc_precision)

svc_recall = recall_score(y_test, svc_pred)
print('Recall for SVM: \n', svc_recall)
recall.append(svc_recall)

svc_f1 = f1_score(y_test, svc_pred)
print('f1_score for SVM: \n', svc_f1)
f1.append(svc_f1)

Confusion Matrix for SVM: 
 [[64  4]
 [ 3 29]]
Accuracy for SVM: 
 0.93
Precision for SVM: 
 0.8787878787878788
Recall for SVM: 
 0.90625
f1_score for SVM: 
 0.8923076923076922


In [25]:
# Evaluate the random forest on the held-out test set.
# Predict once and compute each metric once, then print it and record it in
# the result list used by the model comparison table.
rf_pred = rf.predict(X_test)
print('Confusion Matrix for RF: \n', confusion_matrix(y_test, rf_pred))

rf_accuracy = accuracy_score(y_test, rf_pred)
print('Accuracy for RF: \n', rf_accuracy)
acc.append(rf_accuracy)

rf_precision = precision_score(y_test, rf_pred)
print('Precision for RF: \n', rf_precision)
precision.append(rf_precision)

rf_recall = recall_score(y_test, rf_pred)
print('Recall for RF: \n', rf_recall)
recall.append(rf_recall)

rf_f1 = f1_score(y_test, rf_pred)
print('f1_score for RF: \n', rf_f1)
f1.append(rf_f1)

Confusion Matrix for RF: 
 [[64  4]
 [ 4 28]]
Accuracy for RF: 
 0.92
Precision for RF: 
 0.875
Recall for RF: 
 0.875
f1_score for RF: 
 0.875


In [26]:
# Evaluate AdaBoost on the held-out test set.
# Predict once and compute each metric once, then print it and record it in
# the result list used by the model comparison table.
adb_pred = adb.predict(X_test)
print('Confusion Matrix for ADB: \n', confusion_matrix(y_test, adb_pred))

adb_accuracy = accuracy_score(y_test, adb_pred)
print('Accuracy for ADB: \n', adb_accuracy)
acc.append(adb_accuracy)

adb_precision = precision_score(y_test, adb_pred)
print('Precision for ADB: \n', adb_precision)
precision.append(adb_precision)

adb_recall = recall_score(y_test, adb_pred)
print('Recall for ADB: \n', adb_recall)
recall.append(adb_recall)

adb_f1 = f1_score(y_test, adb_pred)
print('f1_score for ADB: \n', adb_f1)
f1.append(adb_f1)

Confusion Matrix for ADB: 
 [[61  7]
 [ 3 29]]
Accuracy for ADB: 
 0.9
Precision for ADB: 
 0.8055555555555556
Recall for ADB: 
 0.90625
f1_score for ADB: 
 0.8529411764705882


In [27]:
# Evaluate the hard-voting ensemble on the held-out test set.
# Predict once and compute each metric once, then print it and record it in
# the result list used by the model comparison table.
vc_pred = vc.predict(X_test)
print('Confusion Matrix for VC: \n', confusion_matrix(y_test, vc_pred))

vc_accuracy = accuracy_score(y_test, vc_pred)
print('Accuracy for VC: \n', vc_accuracy)
acc.append(vc_accuracy)

vc_precision = precision_score(y_test, vc_pred)
print('Precision for VC: \n', vc_precision)
precision.append(vc_precision)

vc_recall = recall_score(y_test, vc_pred)
print('Recall for VC: \n', vc_recall)
recall.append(vc_recall)

vc_f1 = f1_score(y_test, vc_pred)
print('f1_score for VC: \n', vc_f1)
f1.append(vc_f1)

Confusion Matrix for VC: 
 [[62  6]
 [ 4 28]]
Accuracy for VC: 
 0.9
Precision for VC: 
 0.8235294117647058
Recall for VC: 
 0.875
f1_score for VC: 
 0.8484848484848485


In [28]:
# Pair each table column name with the list that holds its values.
column_names = ['Models', 'Accuracies', 'Precision', 'Recall', 'f1-score']
column_values = [models, acc, precision, recall, f1]
model_dict = dict(zip(column_names, column_values))

In [29]:
# One row per model, one column per metric.
model_df = pd.DataFrame(data=model_dict)
model_df

Unnamed: 0,Models,Accuracies,Precision,Recall,f1-score
0,Logistic Regression,0.89,0.888889,0.75,0.813559
1,Decision Trees,0.91,0.828571,0.90625,0.865672
2,SVM,0.93,0.878788,0.90625,0.892308
3,Random Forest,0.92,0.875,0.875,0.875
4,AdaBoost,0.9,0.805556,0.90625,0.852941
5,Average Ensemble,0.9,0.823529,0.875,0.848485


In [30]:
# Rank models best-first: accuracy decides, then f1-score, recall and
# precision break ties in that order.
ranking_keys = ['Accuracies', 'f1-score', 'Recall', 'Precision']
model_df = model_df.sort_values(by=ranking_keys, ascending=False)

In [31]:
# Show the comparison table after ranking (best model first).
model_df

Unnamed: 0,Models,Accuracies,Precision,Recall,f1-score
2,SVM,0.93,0.878788,0.90625,0.892308
3,Random Forest,0.92,0.875,0.875,0.875
1,Decision Trees,0.91,0.828571,0.90625,0.865672
4,AdaBoost,0.9,0.805556,0.90625,0.852941
5,Average Ensemble,0.9,0.823529,0.875,0.848485
0,Logistic Regression,0.89,0.888889,0.75,0.813559


## Hyperparameter tuning

In [32]:
# 10-fold cross-validation of the logistic regression on the training set.
# The mean and standard deviation summarize the per-fold scores.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(lr, X_train, y_train, cv=10)
acMean, acStd = accuracies.mean(), accuracies.std()

In [33]:
# Mean score across the 10 cross-validation folds.
acMean

0.827081942899518

In [34]:
# Standard deviation of the score across the 10 cross-validation folds.
acStd

0.09362431332005731

In [35]:
# Individual per-fold scores returned by cross_val_score.
accuracies

array([0.80645161, 0.86666667, 0.73333333, 0.83333333, 0.7       ,
       0.66666667, 0.86666667, 0.93333333, 0.93333333, 0.93103448])

In [None]:
# ---------------------------------------- GPU ---------------------------------------------------------

In [36]:
# Exhaustive grid search over random-forest hyperparameters, scored by
# 10-fold cross-validated accuracy and run on all available cores.
# NOTE(review): the grid has 3 * 2 * 3 * 3 * 3 * 3 = 486 combinations, i.e.
# 4860 forest fits at cv=10 before the final refit. The recorded run of this
# cell was interrupted, so consider shrinking the grid before a full re-run.
from sklearn.model_selection import GridSearchCV
parameters = {
    "n_estimators": [100, 200, 300],
    "criterion": ['gini', 'entropy'],
    "max_depth": [8, 16, 32],
    "min_samples_split": [10, 20, 30],
    "min_samples_leaf": [1, 5, 15],
    "min_weight_fraction_leaf": [0.1, 0.05, 0.005],
}
grid_search = GridSearchCV(
    rf,
    parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Mean cross-validated accuracy of the best parameter combination found.
best_accuracy = grid_search.best_score_

In [None]:
# Display the best cross-validated score.
best_accuracy

In [None]:
# Parameter setting (a dict keyed by parameter name) that achieved the best score.
best_parameters = grid_search.best_params_

In [None]:
# Display the winning hyperparameter combination.
best_parameters

In [None]:
# Fit the final random forest on the training set using the hyperparameters
# selected by the grid search.
#
# - best_parameters is unpacked directly, so every tuned key is applied and
#   the call stays in sync if the search grid changes.
# - random_state is fixed (as for the other estimators in this notebook) so
#   the tuned model and its reported scores are reproducible.
# - The model name is appended to `models` so the name list stays the same
#   length as the metric lists, which receive the Tuned RF scores in the
#   evaluation cell that follows.
from sklearn.ensemble import RandomForestClassifier
tunedRF = RandomForestClassifier(random_state=0, **best_parameters)
tunedRF.fit(X_train, y_train)
models.append('Tuned RF')

In [None]:
# Evaluate the tuned random forest on the held-out test set.
# Predict once and compute each metric once, then print it and record it in
# the corresponding result list.
tuned_rf_pred = tunedRF.predict(X_test)
print('Confusion Matrix for Tuned RF: \n', confusion_matrix(y_test, tuned_rf_pred))

tuned_rf_accuracy = accuracy_score(y_test, tuned_rf_pred)
print('Accuracy for Tuned RF: \n', tuned_rf_accuracy)
acc.append(tuned_rf_accuracy)

tuned_rf_precision = precision_score(y_test, tuned_rf_pred)
print('Precision for Tuned RF: \n', tuned_rf_precision)
precision.append(tuned_rf_precision)

tuned_rf_recall = recall_score(y_test, tuned_rf_pred)
print('Recall for Tuned RF: \n', tuned_rf_recall)
recall.append(tuned_rf_recall)

tuned_rf_f1 = f1_score(y_test, tuned_rf_pred)
print('f1_score for Tuned RF: \n', tuned_rf_f1)
f1.append(tuned_rf_f1)