In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Load the Social Network Ads CSV (relative path) into a DataFrame.
ads_csv = 'datasets/Social_Network_Ads.csv'
df = pd.read_csv(ads_csv)

# Plain-text dump of the frame underneath a heading line.
print('Social network ads data\n', df)

Social network ads data
       User ID  Gender   Age  EstimatedSalary  Purchased
0    15624510    Male  19.0          19000.0          0
1    15810944    Male  35.0          20000.0          0
2    15668575  Female  26.0          43000.0          0
3    15603246  Female  27.0          57000.0          0
4    15804002    Male  19.0          76000.0          0
..        ...     ...   ...              ...        ...
395  15691863  Female  46.0          41000.0          1
396  15706071    Male  51.0          23000.0          1
397  15654296  Female  50.0          20000.0          1
398  15755018    Male  36.0          33000.0          0
399  15594041  Female  49.0          36000.0          1

[400 rows x 5 columns]


In [3]:
# Split the frame into model inputs (x) and the target to predict (y).
feature_columns = ['Age', 'EstimatedSalary']  # the two input variables
target_column = 'Purchased'                   # the output variable

x = df[feature_columns]
y = df[target_column]

In [4]:
# Feature scaling: min-max scale both input columns.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on every row of `x`, i.e. before any
# train/test partitioning has happened.
scaler.fit(x)
x_scaled = scaler.transform(x)

In [5]:
# Hold-out split: one train/test partition (this is not k-fold cross-validation).
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split the *unscaled* features first. Scaling before the split would let the
# test rows influence the min/max used for preprocessing (data leakage) and
# make the reported test scores optimistic.
# No test_size is passed, so the library default applies; random_state is
# kept at 0 so the partition stays reproducible.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, random_state=0)

# Learn the scaling parameters from the training rows only, then apply that
# same transformation to both partitions. Test values may therefore fall
# slightly outside [0, 1]; that is expected.
# `scaler` is rebound here on purpose so that any later use of it matches the
# preprocessing the models were actually trained with.
scaler = MinMaxScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

In [6]:
# Build the six base classifiers and train each one on the training split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Logistic regression with library-default settings.
l_rg = LogisticRegression()
l_rg.fit(x_train, y_train)

# Tree-based models, seeded so repeated runs give the same trees.
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(x_train, y_train)

rf_clf = RandomForestClassifier(n_estimators=10, random_state=0)
rf_clf.fit(x_train, y_train)

# k-nearest neighbours with k = 5.
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(x_train, y_train)

# Support vector classifier with an RBF kernel.
sv_clf = SVC(kernel='rbf', random_state=0)
sv_clf.fit(x_train, y_train)

# Gaussian naive Bayes; kept last so the cell's final expression is unchanged.
nb_clf = GaussianNB()
nb_clf.fit(x_train, y_train)

In [7]:
# Predict the test rows with every fitted base model.
# The generator yields predictions in the same order as the target names.
(
    l_rg_pred,
    dt_clf_pred,
    rf_clf_pred,
    knn_clf_pred,
    sv_clf_pred,
    nb_clf_pred,
) = (
    clf.predict(x_test)
    for clf in (l_rg, dt_clf, rf_clf, knn_clf, sv_clf, nb_clf)
)

In [8]:
from sklearn.metrics import accuracy_score

In [9]:
# Print the test-set accuracy of each base model, one line per model.
for label, preds in (
    ('logreg:', l_rg_pred),
    ('dt_clf_pred:', dt_clf_pred),
    ('rf_clf_pred:', rf_clf_pred),
    ('knn_clf_pred:', knn_clf_pred),
    ('sv_clf_pred:', sv_clf_pred),
    ('nb_clf_pred:', nb_clf_pred),
):
    print(label, accuracy_score(y_test, preds))

logreg: 0.89
dt_clf_pred: 0.9
rf_clf_pred: 0.93
knn_clf_pred: 0.93
sv_clf_pred: 0.93
nb_clf_pred: 0.9


In [10]:
# Combine the six base models into a single voting ensemble.
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs; the names identify each member inside the ensemble.
base_estimators = [
    ('logreg', l_rg),
    ('dt', dt_clf),
    ('rf', rf_clf),
    ('knn', knn_clf),
    ('svm', sv_clf),
    ('nb', nb_clf),
]

# No `voting` argument is given, so the library default is used.
vt = VotingClassifier(estimators=base_estimators)

In [11]:
# Train the voting ensemble on the training split.
vt.fit(X=x_train, y=y_train)

In [12]:
# Ensemble predictions for the test rows.
y_pred = vt.predict(X=x_test)

In [13]:
# Test-set accuracy of the voting ensemble (shown as the cell's output).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.94

In [14]:
# Per-class precision / recall / F1 for the ensemble's test predictions.
from sklearn.metrics import classification_report

report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96        68
           1       0.91      0.91      0.91        32

    accuracy                           0.94       100
   macro avg       0.93      0.93      0.93       100
weighted avg       0.94      0.94      0.94       100

