In [1]:
# import all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# load the dataset
data = pd.read_csv('Social_Network_Ads.csv')

In [3]:
#print the data
print(data)

      User ID  Gender  Age  EstimatedSalary  Purchased
0    15624510    Male   19            19000          0
1    15810944    Male   35            20000          0
2    15668575  Female   26            43000          0
3    15603246  Female   27            57000          0
4    15804002    Male   19            76000          0
..        ...     ...  ...              ...        ...
395  15691863  Female   46            41000          1
396  15706071    Male   51            23000          1
397  15654296  Female   50            20000          1
398  15755018    Male   36            33000          0
399  15594041  Female   49            36000          1

[400 rows x 5 columns]


In [4]:
# print the first five rows of the dataset
print(data.head())

    User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0


In [5]:
# print the last five rows of the dataset
print(data.tail())

      User ID  Gender  Age  EstimatedSalary  Purchased
395  15691863  Female   46            41000          1
396  15706071    Male   51            23000          1
397  15654296  Female   50            20000          1
398  15755018    Male   36            33000          0
399  15594041  Female   49            36000          1


In [6]:
# shape of the dataset
print(data.shape)

(400, 5)


In [7]:
# To check if there are any null values in the dataset
print(data.isnull().sum())

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64


## So from the above observation, it is clear that there are no NULL values in the dataset

In [9]:
# Divide the dataset into independent and dependent variables
X = data.drop(columns = ['User ID','Gender', 'Purchased'])
print(X)

     Age  EstimatedSalary
0     19            19000
1     35            20000
2     26            43000
3     27            57000
4     19            76000
..   ...              ...
395   46            41000
396   51            23000
397   50            20000
398   36            33000
399   49            36000

[400 rows x 2 columns]


In [10]:
Y = data['Purchased']
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64


In [12]:
# Divide the independent and dependent variables into training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 42)

In [14]:
# As there is a huge gap rande in the values of Age and EstimatedSalary, they should be brought in one range, so for we need standard Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# standardize the input of Age and Estimated Salary in one range using z-score
sc.fit_transform(X_train, X_test)

array([[-0.84252154,  0.1301563 ],
       [ 0.04175763,  0.2777019 ],
       [ 0.72953032, -1.31579061],
       [ 1.61380949,  1.10395728],
       [ 0.82778356, -1.40431797],
       [-1.43204099, -1.25677236],
       [-0.05649561,  0.1301563 ],
       [ 0.43477059, -0.16493491],
       [-0.2530021 ,  0.01211982],
       [ 1.31904976,  2.22530386],
       [ 0.14001087,  0.74984783],
       [-1.33378775,  0.54328399],
       [ 2.00682245,  0.72033871],
       [-1.23553451, -1.43382709],
       [ 0.33651735, -0.34198963],
       [-0.94077478,  0.54328399],
       [ 0.43477059,  0.2777019 ],
       [ 0.43477059,  1.10395728],
       [ 0.82778356,  0.74984783],
       [ 0.9260368 ,  1.25150288],
       [-0.44950858, -1.25677236],
       [-1.82505395, -1.34529973],
       [ 1.12254328,  0.54328399],
       [-0.64601506, -1.64039093],
       [-0.7442683 ,  0.24819278],
       [ 1.02429004,  2.07775825],
       [-0.54776182,  1.36953936],
       [-0.05649561,  0.01211982],
       [-1.9233072 ,

In [16]:
# import the KNN libraries to perform KNN implementation
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean', p = 2)

# train the model for KNN using the training data
KNN.fit(X_train, Y_train)

KNeighborsClassifier(metric='euclidean')

In [18]:
# predict the output for the test data
Y_pred_test = KNN.predict(X_test)
print(Y_pred_test)

[1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0
 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 0
 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 1 0 0
 0 0 0 0 0 0 1 0 0]


In [19]:
# perform the confusion matrix between the actual output and the predicted output
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred_test)
print(cm)

[[69  4]
 [18 29]]


In [20]:
# perform the accuracy score between the actual output and the predicted output
from sklearn.metrics import accuracy_score
ac = accuracy_score(Y_test, Y_pred_test)
print("Accuracy of the model is:", ac)

Accuracy of the model is: 0.8166666666666667


In [21]:
# perform the classification report between the actual output and the predicted output
from sklearn.metrics import classification_report
cr = classification_report(Y_test, Y_pred_test)
print(cr)

              precision    recall  f1-score   support

           0       0.79      0.95      0.86        73
           1       0.88      0.62      0.73        47

    accuracy                           0.82       120
   macro avg       0.84      0.78      0.79       120
weighted avg       0.83      0.82      0.81       120

