In [1]:
# import all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# load the dataset
data = pd.read_csv('Social_Network_Ads.csv')

In [3]:
# print the data
print(data)

      User ID  Gender  Age  EstimatedSalary  Purchased
0    15624510    Male   19            19000          0
1    15810944    Male   35            20000          0
2    15668575  Female   26            43000          0
3    15603246  Female   27            57000          0
4    15804002    Male   19            76000          0
..        ...     ...  ...              ...        ...
395  15691863  Female   46            41000          1
396  15706071    Male   51            23000          1
397  15654296  Female   50            20000          1
398  15755018    Male   36            33000          0
399  15594041  Female   49            36000          1

[400 rows x 5 columns]


In [4]:
# load the first five rows of the dataset
print(data.head())

    User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0


In [5]:
# load the last five rows of the dataset
print(data.tail())

      User ID  Gender  Age  EstimatedSalary  Purchased
395  15691863  Female   46            41000          1
396  15706071    Male   51            23000          1
397  15654296  Female   50            20000          1
398  15755018    Male   36            33000          0
399  15594041  Female   49            36000          1


In [6]:
# shape of the dataset
print(data.shape)

(400, 5)


In [7]:
# To check if there are any NULL Values in the dataset
print(data.isnull().sum())

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64


# So from the above observation, it is clear that there are no NULL Values in the dataset

In [8]:
# Divide the dataset into independent and dependent variables
X = data.drop(columns = ['User ID','Gender','Purchased'])
print(X)

     Age  EstimatedSalary
0     19            19000
1     35            20000
2     26            43000
3     27            57000
4     19            76000
..   ...              ...
395   46            41000
396   51            23000
397   50            20000
398   36            33000
399   49            36000

[400 rows x 2 columns]


In [9]:
Y = data['Purchased']

In [10]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64


In [11]:
# Divide the independent and dependent variables into training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)

In [14]:
# Perform the Standard Scaling to make all the input values in one range
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# standardize the input data
sc.fit_transform(X_train, X_test)

array([[-1.1631724 , -1.5849703 ],
       [ 2.17018137,  0.93098672],
       [ 0.0133054 ,  1.22017719],
       [ 0.20938504,  1.07558195],
       [ 0.40546467, -0.48604654],
       [-0.28081405, -0.31253226],
       [ 0.99370357, -0.8330751 ],
       [ 0.99370357,  1.8563962 ],
       [ 0.0133054 ,  1.24909623],
       [-0.86905295,  2.26126285],
       [-1.1631724 , -1.5849703 ],
       [ 2.17018137, -0.80415605],
       [-1.35925203, -1.46929411],
       [ 0.40546467,  2.2901819 ],
       [ 0.79762394,  0.75747245],
       [-0.96709276, -0.31253226],
       [ 0.11134522,  0.75747245],
       [-0.96709276,  0.55503912],
       [ 0.30742485,  0.06341534],
       [ 0.69958412, -1.26686079],
       [-0.47689368, -0.0233418 ],
       [-1.7514113 ,  0.3526058 ],
       [-0.67297331,  0.12125343],
       [ 0.40546467,  0.29476771],
       [-0.28081405,  0.06341534],
       [-0.47689368,  2.2901819 ],
       [ 0.20938504,  0.03449629],
       [ 1.28782302,  2.20342476],
       [ 0.79762394,

In [15]:
# use the training data to train the SVM Model
from sklearn.svm import SVC

cls = SVC(kernel = 'linear', random_state = 0)

In [16]:
# train the data
cls.fit(X_train, Y_train)

SVC(kernel='linear', random_state=0)

In [17]:
# predict the output for the test data
Y_pred_test = cls.predict(X_test)
print(Y_pred_test)

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 1 0 1 1 1 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0
 0 0 0 1 1 1 0 1 0]


In [19]:
# confusion matrix
from sklearn.metrics import confusion_matrix
c_m = confusion_matrix(Y_test, Y_pred_test)
print(c_m)

[[74  5]
 [13 28]]


In [21]:
# confusion matrix
from sklearn.metrics import accuracy_score
c_m = accuracy_score(Y_test, Y_pred_test)
print(c_m)

0.85


In [22]:
# classification report 
from sklearn.metrics import classification_report
cr = classification_report(Y_test, Y_pred_test)
print(cr)

              precision    recall  f1-score   support

           0       0.85      0.94      0.89        79
           1       0.85      0.68      0.76        41

    accuracy                           0.85       120
   macro avg       0.85      0.81      0.82       120
weighted avg       0.85      0.85      0.85       120

