# Problem statement: We have to classify each user (User ID) as "Purchased" or "Not Purchased" using a Naive Bayes classifier

In [1]:
import pandas as pd # import pandas lib using it's short form 

In [2]:
# Load the dataset from the CSV file; the first row (header=0) holds the column names.
data = pd.read_csv('User_Data.csv', header=0)

In [3]:
data.head() # preview the first five rows to sanity-check the load

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
data.info() # summary of the frame: number of rows, plus the non-null count and dtype of each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
data.isnull().sum()  # number of missing (null) values in each column

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

This means there are no null values in our dataset.

In [6]:
data.shape  # (number of rows, number of columns)

(400, 5)

In [7]:
data[data.duplicated()]  # select rows that repeat an earlier row; the empty result means there are no duplicated rows

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased


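The Naive Bayes algorithm is used for solving classification problems. We do not check for outliers in this dataset; note, however, that Gaussian Naive Bayes estimates a mean and a variance for each feature within each class, so extreme outliers can still influence the fitted model.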

In [8]:
data.head()  # show the first five rows again before choosing the feature and target columns

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [9]:
# define X and Y
# Columns are selected by name instead of by position, so this cell keeps
# picking the right data even if the column order of the CSV changes.
X = data[['Age', 'EstimatedSalary']]  # features: the two numeric predictors
Y = data['Purchased']  # target: 0/1 purchase flag

In [10]:
X  # display the feature frame (Age, EstimatedSalary)

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000
...,...,...
395,46,41000
396,51,23000
397,50,20000
398,36,33000


In [11]:
Y  # display the target series (Purchased)

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [12]:
# Features and target must have the same number of rows; print both shapes, one per line.
print(X.shape, Y.shape, sep='\n')

(400, 2)
(400,)


# Scaling the data of X 

In [13]:
from sklearn.preprocessing import StandardScaler  # standardization transformer

# Standardize every feature column to zero mean and unit variance (z-scores).
# The result is NOT limited to a fixed interval: values simply express how many
# standard deviations each observation lies from its column mean.
# NOTE(review): the scaler is fitted on all of X, i.e. before the train/test
# split later in the notebook, so the test rows contribute to the learned
# mean/std. Consider fitting on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)  # learn per-column mean/std, then apply them; X is now a NumPy array

In [14]:
X  # display the standardized feature array

array([[-1.78179743, -1.49004624],
       [-0.25358736, -1.46068138],
       [-1.11320552, -0.78528968],
       [-1.01769239, -0.37418169],
       [-1.78179743,  0.18375059],
       [-1.01769239, -0.34481683],
       [-1.01769239,  0.41866944],
       [-0.54012675,  2.35674998],
       [-1.20871865, -1.07893824],
       [-0.25358736, -0.13926283],
       [-1.11320552,  0.30121002],
       [-1.11320552, -0.52100597],
       [-1.6862843 ,  0.47739916],
       [-0.54012675, -1.51941109],
       [-1.87731056,  0.35993973],
       [-0.82666613,  0.30121002],
       [ 0.89257019, -1.3138571 ],
       [ 0.70154394, -1.28449224],
       [ 0.79705706, -1.22576253],
       [ 0.98808332, -1.19639767],
       [ 0.70154394, -1.40195167],
       [ 0.89257019, -0.60910054],
       [ 0.98808332, -0.84401939],
       [ 0.70154394, -1.40195167],
       [ 0.79705706, -1.37258681],
       [ 0.89257019, -1.46068138],
       [ 1.08359645, -1.22576253],
       [ 0.89257019, -1.16703281],
       [-0.82666613,

# Split the dataset into training and testing sets

We use 75% of the data for training and 25% for testing, which means setting the parameter test_size = 0.25.

In [15]:
from sklearn.model_selection import train_test_split  # helper that shuffles and splits the data

# Hold out 25% of the rows for testing. random_state is the seed of the random
# shuffle: fixing it makes the split reproducible. It is not a row or batch count.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=10,
)

In [16]:
# Shapes of the four splits, one per line: X_train, X_test, Y_train, Y_test.
print(
    X_train.shape,
    X_test.shape,
    Y_train.shape,
    Y_test.shape,
    sep='\n',
)

(300, 2)
(100, 2)
(300,)
(100,)


In [17]:
# Create the Gaussian Naive Bayes classifier (this cell only instantiates it;
# the model is fitted to the training data in a later cell)
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

In [18]:
classifier  # display the estimator object

GaussianNB()

In [19]:
classifier.fit(X_train,Y_train)  # train the classifier on the training split

GaussianNB()

In [20]:
# Predict the class label for every row of the held-out test features
Y_pred = classifier.predict(X_test) # predicted Purchased value (0 or 1) for each test row

In [21]:
Y_pred  # display the predicted labels for the test set

array([0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0], dtype=int64)

In [22]:
print(list(zip(Y_test, Y_pred))) # (actual, predicted) pair for each test row; pairs whose two values differ are misclassifications

[(0, 0), (0, 0), (1, 1), (0, 1), (0, 0), (1, 1), (0, 0), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (1, 1), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (1, 1), (1, 1), (0, 0), (0, 0), (1, 1), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (1, 1), (0, 0), (1, 1), (1, 1), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (1, 0), (1, 1), (1, 1), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (1, 1), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (1, 1), (0, 1), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (1, 1), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0)]


In [23]:
classifier.score(X_train, Y_train) # accuracy of the model on the training data (compare with the test accuracy computed below)

0.8933333333333333

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the predictions on the test split: compute the metrics first,
# then print them in a fixed order.
cfm = confusion_matrix(Y_test, Y_pred)  # rows = actual class, columns = predicted class
acc = accuracy_score(Y_test, Y_pred)  # fraction of test rows predicted correctly

print(cfm)

# Per-class precision, recall and f1-score
print('classification report')
print(classification_report(Y_test, Y_pred))

print('Gaussian Naive Bayes model accuracy:', acc)

[[64  5]
 [ 4 27]]
classification report
              precision    recall  f1-score   support

           0       0.94      0.93      0.93        69
           1       0.84      0.87      0.86        31

    accuracy                           0.91       100
   macro avg       0.89      0.90      0.90       100
weighted avg       0.91      0.91      0.91       100

Gaussian Naive Bayes model accuracy: 0.91


64+27 = 91 correct predictions

4+5 = 9 incorrect predictions 