# Naive Bayes Classification
Byju N Govindan

In [None]:
# The Naive Bayes classifier is used successfully in applications such as spam filtering, text classification,
# sentiment analysis, and recommender systems. It applies Bayes' theorem to predict the class of an unseen sample.

# The Naive Bayes classifier assumes that, given the class, the effect of each feature is independent of the other features.
# For example, whether a loan applicant is desirable depends on his/her income, previous loan and transaction history, age,
# and location. Even if these features are interdependent, the classifier still treats each of them independently.
# This assumption simplifies computation, which is why the method is called "naive".
# The assumption is known as class-conditional independence.

# Binary class classification in Naive Bayes
(Naive Bayes classification with binary labels)

### Probability of playing sports given the weather

In [1]:
# Toy "play sports" data set, written one observation per row: (weather, temp, play)
observations = [
    ('Sunny',    'Hot',  'No'),
    ('Sunny',    'Hot',  'No'),
    ('Overcast', 'Hot',  'Yes'),
    ('Rainy',    'Mild', 'Yes'),
    ('Rainy',    'Cool', 'Yes'),
    ('Rainy',    'Cool', 'No'),
    ('Overcast', 'Cool', 'Yes'),
    ('Sunny',    'Mild', 'No'),
    ('Sunny',    'Cool', 'Yes'),
    ('Rainy',    'Mild', 'Yes'),
    ('Sunny',    'Mild', 'Yes'),
    ('Overcast', 'Mild', 'Yes'),
    ('Overcast', 'Hot',  'Yes'),
    ('Rainy',    'Mild', 'No'),
]

# Split the rows back into one list per column: the two features ...
weather = [row[0] for row in observations]
temp = [row[1] for row in observations]

# ... and the label to be predicted
play = [row[2] for row in observations]

In [3]:
# LabelEncoder replaces each distinct string with an integer code
from sklearn.preprocessing import LabelEncoder

# A single encoder object; later cells refit this same `le` on other columns
le = LabelEncoder()

# Learn the weather categories first, then translate the column into codes
# (the printed result shows Overcast=0, Rainy=1, Sunny=2)
le.fit(weather)
weather_encoded = le.transform(weather)
print(weather_encoded)

[2 2 0 1 1 1 0 2 2 1 2 0 0 1]


In [4]:
# Encode the temp and play columns with the shared encoder.
# `le` is refitted on every column, so once this cell has run it only
# remembers the mapping of the last column fitted (play).
temp_encoded = le.fit(temp).transform(temp)
label = le.fit(play).transform(play)

print("Temp:", temp_encoded)
print("Play:", label)

Temp: [1 1 1 2 0 0 0 2 0 2 2 2 1 2]
Play: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [10]:
# Pair the two encoded columns row by row: one (weather, temp) tuple per observation.
# The result is a tuple of tuples (not a list); it is passed to model.fit() as-is below.
# See how: https://www.geeksforgeeks.org/python-merge-two-lists-into-list-of-tuples/
features = tuple((w, t) for w, t in zip(weather_encoded, temp_encoded))
print(features)

((2, 1), (2, 1), (0, 1), (1, 2), (1, 0), (1, 0), (0, 0), (2, 2), (2, 0), (1, 2), (2, 2), (0, 2), (0, 1), (1, 2))


In [11]:
# Gaussian Naive Bayes estimator from scikit-learn
from sklearn.naive_bayes import GaussianNB

# Build the classifier and fit it on the encoded features and labels in one step
# NOTE(review): GaussianNB treats the integer category codes as continuous values;
# sklearn's CategoricalNB may suit nominal features better -- confirm before reuse.
model = GaussianNB().fit(features, label)

# Classify one new observation: weather=0 (Overcast), temp=2 (Mild)
predicted = model.predict([[0, 2]])
print("Predicted Value:", predicted)

# In the encoded label, 1 stands for 'Yes' (the players can play) and 0 for 'No'.

Predicted Value: [1]


# Naive Bayes classification with multiple features
Gaussian Naive Bayes on four encoded features (the label is still binary: play / don't play)

In [27]:
# Same "play sports" data set with two extra features, one observation per row:
# (outlook, temp, humidity, wind, play)
records = [
    ('Sunny',    'Hot',  'High',   'Weak',   'No'),
    ('Sunny',    'Hot',  'High',   'Strong', 'No'),
    ('Overcast', 'Hot',  'High',   'Weak',   'Yes'),
    ('Rainy',    'Mild', 'High',   'Weak',   'Yes'),
    ('Rainy',    'Cool', 'Normal', 'Weak',   'Yes'),
    ('Rainy',    'Cool', 'Normal', 'Strong', 'No'),
    ('Overcast', 'Cool', 'Normal', 'Strong', 'Yes'),
    ('Sunny',    'Mild', 'High',   'Weak',   'No'),
    ('Sunny',    'Cool', 'Normal', 'Weak',   'Yes'),
    ('Rainy',    'Mild', 'Normal', 'Weak',   'Yes'),
    ('Sunny',    'Mild', 'Normal', 'Strong', 'Yes'),
    ('Overcast', 'Mild', 'High',   'Strong', 'Yes'),
    ('Overcast', 'Hot',  'Normal', 'Weak',   'Yes'),
    ('Rainy',    'Mild', 'High',   'Strong', 'No'),
]

# Split the rows into one list per column: four features followed by the label
outlook = [rec[0] for rec in records]
temp = [rec[1] for rec in records]
humidity = [rec[2] for rec in records]
wind = [rec[3] for rec in records]
play = [rec[4] for rec in records]

In [28]:
# LabelEncoder replaces each distinct string with an integer code
from sklearn.preprocessing import LabelEncoder

# Fresh encoder for this section; it is refitted on each column in turn
le = LabelEncoder()

# Learn the outlook categories, then translate the column into codes
# (the printed result shows Overcast=0, Rainy=1, Sunny=2)
le.fit(outlook)
outlook_encoded = le.transform(outlook)
print(outlook_encoded)

[2 2 0 1 1 1 0 2 2 1 2 0 0 1]


In [29]:
# Encode the temperature, humidity, wind and play columns with the shared encoder.
# `le` is refitted on every column, so afterwards it only remembers the
# mapping of the last column fitted (play).
temp_encoded = le.fit(temp).transform(temp)
humidity_encoded = le.fit(humidity).transform(humidity)
wind_encoded = le.fit(wind).transform(wind)
label = le.fit(play).transform(play)

# Show every encoded column so the string-to-code mapping can be read off
print("Temperature:", temp_encoded)
print("Humidity:", humidity_encoded)
print("Wind:", wind_encoded)
print("Play:", label)

Temperature: [1 1 1 2 0 0 0 2 0 2 2 2 1 2]
Humidity: [0 0 0 0 1 1 1 0 1 1 1 0 1 0]
Wind: [1 0 1 1 1 0 0 1 1 1 0 0 1 0]
Play: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [30]:
# Combine all four encoded features row by row:
# one (outlook, temp, humidity, wind) tuple per observation, collected in a tuple.
# See how: https://www.geeksforgeeks.org/python-merge-two-lists-into-list-of-tuples/
features = tuple(
    (o, t, h, w)
    for o, t, h, w in zip(outlook_encoded, temp_encoded, humidity_encoded, wind_encoded)
)
print(features)

((2, 1, 0, 1), (2, 1, 0, 0), (0, 1, 0, 1), (1, 2, 0, 1), (1, 0, 1, 1), (1, 0, 1, 0), (0, 0, 1, 0), (2, 2, 0, 1), (2, 0, 1, 1), (1, 2, 1, 1), (2, 2, 1, 0), (0, 2, 0, 0), (0, 1, 1, 1), (1, 2, 0, 0))


In [31]:
# Gaussian Naive Bayes estimator from scikit-learn
from sklearn.naive_bayes import GaussianNB

# Build the classifier and fit it on the four encoded features in one step
# NOTE(review): GaussianNB treats the integer category codes as continuous values;
# sklearn's CategoricalNB may suit nominal features better -- confirm before reuse.
model = GaussianNB().fit(features, label)

# First query -- outlook 0: Overcast, temp 2: Mild, humidity 0: High, wind 1: Weak
predicted = model.predict([[0, 2, 0, 1]])
print("Predicted Value:", predicted)

# Second query -- outlook 2: Sunny, temp 2: Mild, humidity 0: High, wind 0: Strong
predicted2 = model.predict([[2, 2, 0, 0]])
print("Predicted Value:", predicted2)

# In the encoded label, 1 means the players can 'play' and 0 means they 'cannot play'.

Predicted Value: [1]
Predicted Value: [0]


## Multi-class Gaussian Naive Bayes classification on the wine dataset

In [38]:
# scikit-learn ships the wine data set; pandas is used to view it as a table
from sklearn.datasets import load_wine
import pandas as pd

# Load the bundled data set
wine = load_wine()

# Names of the 13 feature columns
print("Features: ", wine.feature_names)

# Names of the target classes (class_0, class_1, class_2)
print("Labels: ", wine.target_names)

# Build the feature table with named columns in one step and preview the first rows
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_df.head()

Features:  ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Labels:  ['class_0' 'class_1' 'class_2']


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [39]:
# Attach the integer class of every wine as a trailing 'label' column
wine_df = wine_df.assign(label=wine.target)

# Preview the table, now including the label
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [41]:
# Dimensions of the table as (rows, columns); the column count includes the 'label' column
wine_df.shape

(178, 14)

In [49]:
# Show the first five records of the wine table as plain text
print(wine_df.head())
# To print every row instead, use: print(wine_df)   (wine_df[:-1] would drop the last row)

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  label  
0           

In [55]:
# print the wine labels (0: class_0, 1: class_1, 2: class_2)
print (wine.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [56]:
# Helper that shuffles the data and splits it into train and test parts
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column holds the label
X, y = wine_df.iloc[:, :-1], wine_df.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=109,
)

In [58]:
# Estimator and the metrics module used to score it
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Create the Gaussian classifier and fit it on the training split
gnb = GaussianNB().fit(X_train, y_train)

# Predict a class for every row of the held-out test split
y_pred = gnb.predict(X_test)

# Accuracy: the fraction of test rows whose predicted class matches the true class
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9074074074074074


In [60]:
# The individual metric functions (accuracy_score, classification_report, confusion_matrix, ...)
# could be imported from sklearn.metrics directly; this cell reaches them through `metrics`.

# Counts of test rows for each (true class, predicted class) pair
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[20,  1,  0],
       [ 2, 15,  2],
       [ 0,  0, 14]], dtype=int64)

In [61]:
# Per-class precision, recall, F1 score and support for the test split
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93        21
           1       0.94      0.79      0.86        19
           2       0.88      1.00      0.93        14

    accuracy                           0.91        54
   macro avg       0.91      0.91      0.91        54
weighted avg       0.91      0.91      0.91        54

