In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
from sklearn import datasets 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 

In [2]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# Show which fields the returned dict-like object exposes.
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [3]:
# Feature matrix and the matching class labels taken from the iris dataset.
X, y = iris.data, iris.target

In [4]:
# Names of the four feature columns.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
# Names of the three target classes (species).
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [6]:
# Preview the first five rows; no column names are supplied, so columns show as 0-3.
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# (rows, columns) of the feature matrix.
pd.DataFrame(X).shape

(150, 4)

In [8]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [9]:
# Count missing values in each feature column.
pd.DataFrame(X).isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

In [10]:
# Split X, y into train (80%) and test (20%) sets.
# random_state pins the shuffle so the split, and every metric below, is reproducible.
# (An unseeded 70/30 split, test_size = 0.3, is an alternative but gives different results on every run.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 123) 
 

In [11]:
# Shapes of the train and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(120, 4)
(30, 4)


In [12]:
# Peek at the first three training rows (row order reflects the shuffled split).
pd.DataFrame(X_train).head(3)

Unnamed: 0,0,1,2,3
0,7.4,2.8,6.1,1.9
1,6.0,2.2,5.0,1.5
2,4.7,3.2,1.6,0.2


### Gaussian Naive Bayes

In [13]:
# Train a Gaussian Naive Bayes classifier on the training split.
from sklearn.naive_bayes import GaussianNB
# Metric helpers used by the evaluation cells that follow.
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
gnb_model = GaussianNB().fit(X_train, y_train) 


In [14]:
# Predict a class label for every held-out test row and display the result.
y_pred_gnb = gnb_model.predict(X_test) 
y_pred_gnb

array([1, 2, 2, 1, 0, 2, 1, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0, 0, 1,
       0, 2, 0, 0, 0, 2, 2, 0])

In [15]:
# Class distribution of the true test labels.
pd.DataFrame(y_test).value_counts()

0    13
2    11
1     6
dtype: int64

In [16]:
# Confusion matrix for Gaussian NB: rows are true classes, columns are predicted classes.
print("\nConfusion Matrix gnb:\n", confusion_matrix(y_test, y_pred_gnb))


Confusion Matrix gnb:
 [[13  0  0]
 [ 0  6  0]
 [ 0  1 10]]


In [17]:
# Share of test rows classified correctly by Gaussian NB, shown as a percentage.
gnb_accuracy_pct = accuracy_score(y_test, y_pred_gnb) * 100
print(f"Accuracy gnb: {gnb_accuracy_pct:.2f}%")


Accuracy gnb: 96.67%


In [18]:
# Per-class precision, recall and F1 score for the Gaussian NB predictions.
print("\nClassification Report gnb:\n",classification_report(y_test, y_pred_gnb))



Classification Report gnb:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.86      1.00      0.92         6
           2       1.00      0.91      0.95        11

    accuracy                           0.97        30
   macro avg       0.95      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30



In [19]:
# Count the test rows whose predicted label differs from the true label.
n_test_points = X_test.shape[0]
n_mislabeled = (y_test != y_pred_gnb).sum()
print("GNB Number of mislabeled points out of a total %d points : %d" % (n_test_points, n_mislabeled))

GNB Number of mislabeled points out of a total 30 points : 1


A nice piece of this Bayesian formalism is that it naturally allows for probabilistic classification, which we can compute using the predict_proba method:

In [20]:
# Hard class predictions again, for comparison with the class probabilities computed next.
y_pred_gnb

array([1, 2, 2, 1, 0, 2, 1, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0, 0, 1,
       0, 2, 0, 0, 0, 2, 2, 0])

In [21]:
# Posterior probability of each class for every test row (one column per class).
gnb_pred_prob = gnb_model.predict_proba(X_test) 
gnb_pred_prob


array([[7.24143720e-126, 9.23061979e-001, 7.69380215e-002],
       [1.81805757e-197, 1.22131524e-005, 9.99987787e-001],
       [2.20317207e-208, 4.05456738e-006, 9.99995945e-001],
       [1.10129676e-075, 9.99831223e-001, 1.68777099e-004],
       [1.00000000e+000, 1.15759399e-018, 4.49784802e-027],
       [8.63288527e-133, 2.65635598e-001, 7.34364402e-001],
       [6.02073762e-093, 9.99265736e-001, 7.34264388e-004],
       [1.00000000e+000, 2.12709854e-017, 2.00578694e-026],
       [1.00000000e+000, 2.33789847e-016, 1.52505822e-025],
       [6.15357796e-086, 9.99862993e-001, 1.37007031e-004],
       [5.79968127e-187, 1.79873881e-006, 9.99998201e-001],
       [1.00000000e+000, 9.37319997e-019, 1.83757860e-026],
       [6.05783386e-071, 9.99924553e-001, 7.54469667e-005],
       [3.35968637e-177, 2.94898137e-003, 9.97051019e-001],
       [2.69283932e-260, 4.56398930e-010, 1.00000000e+000],
       [2.81577531e-222, 6.99119391e-007, 9.99999301e-001],
       [1.00000000e+000, 9.71576715e-017

The columns give the posterior probabilities of the labels. If you are looking for estimates of uncertainty in your classification, Bayesian approaches like this can be useful.

In [22]:
# Fit Multinomial Naive Bayes on the same split for comparison with Gaussian NB.
# NOTE(review): the iris features are continuous measurements rather than counts,
# so the multinomial model is presumably a poor fit here — compare its accuracy with GNB.
from sklearn.naive_bayes import MultinomialNB
mnb_model = MultinomialNB().fit(X_train, y_train) 
y_pred_mnb = mnb_model.predict(X_test) 

In [23]:
# Evaluate the Multinomial NB predictions on the test split.
# The headers say "mnb" so these results cannot be mistaken for the Gaussian NB
# results printed earlier (the previous headers were copy-pasted as "gnb").
print("Accuracy mnb: {:.2f}%".format(accuracy_score(y_test, y_pred_mnb) * 100))
print("\nConfusion Matrix mnb:\n", confusion_matrix(y_test, y_pred_mnb))
print("\nClassification Report mnb:\n",classification_report(y_test, y_pred_mnb))


Accuracy mnb: 76.67%

COnfusion Matrix gnb:
 [[13  0  0]
 [ 0  6  0]
 [ 0  7  4]]

Classification Report gnb:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.46      1.00      0.63         6
           2       1.00      0.36      0.53        11

    accuracy                           0.77        30
   macro avg       0.82      0.79      0.72        30
weighted avg       0.89      0.77      0.76        30



#### Another Example using label encoder

In this example, we use a dummy dataset and work with three of its columns: weather, temperature, and play. The first two are the input features and the last is the label.

In [24]:
# Raw categorical observations for the play-tennis toy dataset (14 rows).
weather = [
    'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
    'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy',
]
outlook = weather  # same list object, exposed under the column's display name
temp = [
    'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
    'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
]
humidity = [
    'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
    'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
]
wind = [
    'Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
    'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong',
]
play = [
    'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
    'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
]

# Assemble the columns into one frame and persist it next to the notebook.
tennis_data = pd.DataFrame({
    'Outlook': outlook,
    'Temperature': temp,
    'Humidity': humidity,
    'Wind': wind,
    'Play Tennis': play,
})
tennis_data.to_csv("tennis_data.csv", index = False)

#### Encoding features
First, you need to convert these string labels into numbers, for example 'Overcast', 'Rainy', 'Sunny' as 0, 1, 2. This is known as label encoding. Scikit-learn provides the LabelEncoder class for encoding labels with a value between 0 and one less than the number of discrete classes.

In [29]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. A single shared encoder is re-fitted by every
# fit_transform call, so afterwards it only remembers the classes of the last
# column and the earlier columns can no longer be decoded with inverse_transform.
weather_le = LabelEncoder()
temp_le = LabelEncoder()
humidity_le = LabelEncoder()
le = LabelEncoder()  # encoder for the target ('play'); original name kept for later cells

# Converting string labels into numbers (codes follow the sorted class names,
# e.g. Overcast=0, Rainy=1, Sunny=2).
weather_encoded = weather_le.fit_transform(weather)
print("Weather:", weather_encoded)

temp_encoded = temp_le.fit_transform(temp)
print("Temp:",temp_encoded)

humidity_encoded = humidity_le.fit_transform(humidity)
print("Humidity:", humidity_encoded)

label = le.fit_transform(play)
print("Play:",label)

Weather: [2 2 0 1 1 1 0 2 2 1 2 0 0 1]
Temp: [1 1 1 2 0 0 0 2 0 2 2 2 1 2]
Humidity: [0 0 0 0 1 1 1 0 1 1 1 0 1 0]
Play: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]


Now combine the two features (weather and temp) into a single variable.

In [26]:
# Pair the encoded weather and temperature codes row by row into a two-column feature frame.
X = pd.DataFrame(zip(weather_encoded, temp_encoded), columns=['weather', 'temp'])
X

Unnamed: 0,weather,temp
0,2,1
1,2,1
2,0,1
3,1,2
4,1,0
5,1,0
6,0,0
7,2,2
8,2,0
9,1,2


#### Generating Model

Generate a model using Naive Bayes classifier


In [27]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
nb_model = GaussianNB()

# Train the model on the encoded features; all 14 rows are used, so no data is held out.
# NOTE(review): the inputs are integer codes for categories, which the Gaussian model
# treats as continuous values — confirm this is intended.
nb_model.fit(X,label)

GaussianNB()

In [28]:
# Predict output for two encoded (weather, temp) pairs:
# [2, 1] is (Sunny, Hot) and [0, 1] is (Overcast, Hot).
# The model was fitted on a DataFrame, so the new samples are passed as a
# DataFrame with the same column names; plain nested lists make scikit-learn
# warn that X does not have valid feature names.
new_samples = pd.DataFrame([[2, 1], [0, 1]], columns=X.columns)
pred = nb_model.predict(new_samples)
print("Predicted Value:", pred)

Predicted Value: [0 1]


### Multinomial Naive Bayes
The Gaussian assumption just described is by no means the only simple assumption that could be used to specify the generative distribution for each label. Another useful example is multinomial naive Bayes, where the features are assumed to be generated from a simple multinomial distribution. The multinomial distribution describes the probability of observing counts among a number of categories, and thus multinomial naive Bayes is most appropriate for features that represent counts or count rates.

The idea is precisely the same as before, except that instead of modeling the data distribution with the best-fit Gaussian, we model the data distribution with a best-fit multinomial distribution.



#### Example: Classifying Text

One place where multinomial naive Bayes is often used is in text classification, where the features are related to word counts or frequencies within the documents to be classified. 

In [None]:
# Load the 20 Newsgroups text dataset using fetch_20newsgroups' default arguments.
from sklearn.datasets import fetch_20newsgroups
review_data = fetch_20newsgroups()

In [None]:
# Inspect which fields the dataset object provides.
review_data.keys()


In [None]:
# Fetch the predefined train and test subsets separately.
train = fetch_20newsgroups(subset = 'train')
test = fetch_20newsgroups(subset = 'test')

In [None]:
# Print one raw training document to see what the text looks like.
print(train.data[5])

In order to use this data for machine learning, we need to be able to convert the content of each string into a vector of numbers. For this we will use the TF-IDF vectorizer, and create a pipeline that attaches it to a multinomial naive Bayes classifier:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Pipeline: TF-IDF turns raw text into numeric features, then Multinomial NB classifies them.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [None]:
# Fit the pipeline on the training documents, then predict a label for each test document.
model.fit(train.data, train.target)
labels = model.predict(test.data)

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(test.target, labels)


print("Accuracy mnb: {:.2f}%".format(accuracy_score(test.target, labels) * 100))
print("\nCOnfusion Matrix gnb:\n", confusion_matrix(test.target, labels))
print("\nClassification Report gnb:\n",classification_report(test.target, labels))



In [None]:
# Heatmap of the transposed confusion matrix, so true labels run along x and
# predicted labels along y; each cell is annotated with its integer count.
ax = sns.heatmap(
    mat.T,
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=train.target_names,
    yticklabels=train.target_names,
)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label');