#Â Naive Bayes algorithm

In [91]:
# Gaussian Naive Bayes for Iris Flowers

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

model = GaussianNB()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print('Training Accuracy:',accuracy_score(y_pred_train,y_train))
print('Test Accuracy:',accuracy_score(y_pred_test,y_test))
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred_test).sum()))

Training Accuracy: 0.9733333333333334
Test Accuracy: 0.9466666666666667
Number of mislabeled points out of a total 75 points : 4


In [98]:
# --->>> Your turn <<<---
# apply Gaussian Naive Bayes to the penguins dataset

import pandas as pd
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
    
n_neighbors = 15

# import some data to play with
penguins = pd.read_csv('penguins_size.csv').dropna()
X = penguins[['culmen_length_mm', 'culmen_depth_mm','flipper_length_mm','body_mass_g']].values
le = LabelEncoder()
y = le.fit_transform(penguins['species'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

model = GaussianNB()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print('Training Accuracy:',accuracy_score(y_pred_train,y_train))
print('Test Accuracy:',accuracy_score(y_pred_test,y_test))
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred_test).sum()))



Training Accuracy: 0.9699570815450643
Test Accuracy: 0.9603960396039604
Number of mislabeled points out of a total 101 points : 4


In [88]:
# --->>> Your turn <<<---
# Implement a Naive Bayes calculation from scratch for the following problem,
# then compare your result with the sklearn result
#
# Proposed approach: 
# calculate the probabilities of different words (the features), and
# use the naive bayes formula to derive the probability for classes 0 and 1
#
# Hint: for the sklearn approach you can use MultinomialNB, and CountVectorizer
# 
# Given the three sentences  
sentence1 = "This is the first sentence in English"
sentence2 = "Another sentence also in English"
sentence3 = "This is not a sentence"
# which are assigned the classes 
y = [0,0,1]

# Use Naive Bayes to estimate the class of 
test_sentence = "This not English"

In [89]:
import numpy as np

docs, p = {}, {}
docs[0] = sentence1 + " " + sentence2 
docs[1] = sentence3 
alpha = 1.0

lclasses = np.unique(y)
for c in lclasses:
    docs[c] = docs[c].split()
    p[c] = y.count(c)/len(y)
print("a priori probabilities:", p)

features = test_sentence.split()
for xi in features:
    for c in np.unique(y):
        p[c] *= (docs[c].count(xi) + alpha) / (len(docs[c]) + alpha*len(features))

print("class probabilities", p)
print("normalised class probabilities", p[0]/(p[0]+p[1]), p[1]/(p[0]+p[1]))
p = np.array(list(p.values()))
print("predicted class:",np.argmax(p))


a priori probabilities: {0: 0.6666666666666666, 1: 0.3333333333333333}
class probabilities {0: 0.001185185185185185, 1: 0.0026041666666666665}
normalised class probabilities 0.31276725717776416 0.6872327428222358
predicted class: 1


In [80]:
# Now use sklearn to verify your result
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

X_train = [sentence1, sentence2, sentence3] 
X_test = [test_sentence] 

vectorizer = CountVectorizer(stop_words=None)
X_train = vectorizer.fit_transform(X_train).toarray()
print( vectorizer.get_feature_names_out() )

['also' 'another' 'english' 'first' 'in' 'is' 'not' 'sentence' 'the'
 'this']


In [81]:
# Let's see the encoding of our three sentences
# the columns are the counts of the words above
print(X_train)

[[0 0 1 1 1 1 0 1 1 1]
 [1 1 1 0 1 0 0 1 0 0]
 [0 0 0 0 0 1 1 1 0 1]]


In [82]:
vectorizer.transform(X_test).toarray()

array([[0, 0, 1, 0, 0, 0, 1, 0, 0, 1]])

In [None]:
mnnb = MultinomialNB()
mnnb.fit(X_train, y)

X_test2 = vectorizer.transform(X_test).toarray()
print(mnnb.predict(X_test2))
#print probabilities for each class
print(mnnb.predict_proba(X_test2))
#Comment: CountVectorizer removes single letter words like 'a' by default, so this changes the result

[1]
[[0.43601695 0.56398305]]


In [86]:
from sklearn.naive_bayes import CategoricalNB
cnb = CategoricalNB()
cnb.fit(X_train, y)

X_test2 = vectorizer.transform(X_test).toarray()
print(cnb.predict(X_test2))
#print probabilities for each class
print(cnb.predict_proba(X_test2))

[1]
[[0.1445245 0.8554755]]
