# Omar Farooq
# Naive-bayes from scratch

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('golf-dataset.csv')
data.head(100)

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Hot,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


In [9]:
class NaiveBayes:
    
    
    def __init__(self, _data, class_var= data.columns[4]):
        self.target = class_var
        self.data = _data
        self.train()
        
    def train(self):
        self.probs = {}
        for i in list(set(self.data[self.target])):
            self.probs[i] = {}
            for j in list(self.data.columns):
                if j != self.target:
                    self.probs[i][j] = {}
                    vals = list(set(self.data[j]))
                    for k in vals:
                        self.probs[i][j][str(k)] = (len(self.data.loc[(self.data[j]==str(k)) & (self.data[self.target] == i)])+1)/(len(self.data.loc[(self.data[self.target] == i)])+1) 
                        
    def predict(self, outlook, temp, humidity, windy):
        X = {'Outlook':outlook, 'Temp':temp, 'Humidity':humidity, 'Windy':windy}

        results = {}
        classes = list(set(data[self.target]))
        for i in classes:
            results[i] = len(data[data[self.target]==i])/len(data)
            for j in list(X.keys()):
                results[i] = results[i]*self.probs[i][j][X[j]]
        max = self.data[self.target][1]    
        for i in results.keys():
            if (results[i] > results[max]):
                max = i
        return max



#### Q1: 1.	Using all 13 samples, decide whether or not you will play golf if the sample vector is [Sunny, Mild, High, TRUE]. That is, what is the class label for this sample?

In [16]:
X = NaiveBayes(data)
a = X.predict('Sunny', 'Mild', 'High', 'True')
a

'No'

#### 2.	Using first 10 samples, predict the Yes or No label for the samples 11, 12 and 13. Write your results in the form of a table. Comment on the performance of NB classifier. What would be the class label for the sample given in 1. Does the class label change?

In [17]:
Y = NaiveBayes(data[:9])

In [27]:
table = {'Rainy, Mild, Normal, True':['Yes',''],
        'Overcast, Mild, High, True':['Yes', ''],
        'Overcast, Hot, Normal, False': ['Yes', ''],
        'Sunny, Mild, High, True': ['No', '']
        }
df = pd.DataFrame(table, index = ['actual', 'predicted'])
df.T.head()

Unnamed: 0,actual,predicted
"Rainy, Mild, Normal, True",Yes,
"Overcast, Mild, High, True",Yes,
"Overcast, Hot, Normal, False",Yes,
"Sunny, Mild, High, True",No,


In [19]:
for string in list(df.columns):
    [a, b, c, d] = string.split(', ')
    df[a+', '+b+', '+c+', '+d]['predicted'] = Y.predict(a,b,c,d)
    
df.T.head()

Unnamed: 0,actual,predicted
"Rainy, Mild, Normal, True",Yes,Yes
"Overcast, Mild, High, True",Yes,Yes
"Overcast, Hot, Normal, False",Yes,Yes
"Sunny, Mild, High, True",No,Yes


In [20]:
count = 0
for string in list(df.columns):
    if df[string]['predicted'] != df[string]['actual']:
        count = count+1
error = (count/len(df.columns))*100
error

25.0

We see that the Naive Bayes Classifier performed pretty nicely on the dataset with an error of 0%. Label of the sample in 1 is correctly predicted as 'No'.

#### 3.	Using first 11 samples, predict the Yes or No label for the samples 12 and 13. Write your results in the form of a table. Comment on the performance of NB classifier. What would be the class label for the sample given in 1. Does the class label change?

In [25]:
Z = NaiveBayes(data[:10])
table = {'Overcast, Mild, High, True':['Yes', ''],
        'Overcast, Hot, Normal, False': ['Yes', ''],
        'Sunny, Mild, High, True': ['No', '']
        }
df1 = pd.DataFrame(table, index = ['actual', 'predicted'])
df1.T.head()

Unnamed: 0,actual,predicted
"Overcast, Mild, High, True",Yes,
"Overcast, Hot, Normal, False",Yes,
"Sunny, Mild, High, True",No,


In [26]:
for string in list(df1.columns):
    [a, b, c, d] = string.split(', ')
    df1[a+', '+b+', '+c+', '+d]['predicted'] = Z.predict(a,b,c,d)
    
df1.T.head()

Unnamed: 0,actual,predicted
"Overcast, Mild, High, True",Yes,Yes
"Overcast, Hot, Normal, False",Yes,Yes
"Sunny, Mild, High, True",No,Yes


In [24]:
count1 = 0
for string in list(df1.columns):
    if df1[string]['predicted'] != df1[string]['actual']:
        count1 = count1+1
error1 = (count1/len(df1.columns))*100
error1

33.33333333333333

Yes the labels change when the data used for traning and testing changes. We see that the Naive Bayes Classifier performed well with an error of 0% when trained using all samples, 25% error when the test set is comprised of the last 3 samples, and 33% error when the testing dataset is comprised of the last 2 samples.

# The End