# The Naïve Bayes Algorithm

In [26]:
import numpy as np
import pandas as pd
from collections import defaultdict

**Dataset info:**

- Data columns (total 5 columns):
- outlook     14 non-null object
- temp        14 non-null object
- humidity    14 non-null object
- windy       14 non-null bool
- play        14 non-null object

In [27]:
# Read the comma-separated tennis table; row 0 of the file supplies the column names.
f = 'datasets/tennis.csv'
df = pd.read_csv(
    f,
    sep=',',
    header=0,
)
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [32]:
class NaiveBayes():
    """
    A simple implementation of the Naive Bayes algorithm for categorical
    features stored in a pandas DataFrame.

    Parameters
    ------------
    None

    Attributes
    -----------
    df : pandas.DataFrame or None
      Training data handed to ``fit`` (stored by reference, not copied).
    p_ : dictionary
      Probabilities estimated by ``fit``. The key ``'p_c_<class>'`` holds the
      class prior; the key ``'p_c_<class>_<feature>_<value>'`` holds the
      probability of ``<value>`` in column ``<feature>`` given ``<class>``.
    s1_ : int
      Smoothing term added to every count (numerator).
    s2_ : int
      Smoothing term added to every denominator.
    target_ : str
      Name of the target column.

    Notes
    -----
    NOTE(review): textbook additive (Laplace) smoothing uses a denominator
    offset of ``alpha * K`` with ``K`` the number of distinct values of the
    variable being estimated. Here the offset is the constant 2 for
    ``'plus1'`` and the number of columns of ``df`` for ``'laplace'``, so the
    smoothed estimates do not generally sum to 1. ``predict`` renormalises
    across classes. The arithmetic is kept as-is so existing results do not
    change.

    """

    def __init__(self):
        self.df = None
        self.p_ = None
        self.target_ = None
        self.s1_ = 0
        self.s2_ = 0

    @staticmethod
    def _prior_key(c):
        """Key under which the prior of class ``c`` is stored in ``p_``."""
        return 'p_c_{}'.format(c)

    @staticmethod
    def _cond_key(c, f, f_cond):
        """Key under which P(feature ``f`` == ``f_cond`` | class ``c``) is stored."""
        return 'p_c_{}_{}_{}'.format(c, f, f_cond)

    def fit(self, df, target, smoothing=None):
        """ Fit the training data.

        Parameters
        ----------
        df : pandas.DataFrame
          Training table; every column other than ``target`` is treated as a
          categorical feature.
        target : str
          Name of the column holding the class labels.
        smoothing : {None, 'plus1', 'laplace'}, optional
          ``'plus1'`` adds 1 to every count and 2 to every denominator;
          ``'laplace'`` adds 1 to every count and ``df.shape[1]`` to every
          denominator. Any other value leaves the estimates unsmoothed.

        Returns
        -------
        self
        """
        self.df = df
        self.target_ = target
        self.p_ = defaultdict()

        # Reset first: without this, refitting an instance that was previously
        # fitted with smoothing would silently keep the old smoothing terms.
        self.s1_ = 0
        self.s2_ = 0
        if smoothing == 'plus1':
            self.s1_ = 1
            self.s2_ = 2
        elif smoothing == 'laplace':
            self.s1_ = 1
            self.s2_ = df.shape[1]

        n_rows = df.shape[0]
        labels = df[target]
        features = [col for col in df.columns if col != target]

        for c in labels.unique():
            in_class = labels == c
            # int(...) keeps the stored probabilities plain Python floats.
            target_c = int(in_class.sum())
            self.p_[self._prior_key(c)] = (target_c + self.s1_) / (n_rows + self.s2_)
            for f in features:
                column = df[f]
                for f_cond in column.unique():
                    joint = int(((column == f_cond) & in_class).sum())
                    self.p_[self._cond_key(c, f, f_cond)] = (
                        (joint + self.s1_) / (target_c + self.s2_)
                    )

        return self

    def predict(self, condition):
        """ Predicting with a condition.

        Parameters
        ----------
        condition : dict
          Maps feature column names to the observed value of that feature.

        Returns
        -------
        defaultdict
          Maps each class label seen in the training data to its score,
          normalised so that the scores sum to 1.

        Raises
        ------
        KeyError
          If a feature/value pair in ``condition`` was not seen by ``fit``.
        ZeroDivisionError
          If every class ends up with a score of zero.
        """
        pred_ = defaultdict()
        for c in self.df[self.target_].unique():
            p = self.p_[self._prior_key(c)]
            # Use the argument itself; the previous body read a global named
            # `conditions` and raised NameError when no such global existed.
            for f, f_cond in condition.items():
                p *= self.p_[self._cond_key(c, f, f_cond)]
            pred_[c] = p

        pred_sum = sum(pred_.values())
        for c in pred_:
            pred_[c] = pred_[c] / pred_sum

        return pred_

In [21]:
# Train on the loaded table without smoothing, then display the learned probabilities.
nb = NaiveBayes().fit(df, target='play')
nb.p_

defaultdict(None,
            {'p_c_no': 0.35714285714285715,
             'p_c_no_outlook_sunny': 0.6,
             'p_c_no_outlook_overcast': 0.0,
             'p_c_no_outlook_rainy': 0.4,
             'p_c_no_temp_hot': 0.4,
             'p_c_no_temp_mild': 0.4,
             'p_c_no_temp_cool': 0.2,
             'p_c_no_humidity_high': 0.8,
             'p_c_no_humidity_normal': 0.2,
             'p_c_no_windy_False': 0.4,
             'p_c_no_windy_True': 0.6,
             'p_c_yes': 0.6428571428571429,
             'p_c_yes_outlook_sunny': 0.2222222222222222,
             'p_c_yes_outlook_overcast': 0.4444444444444444,
             'p_c_yes_outlook_rainy': 0.3333333333333333,
             'p_c_yes_temp_hot': 0.2222222222222222,
             'p_c_yes_temp_mild': 0.4444444444444444,
             'p_c_yes_temp_cool': 0.3333333333333333,
             'p_c_yes_humidity_high': 0.3333333333333333,
             'p_c_yes_humidity_normal': 0.6666666666666666,
             'p_c_yes_windy_Fal

In [22]:
# Feature values of the day to classify; keys are feature column names.
conditions = dict(outlook='sunny', temp='cool', humidity='high', windy=True)

nb.predict(conditions)

defaultdict(None, {'no': 0.795417348608838, 'yes': 0.20458265139116202})

---

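### Comparing Naïve Bayes predictions with and without smoothing

The same query is scored by three models: one fitted without smoothing, one with `smoothing='plus1'` (adds 1 to every count and 2 to every denominator), and one with `smoothing='laplace'` (adds 1 to every count and the number of DataFrame columns to every denominator).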

In [40]:
# One model per smoothing mode: none, 'plus1' and 'laplace'.
nb1 = NaiveBayes().fit(df, target='play')
nb2 = NaiveBayes().fit(df, target='play', smoothing='plus1')
nb3 = NaiveBayes().fit(df, target='play', smoothing='laplace')

# Print the class scores each model assigns to the same query.
for model in (nb1, nb2, nb3):
    print(model.predict(conditions))

defaultdict(None, {'no': 0.795417348608838, 'yes': 0.20458265139116202})
defaultdict(None, {'no': 0.7530216530370828, 'yes': 0.2469783469629173})
defaultdict(None, {'no': 0.6576280471103808, 'yes': 0.3423719528896192})


In [43]:
# Probabilities learned by the model fitted without smoothing.
nb1.p_

defaultdict(None,
            {'p_c_no': 0.35714285714285715,
             'p_c_no_outlook_sunny': 0.6,
             'p_c_no_outlook_overcast': 0.0,
             'p_c_no_outlook_rainy': 0.4,
             'p_c_no_temp_hot': 0.4,
             'p_c_no_temp_mild': 0.4,
             'p_c_no_temp_cool': 0.2,
             'p_c_no_humidity_high': 0.8,
             'p_c_no_humidity_normal': 0.2,
             'p_c_no_windy_False': 0.4,
             'p_c_no_windy_True': 0.6,
             'p_c_yes': 0.6428571428571429,
             'p_c_yes_outlook_sunny': 0.2222222222222222,
             'p_c_yes_outlook_overcast': 0.4444444444444444,
             'p_c_yes_outlook_rainy': 0.3333333333333333,
             'p_c_yes_temp_hot': 0.2222222222222222,
             'p_c_yes_temp_mild': 0.4444444444444444,
             'p_c_yes_temp_cool': 0.3333333333333333,
             'p_c_yes_humidity_high': 0.3333333333333333,
             'p_c_yes_humidity_normal': 0.6666666666666666,
             'p_c_yes_windy_Fal

In [44]:
# Probabilities learned by the model fitted with smoothing='plus1'.
nb2.p_

defaultdict(None,
            {'p_c_no': 0.375,
             'p_c_no_outlook_sunny': 0.5714285714285714,
             'p_c_no_outlook_overcast': 0.14285714285714285,
             'p_c_no_outlook_rainy': 0.42857142857142855,
             'p_c_no_temp_hot': 0.42857142857142855,
             'p_c_no_temp_mild': 0.42857142857142855,
             'p_c_no_temp_cool': 0.2857142857142857,
             'p_c_no_humidity_high': 0.7142857142857143,
             'p_c_no_humidity_normal': 0.2857142857142857,
             'p_c_no_windy_False': 0.42857142857142855,
             'p_c_no_windy_True': 0.5714285714285714,
             'p_c_yes': 0.625,
             'p_c_yes_outlook_sunny': 0.2727272727272727,
             'p_c_yes_outlook_overcast': 0.45454545454545453,
             'p_c_yes_outlook_rainy': 0.36363636363636365,
             'p_c_yes_temp_hot': 0.2727272727272727,
             'p_c_yes_temp_mild': 0.45454545454545453,
             'p_c_yes_temp_cool': 0.36363636363636365,
             'p_c

In [45]:
# Probabilities learned by the model fitted with smoothing='laplace'.
nb3.p_

defaultdict(None,
            {'p_c_no': 0.3157894736842105,
             'p_c_no_outlook_sunny': 0.4,
             'p_c_no_outlook_overcast': 0.1,
             'p_c_no_outlook_rainy': 0.3,
             'p_c_no_temp_hot': 0.3,
             'p_c_no_temp_mild': 0.3,
             'p_c_no_temp_cool': 0.2,
             'p_c_no_humidity_high': 0.5,
             'p_c_no_humidity_normal': 0.2,
             'p_c_no_windy_False': 0.3,
             'p_c_no_windy_True': 0.4,
             'p_c_yes': 0.5263157894736842,
             'p_c_yes_outlook_sunny': 0.21428571428571427,
             'p_c_yes_outlook_overcast': 0.35714285714285715,
             'p_c_yes_outlook_rainy': 0.2857142857142857,
             'p_c_yes_temp_hot': 0.21428571428571427,
             'p_c_yes_temp_mild': 0.35714285714285715,
             'p_c_yes_temp_cool': 0.2857142857142857,
             'p_c_yes_humidity_high': 0.2857142857142857,
             'p_c_yes_humidity_normal': 0.5,
             'p_c_yes_windy_False': 0.5,
  