In [1]:
import pandas as pd
import numpy as np
import math

## Gaussian Naive Bayes


In [2]:
playing_golf = pd.read_csv("../data/playing_golf.csv", sep='\t')
playing_golf

Unnamed: 0,index,Outlook,Temperature,Humidity,Wind,Play
0,1,Sunny,85,85,False,No
1,2,Sunny,80,90,True,No
2,3,Overcast,83,78,False,Yes
3,4,Rain,70,96,False,Yes
4,5,Rain,68,80,False,Yes
5,6,Rain,65,70,True,No
6,7,Overcast,64,65,True,Yes
7,8,Sunny,72,95,False,No
8,9,Sunny,69,70,False,Yes
9,10,Rain,75,80,False,Yes


In [3]:
playing_golf.dtypes

index           int64
Outlook        object
Temperature     int64
Humidity        int64
Wind             bool
Play           object
dtype: object

In [4]:
X = playing_golf[['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = playing_golf.Play

prior_dict = {}
likelyhood_dict = {}

# For each Class in y
for c in y.unique():
    # Calculate the pior and store in the prior_dict dictionary
    prior_dict.update({c:{"prior":np.mean(y==c), "count":np.count_nonzero(y==c)}})

    # For each feature into X
    for col_name, feature_series in X.iloc[np.where(y == c)].iteritems():
        
        if(str(feature_series.dtype) in ['int64','double']):
            if col_name not in likelyhood_dict.keys():
                likelyhood_dict.update({col_name:{c:{"mean":np.mean(feature_series), "sd":np.std(feature_series)}}})
            else:
                likelyhood_dict[col_name].update({c:{"mean":np.mean(feature_series), "sd":np.std(feature_series)}})

print(likelyhood_dict)

{'Temperature': {'No': {'mean': 75.5, 'sd': 7.632168761236874}, 'Yes': {'mean': 71.5, 'sd': 6.075908711186061}}, 'Humidity': {'No': {'mean': 85.0, 'sd': 9.354143466934854}, 'Yes': {'mean': 78.16666666666667, 'sd': 9.702519718551924}}}


In [5]:

def calculate_gaussian_prob(x, colname, c):
    mean = likelyhood_dict[colname][c]['mean']
    sd = likelyhood_dict[colname][c]['mean']
    low_number = 1e-6 # prevent division by zero
    
    # Gaussian PDF to calculate the probability of a relative likelihood 
    # that the value of the random variable x would equal that sample (mean and sd)
    coeff = 1.0 / math.sqrt(2.0 * math.pi * sd + low_number)
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * sd + low_number)))
    return coeff * exponent


predictions = []
def func(x):
    
    posterior_dict = []
    for c in y.unique():
        # Initialize posterior as prior
        posterior = prior_dict[c]['prior']
        for col_name in x.index:
            # Naive assumption (independence):
            # P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
            # Posterior is product of prior and likelihoods (ignoring scaling factor)
            #print(type(x[col_name]))
            if(type(x[col_name]) in [int, float, np.int64, np.float]):
                posterior *= calculate_gaussian_prob(x[col_name], col_name, c)
        posterior_dict.append((c,posterior))
        
    return posterior_dict

# For row in dataset
results = [ func(row) for index, row in X.iterrows()]
results

[[('No', 0.000437146634151998), ('Yes', 0.0002649064730610991)],
 [('No', 0.0005999138246254644), ('Yes', 0.00031469146232782096)],
 [('No', 0.00041042735867443124), ('Yes', 0.0005065052251517874)],
 [('No', 0.00031921184558675916), ('Yes', 0.00016443418796288793)],
 [('No', 0.0004726595797473193), ('Yes', 0.0011475373743421264)],
 [('No', 0.00010193026997853047), ('Yes', 0.0006204630382086237)],
 [('No', 3.1474630170557416e-05), ('Yes', 0.00028436572403873773)],
 [('No', 0.00040690904482652374), ('Yes', 0.00020814705608512602)],
 [('No', 0.0001599114260384843), ('Yes', 0.0007980843432051244)],
 [('No', 0.0006848750066205949), ('Yes', 0.0011475373743421264)]]