In [200]:
import numpy as np
import pandas as pd

Bayes' theorem states that:

$$
P(Y|X) = \frac{P(Y \cap X)}{P(X)} = \frac{P(Y) * P(X|Y)}{P(X)}$$

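Therefore, given enough observations of X to estimate $P(X|Y)$ and $P(Y)$, we can infer the most probable Y using Bayes' theorem. Naive Bayes additionally assumes that the features are conditionally independent given Y, so $P(X|Y)$ factors into a product of per-feature likelihoods.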

In [3]:
# Load the categorical golf data set from the class data folder.
golf = pd.read_excel("../Class_Data/golf_categories.xlsx")

In [48]:
# Preview the first rows. The 'Unnamed: 0' column in the output is an index
# column that was stored in the spreadsheet, not a feature.
golf.head()

Unnamed: 0,Temperature,Humidity,Outlook,Wind,Play
0,High,Medium,Sunny,False,No
1,High,High,Sunny,True,No
2,Low,Low,Rain,True,No
3,Medium,High,Sunny,False,No
4,Low,Medium,Rain,True,No


In [38]:
# Class totals: number of recorded days for each outcome of 'Play'.
play_yes = golf.loc[golf['Play'] == "Yes"].shape[0]
play_no = golf.loc[golf['Play'] == "No"].shape[0]

In [172]:
# Build one likelihood table per feature: rows are the feature's values, columns
# are the classes ('No', 'Yes'), and each cell holds P(feature = value | Play = class).
features_dfs = {}
features = ['Temperature', 'Humidity', 'Outlook', 'Wind']
laplace = True  # when True, unseen (value, class) pairs get a pseudo-count of one

for feature in features:
    # size() counts rows per (value, class) pair directly, so the result does not
    # depend on which unrelated column happens to sit third in the frame or on
    # missing values in it. Pairs that never occur come out as NaN after unstack.
    counts = golf.groupby([feature, 'Play']).size().unstack('Play')

    # Divide each class column by that class's total number of rows.
    likelihoods = counts.copy()
    likelihoods['Yes'] = counts['Yes'] / play_yes
    likelihoods['No'] = counts['No'] / play_no

    if laplace:
        # Treat an unseen pair as if it had been observed once, so a single
        # missing combination cannot turn a product of likelihoods into NaN.
        # NOTE: this only patches the gaps; it is not full add-one smoothing,
        # which would also adjust every observed count and the denominators.
        likelihoods = likelihoods.fillna({'Yes': 1 / play_yes, 'No': 1 / play_no})

    # Expose the feature values as an ordinary column so rows can be looked up by value.
    features_dfs[feature] = likelihoods.reset_index()

In [175]:
# Show every per-feature conditional probability table used by Bayes' theorem.
for name in features_dfs:
    print(name)
    print(features_dfs[name], "\n")

Temperature
Play Temperature   No       Yes
0           High  0.4  0.222222
1            Low  0.4  0.444444
2         Medium  0.2  0.333333 

Humidity
Play Humidity   No       Yes
0        High  0.4  0.222222
1         Low  0.2  0.444444
2      Medium  0.4  0.333333 

Outlook
Play   Outlook   No       Yes
0     Overcast  0.2  0.444444
1         Rain  0.4  0.333333
2        Sunny  0.6  0.222222 

Wind
Play   Wind   No       Yes
0     False  0.4  0.666667
1      True  0.6  0.333333 



In [176]:
# The single day to classify: one observed value per feature.
record = dict(
    Temperature='High',
    Humidity='Low',
    Outlook='Overcast',
    Wind=False,
)

In [180]:
# Start from the class priors P(Yes) and P(No).
total_days = play_yes + play_no
yes = play_yes / total_days
no = play_no / total_days

# Multiply in the likelihood of each observed feature value under each class.
for name, table in features_dfs.items():
    matches = table[name] == record[name]
    yes *= table.loc[matches, 'Yes'].iloc[0]
    no *= table.loc[matches, 'No'].iloc[0]

# Normalise the two unnormalised scores so they sum to one.
evidence = yes + no
p_yes = yes / evidence
p_no = no / evidence

print('Play' if p_yes >= 0.5 else "Don't play")
p_yes, p_no

Play


(np.float64(0.8916629514043691), np.float64(0.1083370485956309))

For numerical attributes, the likelihood $P(x|Y)$ is instead estimated from a normal (Gaussian) distribution fitted to the attribute's values within each class.

## Sci-Kit Learn

The simple way to use categorical variables with `GaussianNB` is to one-hot encode them with `pd.get_dummies`.
<br>You need to drop the original categorical columns and then join the dummy columns back on,
<br>either with `np.concatenate` (arrays) or `pd.concat` (DataFrames).

In [331]:
# Gaussian Naive Bayes on the numerical golf data, with Outlook one-hot encoded.
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Fixed seed so the train/test split, and every metric derived from it, is the
# same on each run. Without it the saved outputs cannot be reproduced.
RANDOM_STATE = 42

golf_numerical = pd.read_csv('../Class_Data/golf.csv')

# Nominal to binominal (yes = 1, no = 0)
golf_numerical['Play'] = golf_numerical['Play'].map(
    {'yes': 1, 'no': 0}
)

# Create dummy variables for the categorical Outlook column
golf_dummies = pd.get_dummies(golf_numerical[['Outlook']])

# Swap the original Outlook column for its dummy columns
golf_concat = pd.concat([golf_numerical, golf_dummies], axis=1).drop(columns=['Outlook'])

X = golf_concat.drop('Play', axis=1).values
y = golf_concat['Play'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)

gnb = GaussianNB()

gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

print("Classification report")
print(classification_report(y_test, y_pred))

Classification report
              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       1.00      0.60      0.75         5

    accuracy                           0.67         6
   macro avg       0.67      0.80      0.62         6
weighted avg       0.89      0.67      0.71         6



In [301]:
# Accuracy of the fitted model on the held-out split.
# NOTE(review): the saved output (0.6) cannot come from the 6-row test split
# reported in the classification report, so it is from a different run;
# re-execute both cells on the same split.
gnb.score(X_test, y_test)

0.6

The next way to do this is with a pipeline and an encoder. This way, the encoder is fitted on the same training split as the model.

A Pipeline lets multiple processing steps run in a single call to `fit`. Because every preprocessing step is fitted only on the training data passed to `fit`, this prevents data leakage between the preprocessing parameters and the model fit.

`ColumnTransformer` allows us to apply separate transformations to different columns inside a pipeline. For this naive Bayes model, we will standardize the numerical data and one-hot encode the categorical data.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
from sklearn.model_selection import train_test_split

# Reload the numerical golf data and encode the target as 1 (yes) / 0 (no).
golf = pd.read_csv('../Class_Data/golf.csv').assign(
    Play=lambda frame: frame['Play'].map({'yes': 1, 'no': 0})
)

golf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Outlook      14 non-null     object
 1   Temperature  14 non-null     int64 
 2   Humidity     14 non-null     int64 
 3   Windy        14 non-null     bool  
 4   Play         14 non-null     int64 
dtypes: bool(1), int64(3), object(1)
memory usage: 594.0+ bytes


In [None]:
# Column groups, each handled by its own transformer.
numerical = ['Temperature', 'Humidity']
categorical = ['Outlook']
passthrough = ['Windy']  # boolean column, used as-is

# Scale the numerical columns and one-hot encode the categorical ones.
# Columns not listed in any group are dropped (the ColumnTransformer default),
# so the target can never leak into the features through this step.
preprocessor = ColumnTransformer(
    [
        ('scale', StandardScaler(), numerical),
        # Ignore categories unseen during fit (e.g. absent from a small CV fold).
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('keep', 'passthrough', passthrough),
    ],
    # Always return a dense array: GaussianNB does not accept sparse input.
    sparse_threshold=0,
)

# Step names are kept unchanged so any '<step>__<param>' references stay valid.
pipe = Pipeline([
    ('encoder', preprocessor),
    ('guassiannb', GaussianNB()),
])