In [1]:
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Render figures at a high resolution.
plt.rcParams.update({'figure.dpi': 300})

# Read the raw data once; later cells start again from a copy of this frame.
golf_dataset = pd.read_csv("golf_dataset_mini_original_with_testset.csv")
df = golf_dataset.copy(deep=True)

df

Unnamed: 0,Outlook_sunny,Outlook_overcast,Outlook_rain,Temperature,Humidity,Wind,Play
0,1,0,0,85.0,85.0,0,0
1,1,0,0,80.0,90.0,1,0
2,0,1,0,83.0,78.0,0,1
3,0,0,1,70.0,96.0,0,1
4,0,0,1,68.0,80.0,0,1
5,0,0,1,65.0,70.0,1,0
6,0,1,0,64.0,65.0,1,1
7,1,0,0,72.0,95.0,0,0
8,1,0,0,69.0,70.0,0,1
9,0,0,1,75.0,80.0,0,1


In [2]:
df = golf_dataset.copy()

# Define categories for 'Temperature' and 'Humidity'.
# The outer bin edges are open-ended: with finite edges (0 and 100) any reading
# outside (0, 100] -- e.g. a 102-degree day -- falls in no bin, becomes NaN, and
# is then one-hot encoded as all-False, i.e. silently treated as 'Warm'/'Dry'.
# Intervals are right-closed, so exactly 80 is 'Warm' and exactly 75 is 'Dry'.
df['Temperature'] = pd.cut(df['Temperature'], bins=[-np.inf, 80, np.inf], labels=['Warm', 'Hot'])
df['Humidity'] = pd.cut(df['Humidity'], bins=[-np.inf, 75, np.inf], labels=['Dry', 'Humid'])

# One-hot encode the categorized columns; drop_first keeps a single indicator
# per feature (Temperature_Hot, Humidity_Humid).
one_hot_columns = pd.get_dummies(df[['Temperature', 'Humidity']], drop_first=True)

# Swap the categorical columns for their indicators (indicators placed first),
# without mutating the frame in place.
df = pd.concat([one_hot_columns, df.drop(columns=['Temperature', 'Humidity'])], axis=1)

df

Unnamed: 0,Temperature_Hot,Humidity_Humid,Outlook_sunny,Outlook_overcast,Outlook_rain,Wind,Play
0,True,True,1,0,0,0,0
1,False,True,1,0,0,1,0
2,True,True,0,1,0,0,1
3,False,True,0,0,1,0,1
4,False,True,0,0,1,0,1
5,False,False,0,0,1,1,0
6,False,False,0,1,0,1,1
7,False,True,1,0,0,0,0
8,False,False,1,0,0,0,1
9,False,True,0,0,1,0,1


In [3]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
y = df['Play']
X = df.drop(columns='Play')

# Without shuffling, the first half of the rows (in file order) becomes the
# training set and the remaining half is held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, shuffle=False
)

In [4]:
from sklearn.naive_bayes import BernoulliNB
# fit() hands back the estimator itself, so construction and training are chained.
nb_clf = BernoulliNB().fit(X_train, y_train)

def sort_attr_label(attr, lbl):
    """Place two Series side by side and order the rows by their values.

    Parameters
    ----------
    attr : pandas.Series
        Named Series; becomes the first column and the primary sort key.
    lbl : pandas.Series
        Named Series; becomes the second column and the tie-breaking sort key.

    Returns
    -------
    pandas.DataFrame
        Two-column frame sorted ascending by ``attr`` and then ``lbl``. The
        original row labels are kept as the index, which is named ``'ID'``.
    """
    paired = pd.concat([attr, lbl], axis=1)
    ordered = paired.sort_values([attr.name, lbl.name])
    # Name the index directly. A reset_index/rename('index')/set_index round
    # trip only works when the incoming index is unnamed and raises KeyError
    # as soon as the index already carries a name.
    return ordered.rename_axis(index='ID')

# The target is passed first, so rows are ordered by Play and then by Outlook_sunny.
play_vs_sunny = sort_attr_label(y_train, X_train['Outlook_sunny'])
print(play_vs_sunny)

    Play  Outlook_sunny
ID                     
5      0              0
13     0              0
0      0              1
1      0              1
7      0              1
2      1              0
3      1              0
4      1              0
6      1              0
9      1              0
11     1              0
12     1              0
8      1              1
10     1              1


In [5]:
from fractions import Fraction

def calc_feature_prob(attr, lbl):
    """Tabulate the relative frequency of each attribute value within each class.

    Parameters
    ----------
    attr : pandas.Series
        Attribute values; its distinct values form the rows of the result.
    lbl : pandas.Series
        Class labels; its distinct values form the columns of the result.

    Returns
    -------
    pandas.DataFrame
        One ``Fraction`` per (attribute value, class) cell: the number of rows
        with that combination divided by the number of rows in that class.
    """
    class_sizes = lbl.value_counts()
    table = pd.crosstab(attr, lbl)

    def to_fractions(column):
        # Every cell in a column shares the same denominator: that class's size.
        return [
            Fraction(cell, class_sizes[column.name]).limit_denominator()
            for cell in column
        ]

    return table.apply(to_fractions)

# One conditional-frequency table per feature, separated by a blank line.
for _, feature_values in X_train.items():
    print(calc_feature_prob(feature_values, y_train), end="\n\n")

Play               0    1
Temperature_Hot          
False            4/5  7/9
True             1/5  2/9

Play              0    1
Humidity_Humid          
False           1/5  4/9
True            4/5  5/9

Play             0    1
Outlook_sunny          
0              2/5  7/9
1              3/5  2/9

Play              0    1
Outlook_overcast        
0                 1  5/9
1                 0  4/9

Play            0    1
Outlook_rain          
0             3/5  2/3
1             2/5  1/3

Play    0    1
Wind          
0     2/5  2/3
1     3/5  1/3



In [6]:
# Predict Play for the held-out rows using the classifier fitted on the training split.
y_pred = nb_clf.predict(X_test)
print(y_pred)

[0 1 1 0 0 0 1 1 0 1 1 1 1 1]


In [7]:
df = golf_dataset.copy()

# This cell re-imports the scikit-learn names it uses; train_test_split is
# included so the cell does not depend on an earlier cell having imported it.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

# Convert into Binary
# Open-ended outer bin edges: with finite edges (0 and 100) a reading outside
# (0, 100] becomes NaN and is then one-hot encoded as all-False, i.e. silently
# treated as 'Warm'/'Dry'. Intervals are right-closed (80 -> 'Warm', 75 -> 'Dry').
df['Temperature'] = pd.cut(df['Temperature'], bins=[-np.inf, 80, np.inf], labels=['Warm', 'Hot'])
df['Humidity'] = pd.cut(df['Humidity'], bins=[-np.inf, 75, np.inf], labels=['Dry', 'Humid'])
one_hot_columns = pd.get_dummies(df[['Temperature', 'Humidity']], drop_first=True)
# Replace the categorical columns with their indicators without mutating in place.
df = pd.concat([one_hot_columns, df.drop(columns=['Temperature', 'Humidity'])], axis=1)

# Split train & test data
# shuffle=False keeps file order: the first half trains, the second half tests.
X, y = df.drop('Play', axis=1), df['Play']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, shuffle=False)

# Training Phase
nb_clf = BernoulliNB()
nb_clf.fit(X_train, y_train)

# Classification Phase
y_pred = nb_clf.predict(X_test)
print(y_pred)

# Evaluation Phase
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[0 1 1 0 0 0 1 1 0 1 1 1 1 1]
Accuracy: 0.8571428571428571
