In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, Binarizer ,MinMaxScaler,OneHotEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,recall_score,precision_score
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

#Preprocessing for Each Naïve Bayes Variant
Since Naïve Bayes classifiers work differently depending on the data type, we must preprocess the dataset accordingly:

A. Gaussian Naïve Bayes

1. Used for continuous numerical features (e.g., Temperature, Humidity).
2. Categorical features (e.g., Outlook, Windy) should be encoded numerically.

B. Multinomial Naïve Bayes

1. Used for count-based or frequency-based features (e.g., text data).
2. Requires categorical features to be encoded numerically.

C. Bernoulli Naïve Bayes

1. Used for binary features (e.g., True/False values).
2. Categorical features should be converted into binary form.

##Gaussian Naïve Bayes

###label encodig

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/golf_df.csv")
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/golf_df.csv")
le = LabelEncoder()
df['Outlook'] = le.fit_transform(df['Outlook'])
df['Temperature'] = le.fit_transform(df['Temperature'])
df['Humidity'] = le.fit_transform(df['Humidity'])
df['Windy'] = le.fit_transform(df['Windy'])
X = df.drop(columns=['Play'])
y = df['Play']
#scaler = MinMaxScaler()
#scaler = StandardScaler()
#X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=10)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)#
print('Gaussian Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_gnb)))
y_pred_train = gnb.predict(X_train)
print('Gaussian Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_gnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_gnb))


Gaussian Naïve Bayes Model accuracy score: 1.0000
Gaussian Naïve Bayes Training-set accuracy score: 0.9091
Confusion matrix

 [[1 0]
 [0 2]]
              precision    recall  f1-score   support

          no       1.00      1.00      1.00         1
         yes       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [None]:
param_grid_gnb = {'var_smoothing': np.logspace(-9, 0, 10)}
grid_gnb = GridSearchCV(GaussianNB(), param_grid_gnb, cv=3, scoring='accuracy')
grid_gnb.fit(X_train, y_train)

In [None]:
print("Best GaussianNB params:", grid_gnb.best_params_)
y_pred_gnb = grid_gnb.best_estimator_.predict(X_test)
print("GaussianNB Accuracy:", accuracy_score(y_test, y_pred_gnb))

Best GaussianNB params: {'var_smoothing': np.float64(0.0001)}
GaussianNB Accuracy: 1.0


###one hot encoding

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/golf_df.csv")
encoder = OneHotEncoder(sparse_output=False)

# Fitting the encoder and transforming the data
one_hot_encoded_array = encoder.fit_transform(df[['Outlook', 'Temperature','Humidity','Windy']])

# The transformed data is an array, so we need to convert it back to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['Outlook', 'Temperature','Humidity','Windy']))

# Concatenating the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1).drop(['Outlook', 'Temperature','Humidity','Windy'], axis=1)
#print(df.columns)
X = df.drop(columns=['Play'])
y = df['Play']
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.6, random_state=10)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
print('Gaussian Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_gnb)))
y_pred_train = gnb.predict(X_train)
print('Gaussian Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_gnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_gnb))

Gaussian Naïve Bayes Model accuracy score: 0.8333
Gaussian Naïve Bayes Training-set accuracy score: 0.8750
Confusion matrix

 [[1 0]
 [1 4]]
              precision    recall  f1-score   support

          no       0.50      1.00      0.67         1
         yes       1.00      0.80      0.89         5

    accuracy                           0.83         6
   macro avg       0.75      0.90      0.78         6
weighted avg       0.92      0.83      0.85         6



In [None]:
param_grid_gnb = {'var_smoothing': np.logspace(-9, 0, 10)}
grid_gnb = GridSearchCV(GaussianNB(), param_grid_gnb, cv=3, scoring='accuracy')
grid_gnb.fit(X_train, y_train)

In [None]:
print("Best GaussianNB params:", grid_gnb.best_params_)
y_pred_gnb = grid_gnb.best_estimator_.predict(X_test)
print("GaussianNB Accuracy:", accuracy_score(y_test, y_pred_gnb))

Best GaussianNB params: {'var_smoothing': np.float64(1e-09)}
GaussianNB Accuracy: 0.8333333333333334


##Multinomial Naïve Bayes

###label encoding

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/golf_df.csv")
le = LabelEncoder()
df['Outlook'] = le.fit_transform(df['Outlook'])
df['Temperature'] = le.fit_transform(df['Temperature'])
df['Humidity'] = le.fit_transform(df['Humidity'])
df['Windy'] = le.fit_transform(df['Windy'])
X = df.drop(columns=['Play'])
y = df['Play']
#scaler = MinMaxScaler()
#scaler = StandardScaler()
#X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=10)
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
print('Multinomial Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_mnb)))
y_pred_train = mnb.predict(X_train)
print('Multinomial Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_mnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_mnb))

Multinomial Naïve Bayes Model accuracy score: 0.6667
Multinomial Naïve Bayes Training-set accuracy score: 0.7273
Confusion matrix

 [[0 1]
 [0 2]]
              precision    recall  f1-score   support

          no       0.00      0.00      0.00         1
         yes       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



In [None]:
param_grid_mnb = {'alpha': np.linspace(0.1, 10, 10), 'fit_prior': [True, False]}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=3, scoring='accuracy')
grid_mnb.fit(X_train, y_train)

In [None]:
print("Best MultinomialNB params:", grid_mnb.best_params_)
y_pred_mnb = grid_mnb.best_estimator_.predict(X_test)
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))

Best MultinomialNB params: {'alpha': np.float64(0.1), 'fit_prior': False}
MultinomialNB Accuracy: 0.8


###one hot encoding

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/golf_df.csv")
encoder = OneHotEncoder(sparse_output=False)

# Fitting the encoder and transforming the data
one_hot_encoded_array = encoder.fit_transform(df[['Outlook', 'Temperature','Humidity','Windy']])

# The transformed data is an array, so we need to convert it back to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['Outlook', 'Temperature','Humidity','Windy']))

# Concatenating the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1).drop(['Outlook', 'Temperature','Humidity','Windy'], axis=1)
#print(df.columns)
X = df.drop(columns=['Play'])
y = df['Play']
#scaler = MinMaxScaler()
#scaler = StandardScaler()
#X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=10)
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
print('Multinomial Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_mnb)))
y_pred_train = mnb.predict(X_train)
print('Multinomial Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_mnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_mnb))

Multinomial Naïve Bayes Model accuracy score: 0.8000
Multinomial Naïve Bayes Training-set accuracy score: 0.8889
Confusion matrix

 [[0 1]
 [0 4]]
              precision    recall  f1-score   support

          no       0.00      0.00      0.00         1
         yes       0.80      1.00      0.89         4

    accuracy                           0.80         5
   macro avg       0.40      0.50      0.44         5
weighted avg       0.64      0.80      0.71         5



In [None]:
param_grid_mnb = {'alpha': np.linspace(0.1, 10, 10), 'fit_prior': [True, False]}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=3, scoring='accuracy')
grid_mnb.fit(X_train, y_train)

In [None]:
print("Best MultinomialNB params:", grid_mnb.best_params_)
y_pred_mnb = grid_mnb.best_estimator_.predict(X_test)
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))

Best MultinomialNB params: {'alpha': np.float64(7.800000000000001), 'fit_prior': True}
MultinomialNB Accuracy: 0.8


##Bernoulli Naïve Bayes

###label encoding


In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/golf_df.csv")
le = LabelEncoder()
df['Outlook'] = le.fit_transform(df['Outlook'])
df['Temperature'] = le.fit_transform(df['Temperature'])
df['Humidity'] = le.fit_transform(df['Humidity'])
df['Windy'] = le.fit_transform(df['Windy'])
X = df.drop(columns=['Play'])
y = df['Play']
#scaler = MinMaxScaler()
#scaler = StandardScaler()
#X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=10)
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)
print('Bernoulli Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_bnb)))
y_pred_train = bnb.predict(X_train)
print('Bernoulli Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_bnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naïve Bayes Model accuracy score: 0.8000
Bernoulli Naïve Bayes Training-set accuracy score: 0.8889
Confusion matrix

 [[0 1]
 [0 4]]
              precision    recall  f1-score   support

          no       0.00      0.00      0.00         1
         yes       0.80      1.00      0.89         4

    accuracy                           0.80         5
   macro avg       0.40      0.50      0.44         5
weighted avg       0.64      0.80      0.71         5



In [None]:
param_grid_bnb = {'alpha': np.linspace(0.1, 10, 10), 'binarize': np.linspace(0.0, 1.0, 10)}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=3, scoring='accuracy')
grid_bnb.fit(X_train, y_train)

In [None]:
print("Best BernoulliNB params:", grid_bnb.best_params_)
y_pred_bnb = grid_bnb.best_estimator_.predict(X_test)
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))

Best BernoulliNB params: {'alpha': np.float64(0.1), 'binarize': np.float64(0.0)}
BernoulliNB Accuracy: 0.8


###One hot encoding

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/golf_df.csv")
encoder = OneHotEncoder(sparse_output=False)

# Fitting the encoder and transforming the data
one_hot_encoded_array = encoder.fit_transform(df[['Outlook', 'Temperature','Humidity','Windy']])

# The transformed data is an array, so we need to convert it back to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['Outlook', 'Temperature','Humidity','Windy']))

# Concatenating the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1).drop(['Outlook', 'Temperature','Humidity','Windy'], axis=1)
#print(df.columns)
X = df.drop(columns=['Play'])
y = df['Play']
#scaler = MinMaxScaler()
#scaler = StandardScaler()
#X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=40)
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)
print('Bernoulli Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_bnb)))
y_pred_train = bnb.predict(X_train)
print('Bernoulli Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_bnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naïve Bayes Model accuracy score: 0.8000
Bernoulli Naïve Bayes Training-set accuracy score: 0.8889
Confusion matrix

 [[2 0]
 [1 2]]
              precision    recall  f1-score   support

          no       0.67      1.00      0.80         2
         yes       1.00      0.67      0.80         3

    accuracy                           0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5



In [None]:
param_grid_bnb = {'alpha': np.linspace(0.1, 10, 10), 'binarize': np.linspace(0.0, 1.0, 10)}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=3, scoring='accuracy')
grid_bnb.fit(X_train, y_train)

In [None]:
print("Best BernoulliNB params:", grid_bnb.best_params_)
y_pred_bnb = grid_bnb.best_estimator_.predict(X_test)
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))

Best BernoulliNB params: {'alpha': np.float64(0.1), 'binarize': np.float64(0.0)}
BernoulliNB Accuracy: 0.8
