### Imports

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

### Loading Data

In [2]:
# Training split: every model in this notebook is fitted on it.
df_train = pd.read_csv('data/data_train.csv')
# Validation split: used further down as the held-out evaluation set.
df_test = pd.read_csv('data/data_validation.csv')

# Peek at both splits (the notebook renders only the final expression of a cell).
df_train.head()
df_test.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
2,769,1,2.9,1,0,0,9,0.1,182,5,...,248,874,3946,5,2,7,0,0,0,3
3,775,0,1.0,0,3,0,46,0.7,159,2,...,862,1864,568,17,15,11,1,1,1,0
4,595,0,0.9,1,7,1,23,0.1,121,3,...,441,810,3752,10,2,18,1,1,0,3


In [3]:
# Split each frame into its feature columns and the 'price_range' target.
x_train, y_train = df_train.drop(columns='price_range'), df_train['price_range']
x_test, y_test = df_test.drop(columns='price_range'), df_test['price_range']

# Display the training feature frame.
x_train

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,804,1,0.8,1,12,1,41,0.9,89,1,13,709,818,2027,11,5,11,1,0,0
1,1042,0,2.2,0,15,1,11,0.6,139,5,16,68,1018,2826,18,0,2,1,0,0
2,1481,1,2.0,1,0,0,35,0.5,105,3,0,249,522,2635,17,16,4,1,0,1
3,1104,0,1.7,0,1,1,60,0.4,199,2,13,653,1413,1229,6,0,3,1,1,1
4,652,0,0.5,1,1,0,58,0.6,142,3,2,464,781,565,18,12,9,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,536,1,1.4,0,0,1,53,0.7,135,3,0,547,705,1211,15,10,7,1,0,1
1396,1097,0,0.8,0,10,1,21,0.1,160,7,15,1277,1352,2219,15,6,12,1,0,1
1397,1179,1,0.5,0,7,1,32,0.3,182,2,12,85,1451,340,16,5,16,1,0,0
1398,719,1,0.5,1,0,1,23,0.4,113,6,9,431,1727,3990,14,9,12,1,1,1


### Define numeric and categorical columns

Because Naive Bayes treats numeric and categorical features differently, the columns holding boolean (0/1) values are listed as categorical columns, while all the other feature columns are listed as numeric columns.

In [4]:
# Features handed to the Gaussian Naive Bayes models.
numeric_columns = [
    "battery_power", "clock_speed", "fc", "int_memory", "m_dep",
    "mobile_wt", "n_cores", "pc", "px_height", "px_width",
    "ram", "sc_h", "sc_w", "talk_time",
]

# Boolean-valued features handed to the categorical Naive Bayes models.
categorical_columns = [
    "blue", "dual_sim", "four_g",
    "three_g", "touch_screen", "wifi",
]

### Naive Bayes Prediction from Scratch

In [5]:
from lib.naive_bayes import NaiveBayesCategorical, NaiveBayesGaussian

# --- Gaussian model over the numeric features ---
model_gnb_scratch = NaiveBayesGaussian()
model_gnb_scratch.fit(df_train[numeric_columns], y_train)
scratch_numeric_posteriors = model_gnb_scratch.predict_proba(
    df_test[numeric_columns].to_numpy()
)

# --- Categorical model over the boolean features ---
model_cnb_scratch = NaiveBayesCategorical()
model_cnb_scratch.fit(df_train[categorical_columns], y_train)
scratch_categorical_posteriors = model_cnb_scratch.predict_proba(
    df_test[categorical_columns].to_numpy()
)

# Fuse the two models by multiplying their per-class probabilities element-wise.
# NOTE(review): if both predict_proba outputs are posteriors, the class prior
# enters this product twice -- confirm that this is intended.
combined_probabilities_scratch = np.multiply(
    np.array(scratch_numeric_posteriors),
    np.array(scratch_categorical_posteriors),
)

# The prediction is the column holding the largest combined probability.
# NOTE(review): this is a column index, presumably equal to the price_range
# label because labels look like 0..3 -- verify the column order used by
# lib.naive_bayes.
scratch_predictions = combined_probabilities_scratch.argmax(axis=1)
scratch_predictions

array([2, 2, 3, 0, 3, 1, 3, 0, 3, 1, 3, 2, 3, 0, 3, 0, 2, 1, 0, 2, 3, 1,
       0, 1, 1, 1, 2, 2, 0, 2, 2, 3, 3, 0, 2, 3, 2, 3, 1, 1, 0, 3, 0, 2,
       2, 1, 1, 2, 1, 3, 1, 2, 3, 0, 1, 3, 2, 3, 3, 2, 2, 3, 3, 1, 3, 2,
       2, 2, 2, 3, 2, 3, 1, 0, 1, 2, 0, 3, 1, 0, 3, 3, 0, 2, 3, 1, 3, 3,
       0, 2, 1, 1, 1, 2, 1, 1, 3, 2, 1, 3, 3, 3, 1, 2, 3, 2, 3, 2, 3, 3,
       3, 3, 2, 2, 3, 2, 0, 3, 1, 3, 1, 2, 2, 3, 3, 1, 2, 2, 1, 3, 3, 1,
       0, 0, 3, 0, 0, 1, 3, 0, 1, 3, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 0, 2,
       1, 0, 3, 0, 0, 3, 3, 3, 0, 2, 3, 0, 1, 1, 0, 2, 3, 1, 0, 1, 1, 1,
       3, 0, 2, 0, 3, 1, 1, 2, 0, 2, 0, 3, 2, 0, 3, 2, 0, 1, 2, 0, 1, 3,
       3, 1, 2, 2, 2, 3, 2, 3, 3, 3, 0, 2, 0, 2, 3, 1, 2, 3, 2, 0, 0, 2,
       2, 2, 3, 3, 0, 2, 0, 3, 0, 2, 2, 2, 2, 1, 3, 0, 1, 3, 3, 3, 0, 0,
       1, 1, 3, 3, 2, 2, 0, 2, 1, 2, 2, 3, 0, 2, 2, 1, 1, 2, 0, 3, 2, 3,
       3, 0, 1, 3, 3, 0, 0, 2, 1, 1, 1, 0, 3, 2, 2, 2, 0, 1, 0, 1, 1, 1,
       0, 0, 3, 0, 3, 0, 1, 0, 1, 3, 2, 2, 0, 2, 1,

### Naive Bayes Prediction with Scikit Library

In [6]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB

# Gaussian Naive Bayes over the numeric features.
model_gnb_scikit = GaussianNB()
model_gnb_scikit.fit(df_train[numeric_columns], y_train)

# Categorical Naive Bayes over the boolean features.
model_cnb_scikit = CategoricalNB()
model_cnb_scikit.fit(df_train[categorical_columns], y_train)

# predict_proba columns follow each estimator's classes_ order; the element-wise
# product below is only meaningful when both estimators agree on that order.
assert np.array_equal(model_gnb_scikit.classes_, model_cnb_scikit.classes_)

categorical_posteriors_scikit = model_cnb_scikit.predict_proba(df_test[categorical_columns])
numerical_posteriors_scikit   = model_gnb_scikit.predict_proba(df_test[numeric_columns])

# NOTE(review): both factors are posteriors, so the class prior is counted twice
# in this product. Kept as-is so the result stays directly comparable with the
# from-scratch cell, which combines its two models the same way.
combined_probabilities_scikit = categorical_posteriors_scikit * numerical_posteriors_scikit

# argmax yields a column index, not a label: translate it through classes_ so the
# prediction is correct even when the labels are not exactly 0..n_classes-1.
best_class_index = np.argmax(combined_probabilities_scikit, axis=1)
lib_predictions = model_gnb_scikit.classes_[best_class_index]
lib_predictions

array([2, 2, 3, 0, 3, 1, 3, 0, 3, 1, 3, 2, 3, 0, 3, 0, 2, 1, 0, 2, 3, 1,
       0, 1, 1, 1, 2, 2, 0, 2, 2, 3, 3, 0, 2, 3, 2, 3, 1, 1, 0, 3, 0, 2,
       2, 1, 1, 2, 1, 3, 1, 2, 3, 0, 1, 3, 2, 3, 3, 2, 2, 3, 3, 1, 3, 2,
       2, 2, 2, 3, 2, 3, 1, 0, 1, 2, 0, 3, 1, 0, 3, 3, 0, 2, 3, 1, 3, 3,
       0, 2, 1, 1, 1, 2, 1, 1, 3, 2, 1, 3, 3, 3, 1, 2, 3, 2, 3, 2, 3, 3,
       3, 3, 2, 2, 3, 2, 0, 3, 1, 3, 1, 2, 2, 3, 3, 1, 2, 2, 1, 3, 3, 1,
       0, 0, 3, 0, 0, 1, 3, 0, 1, 3, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 0, 2,
       1, 0, 3, 0, 0, 3, 3, 3, 0, 2, 3, 0, 1, 1, 0, 2, 3, 1, 0, 1, 1, 1,
       3, 0, 2, 0, 3, 1, 1, 2, 0, 2, 0, 3, 2, 0, 3, 2, 0, 1, 2, 0, 1, 3,
       3, 1, 2, 2, 2, 3, 2, 3, 3, 3, 0, 2, 0, 2, 3, 1, 2, 3, 2, 0, 0, 2,
       2, 2, 3, 3, 0, 2, 0, 3, 0, 2, 2, 2, 2, 1, 3, 0, 1, 3, 3, 3, 0, 0,
       1, 1, 3, 3, 2, 2, 0, 2, 1, 2, 2, 3, 0, 2, 2, 1, 1, 2, 0, 3, 2, 3,
       3, 0, 1, 3, 3, 0, 0, 2, 1, 1, 1, 0, 3, 2, 2, 2, 0, 1, 0, 1, 1, 1,
       0, 0, 3, 0, 3, 0, 1, 0, 1, 3, 2, 2, 0, 2, 1,

The from-scratch Naive Bayes implementation produces the same predictions as the scikit-learn implementation.

### Performance Evaluation

In [7]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Utility function to evaluate performance
def evaluate_classifier_performance(prediction, y_test):
    """Print accuracy and macro/micro F1, precision and recall for predictions.

    Args:
        prediction: Predicted class labels, one per sample.
        y_test: Ground-truth class labels aligned with ``prediction``.

    Returns:
        None. Each metric is printed on its own line as ``<name>: <value>``
        with four decimal places.
    """
    # zero_division=0 is passed to every averaged score (F1 included) so that a
    # class without predictions or samples scores 0 without emitting a warning.
    metrics = {
        'Accuracy Average': accuracy_score(y_test, prediction),
        'F1 Macro Average': f1_score(y_test, prediction, average='macro', zero_division=0),
        'F1 Micro Average': f1_score(y_test, prediction, average='micro', zero_division=0),
        'Precision Macro Average': precision_score(y_test, prediction, average='macro', zero_division=0),
        'Precision Micro Average': precision_score(y_test, prediction, average='micro', zero_division=0),
        'Recall Macro Average': recall_score(y_test, prediction, average='macro', zero_division=0),
        'Recall Micro Average': recall_score(y_test, prediction, average='micro', zero_division=0)
    }

    for metric, value in metrics.items():
        print(f'{metric}: {value:.4f}')

#### Evaluate performance Naive Bayes from scratch

In [8]:
# Score the from-scratch predictions against the validation labels.
evaluate_classifier_performance(prediction=scratch_predictions, y_test=y_test)

Accuracy Average: 0.7817
F1 Macro Average: 0.7809
F1 Micro Average: 0.7817
Precision Macro Average: 0.7814
Precision Micro Average: 0.7817
Recall Macro Average: 0.7806
Recall Micro Average: 0.7817


#### Evaluate performance Naive Bayes with Scikit

In [9]:
# Score the scikit-learn predictions against the validation labels.
evaluate_classifier_performance(prediction=lib_predictions, y_test=y_test)

Accuracy Average: 0.7817
F1 Macro Average: 0.7809
F1 Micro Average: 0.7817
Precision Macro Average: 0.7814
Precision Micro Average: 0.7817
Recall Macro Average: 0.7806
Recall Micro Average: 0.7817


The from-scratch Naive Bayes implementation achieves the same performance as the scikit-learn implementation on every reported metric.