In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv('C:/Users/marya/OneDrive/Desktop/Classification_project1/preprocessed_green_tripdata_2015-07.csv')

# Discretize the trip price into three categories: low, medium, high
price_quantiles = data['total_amount'].quantile([0.33, 0.67]).values
data['price_category'] = pd.cut(data['total_amount'], bins=[-np.inf, price_quantiles[0], price_quantiles[1], np.inf], labels=[0, 1, 2])

# Select only numeric features
numeric_features = data.select_dtypes(include=[np.number]).drop(['total_amount'], axis=1)

# Handle missing values by filling them with the mean of the column
numeric_features = numeric_features.fillna(numeric_features.mean())

# Standardize the features
scaler = StandardScaler()
features = scaler.fit_transform(numeric_features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, data['price_category'], test_size=0.2, random_state=42)


In [7]:
class GaussianNaiveBayes:
    def __init__(self, epsilon=1e-9):
        self.epsilon = epsilon  # Small value to avoid division by zero

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = np.zeros((len(self.classes), X.shape[1]), dtype=np.float64)
        self.var = np.zeros((len(self.classes), X.shape[1]), dtype=np.float64)
        self.priors = np.zeros(len(self.classes), dtype=np.float64)
        
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.mean[idx, :] = X_c.mean(axis=0)
            self.var[idx, :] = X_c.var(axis=0) + self.epsilon  # Adding epsilon to variance
            self.priors[idx] = X_c.shape[0] / float(X.shape[0])
    
    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)
    
    def _predict(self, x):
        posteriors = []
        
        for idx, c in enumerate(self.classes):
            prior = np.log(self.priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)
        
        return self.classes[np.argmax(posteriors)]
    
    def _pdf(self, class_idx, x):
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(- (x - mean) ** 2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator


In [8]:
nb_model = GaussianNaiveBayes()

nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

class_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
results = []

for class_label in class_labels.keys():
    y_test_binary = (y_test == class_label).astype(int)
    y_pred_binary = (y_pred == class_label).astype(int)
    
    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    precision = precision_score(y_test_binary, y_pred_binary, zero_division=0)
    recall = recall_score(y_test_binary, y_pred_binary, zero_division=0)
    f1 = f1_score(y_test_binary, y_pred_binary, zero_division=0)
    
    results.append((class_labels[class_label], accuracy, precision, recall, f1))

results_df = pd.DataFrame(results, columns=['Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)


  posterior = np.sum(np.log(self._pdf(idx, x)))


    Class  Accuracy  Precision    Recall  F1 Score
0     Low  0.919116   0.825093  0.977269  0.894757
1  Medium  0.860457   0.793836  0.757519  0.775252
2    High  0.939069   0.981120  0.831623  0.900207
