# Naive Bayes

Probabilistic Classifier based on applying Bayes' theorem with strong, naive independence assumptions between the features.

Naive Bayes Classifier formula can be written based on Bayes theorem as:
$$P(y|X) = \frac{P(X|y) \cdot P(y)}{P(X)}$$
$y$ - class labels we want to predict,
$X$ - feature vector.

We assume that features are mutually independent (reason for it's naivety):
$$P(y|X) = \frac{P(x_1|y) \cdot P(x_2|y) \cdot ... \cdot P(x_n|y) \cdot P(y)}{P(X)}$$

We select:
$$y = argmax_y P(y|X) = argmax_y \frac{P(x_1|y) \cdot P(x_2|y) \cdot ... \cdot P(x_n|y) \cdot P(y)}{P(X)}$$
$$y = argmax_y P(x_1|y) \cdot P(x_2|y) \cdot ... \cdot P(x_n|y) \cdot P(y)$$
Because result can we very small, we do a little trick below:
$$y = argmax_y log(P(x_1|y)) + log(P(x_2|y)) + ... + log(P(x_n|y)) + log(P(y))$$
$P(y)$ - prior probability, how often certain class label occurs

$P(x_i|y)$ - class conditional probability, calucated with Gaussian distribution:
$$ P(x_i|y) = \frac{1}{\sqrt{2\pi \sigma^2_y}} \cdot e^{-\frac{1}{2}(\frac{x_i - \mu_y}{\sigma_y})^2}$$
$\mu_y$ - mean, $\sqrt{2\pi \sigma^2_y}$ - standard deviation.

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from evaluation import *
from feature_scaler import *

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Pre-processing

In [28]:
df = pd.read_csv('./data/breast-cancer.csv')

encoder = LabelEncoder()
df['diagnosis'] = encoder.fit_transform(df['diagnosis'])
df = df.drop(columns=["id"], axis=1)

In [29]:
# Data split and scaling
X = df.drop(columns=["diagnosis"])
y = df['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_ss, X_test_ss = standard_scaler(X_train, X_test) # Standard Scaler
X_train_mms, X_test_mms = min_max_scaler(X_train, X_test) # Min-Max Scaler

### Removing features when the correlation between them exceeds a certain threshold

In [30]:
X_corr = X.corr(numeric_only=True).round(3)
correlation_threshold = 0.9
upper_matrix = X_corr.where(np.triu(np.ones(X_corr.shape), k=1).astype(bool))
features_to_drop = [x for x in upper_matrix.columns if any(upper_matrix[x] > correlation_threshold)]
X_corr = X.drop(X[features_to_drop], axis=1)

X_corr_train, X_corr_test = train_test_split(X_corr, test_size=0.2, random_state=42)
X_corr_train_ss, X_corr_test_ss = standard_scaler(X_corr_train, X_corr_test) # Standard Scaler
X_corr_train_mms, X_corr_test_mms = min_max_scaler(X_corr_train, X_corr_test) # Min-Max Scaler

Steps:
- Training:
    - calculate mean,
    - calculate variance,
    - frequency of each class.
- Predictions:
    - calculate y (using Gaussian formula),
    - choosing class with highest probability.

## Implementation from Scratch

In [31]:
class NaiveBayesClassifierScratch:
    def __init__(self):
        self.classes = None
        self.priors = [] # P(x|c)
        self.mean = []
        self.var = []

    def fit(self, X, y):
        self.classes = np.unique(y)

        for c in self.classes:
            sample_of_class = X[y == c]
            self.mean.append(sample_of_class.mean(axis=0))
            self.var.append(sample_of_class.var(axis=0))
            self.priors.append(sample_of_class.shape[0] / float(X.shape[0]))

    def pdf(self, x, c): # p(x|c)
        n = np.exp(-0.5 * ((x - self.mean[c])/self.var[c])**2)
        d = np.sqrt(2 * np.pi * self.var[c])
        return n/d

    def predict_sample(self, sample):
        pred_priors = []
        # Probabilities for each class
        for i in range(len(self.classes)):
            p_c = np.log(self.priors[i])
            pred_priors.append(np.sum(np.log(self.pdf(sample, i))) + p_c)

        # Class with highest probability
        return self.classes[np.argmax(pred_priors)]

    def predict(self, X):
        return [self.predict_sample(x) for x in X]
    
    def accuracy(self, y_true, y_pred):
        return np.mean(y_true == y_pred)

## Built-In

### Without Feature Scaling

In [33]:
score = [] 
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = GaussianNB()
model.fit(X_corr_train, y_train)
y_pred = model.predict(X_corr_test)
score.append(np.array(evaluate(y_test, y_pred, False)))

### Min-Max Scaling

In [34]:
model = GaussianNB()
model.fit(X_train_mms, y_train)
y_pred = model.predict(X_test_mms)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = GaussianNB()
model.fit(X_corr_train_mms, y_train)
y_pred = model.predict(X_corr_test_mms)
score.append(np.array(evaluate(y_test, y_pred, False)))

### Standard Scaling

In [35]:
model = GaussianNB()
model.fit(X_train_ss, y_train)
y_pred = model.predict(X_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = GaussianNB()
model.fit(X_corr_train_ss, y_train)
y_pred = model.predict(X_corr_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

### PCA

In [36]:
tmp_pca = [0,0,0,0,0,0]
tmp_pca2 = [0,0,0,0,0,0]

import warnings
for n in range(2, 20):
    pca = PCA(n_components=n)
    pca.fit(X_train)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_train_mms)
    X_test_pca = pca.transform(X_test_mms)

    model = GaussianNB()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = np.array(evaluate(y_test, y_pred, False))
    if tmp_pca[0] < tmp[0]:
        tmp_pca = tmp

    pca = PCA(n_components=n)
    pca.fit(X_corr_train_ss)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_corr_train_mms)
    X_test_pca = pca.transform(X_corr_test_mms)

    model = GaussianNB()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = np.array(evaluate(y_test, y_pred, False))
    if tmp_pca2[0] < tmp[0]:
        tmp_pca2 = tmp

score.append(tmp_pca)
score.append(tmp_pca2)


In [37]:
tmp_pca = [0,0,0,0,0,0]
tmp_pca2 = [0,0,0,0,0,0]

import warnings
for n in range(2, 20):
    pca = PCA(n_components=n)
    pca.fit(X_train)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_train_ss)
    X_test_pca = pca.transform(X_test_ss)

    model = GaussianNB()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = np.array(evaluate(y_test, y_pred, False))
    if tmp_pca[0] < tmp[0]:
        tmp_pca = tmp

    pca = PCA(n_components=n)
    pca.fit(X_corr_train_ss)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_corr_train_ss)
    X_test_pca = pca.transform(X_corr_test_ss)

    model = GaussianNB()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = np.array(evaluate(y_test, y_pred, False))
    if tmp_pca2[0] < tmp[0]:
        tmp_pca2 = tmp

score.append(tmp_pca)
score.append(tmp_pca2)

In [39]:
pd.DataFrame(score,
    index=['Without Scaler', 'Without Scaler Corr','Min-Max Scaler', 'Min-Max Scaler Corr', 'Standard Scaler', 'Standard Scaler Corr', 'PCA Min-Max Scaler', 'PCA Min-Max Scaler Corr', 'PCA Standard Scaler', 'PCA Standard Scaler Corr'], 
    columns=['Accuracy', 'F1', 'Recall', 'Precision', 'AUC', 'Confusion Matrix'])

Unnamed: 0,Accuracy,F1,Recall,Precision,AUC,Confusion Matrix
Without Scaler,0.973684,0.973481,0.973684,0.974751,0.965116,"[[71, 0], [3, 40]]"
Without Scaler Corr,0.921053,0.920443,0.921053,0.921213,0.909106,"[[68, 3], [6, 37]]"
Min-Max Scaler,0.964912,0.964738,0.964912,0.965205,0.958074,"[[70, 1], [3, 40]]"
Min-Max Scaler Corr,0.921053,0.920443,0.921053,0.921213,0.909106,"[[68, 3], [6, 37]]"
Standard Scaler,0.964912,0.964738,0.964912,0.965205,0.958074,"[[70, 1], [3, 40]]"
Standard Scaler Corr,0.921053,0.920443,0.921053,0.921213,0.909106,"[[68, 3], [6, 37]]"
PCA Min-Max Scaler,0.964912,0.964537,0.964912,0.966784,0.953488,"[[71, 0], [4, 39]]"
PCA Min-Max Scaler Corr,0.903509,0.9015,0.903509,0.907182,0.881264,"[[69, 2], [9, 34]]"
PCA Standard Scaler,0.964912,0.964537,0.964912,0.966784,0.953488,"[[71, 0], [4, 39]]"
PCA Standard Scaler Corr,0.885965,0.883591,0.885965,0.88872,0.862594,"[[68, 3], [10, 33]]"
