# Naive Bayes

## Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

## Functions
Runs PCA with every component count from 2 to 100 and picks the count that gives the Gaussian Naive Bayes classifier the lowest average number of misclassified test points.

In [2]:
def optimized_NB(X_train, X_test, y_train, y_test,
                 max_components=100, n_repeats=10, random_state=None):
    """Find the PCA size that gives Gaussian Naive Bayes the fewest test errors.

    For every component count from 2 up to ``max_components``, PCA is fitted
    on ``X_train``, both sets are projected, and a ``GaussianNB`` model is
    trained on the projected training data and scored on the projected test
    data. Each count is evaluated ``n_repeats`` times and the error counts are
    averaged; the count with the lowest average wins (the smallest count wins
    ties).

    NOTE: the component count is chosen by looking at the test labels, so the
    returned error is an optimistic estimate of generalization error.

    Parameters
    ----------
    X_train, X_test : 2-D arrays of shape (n_samples, n_features)
    y_train, y_test : 1-D arrays of labels
    max_components : int, default 100
        Largest component count to try. It is capped at
        ``min(n_train_samples, n_features)`` because PCA cannot extract more
        components than that.
    n_repeats : int, default 10
        Number of PCA + Naive Bayes runs averaged per component count.
    random_state : int or None, default None
        Seed for PCA. ``None`` keeps the previous unseeded behavior; an int
        makes the whole search reproducible.

    Returns
    -------
    misclassification : float
        Lowest average number of misclassified test points.
    error : float
        The same value as a percentage of the test set size.
    components : int
        Component count that produced it.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1 or fewer than 2 components can be
        extracted from ``X_train``.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    train_shape = np.shape(X_train)
    n_test = X_test.shape[0]

    # Never ask PCA for more components than the training data can provide.
    upper = min(max_components, train_shape[0], train_shape[1])
    if upper < 2:
        raise ValueError(
            "Need at least 2 usable PCA components, got {}".format(upper)
        )

    # One shared generator: repeats still differ from one another, but the
    # whole sequence is reproducible when a seed is given.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Start from infinity so the first evaluated configuration always becomes
    # the incumbent, whatever the size of the test set.
    best_misclassification = float("inf")
    best_error = None
    best_components = None

    for n_components in range(2, upper + 1):

        print("Component: {}/{}".format(n_components, upper), end="\r")

        total_errors = 0

        # Depending on data shape and sklearn version PCA may use a randomized
        # solver, so each configuration is averaged over several runs.
        for _ in range(n_repeats):
            pca = PCA(n_components=n_components, random_state=rng)
            train_projected = pca.fit_transform(X_train)
            test_projected = pca.transform(X_test)

            y_pred = GaussianNB().fit(train_projected, y_train).predict(test_projected)

            total_errors += (y_test != y_pred).sum()

        mean_errors = total_errors / n_repeats

        if mean_errors < best_misclassification:
            best_misclassification = mean_errors
            best_error = mean_errors / n_test * 100
            best_components = n_components

    return best_misclassification, best_error, best_components

## Normal Data

### Load Data

In [3]:
# Load mfeat-pix.txt: one sample per line, whitespace-separated numeric values.
# np.loadtxt opens and closes the file itself, so no file handle is left open.
features = np.loadtxt('mfeat-pix.txt', dtype=float)

# Normalize (6 is presumably the maximum pixel value in this file — TODO confirm)
features = features / 6

# Create labels for each line, 0-9 for each 200 lines
labels = np.repeat(np.arange(10), 200)

# The labels are derived purely from row position, so fail early if the file
# does not contain exactly one row per label.
if features.shape[0] != labels.shape[0]:
    raise ValueError(
        "Expected %d rows in mfeat-pix.txt, found %d"
        % (labels.shape[0], features.shape[0])
    )

### Split

In [4]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=40,
)

### PCA & NB

In [5]:
# Search the PCA component counts on the original data and report the best
# averaged Naive Bayes result.
misclassification, error, components = optimized_NB(
    X_train, X_test, y_train, y_test
)
print(
    "Average number of mislabeled points out of a total %d points: %.2f"
    % (X_test.shape[0], misclassification)
)
print("Average error precentage: %.2f%%" % error)
print("Having used {} components.\n".format(components))

Average number of mislabeled points out of a total 400 points: 21.80
Average error precentage: 5.45%
Having used 31 components.



## Augmented Data

### Load Data

In [6]:
# Load the pre-split augmented arrays. These rebind the X_*/y_* names used for
# the original data above.
X_train, X_test, y_train, y_test = (
    np.load("X_train_augmented.npy"),
    np.load("X_test_augmented.npy"),
    np.load("y_train_augmented.npy"),
    np.load("y_test_augmented.npy"),
)

# Flatten every training sample to a 240-value row.
# NOTE(review): X_test is not reshaped here — presumably it is already stored
# as (n_samples, 240); confirm against the script that wrote the .npy files.
X_train = np.reshape(X_train, (X_train.shape[0], 240))

### PCA & NB

In [7]:
# Repeat the PCA component search on the augmented data and report the best
# averaged Naive Bayes result.
misclassification, error, components = optimized_NB(
    X_train, X_test, y_train, y_test
)
print(
    "Average number of mislabeled points out of a total %d points: %.2f"
    % (X_test.shape[0], misclassification)
)
print("Average error precentage: %.2f%%" % error)
print("Having used {} components.\n".format(components))

Average number of mislabeled points out of a total 400 points: 22.60
Average error precentage: 5.65%
Having used 55 components.

