# Naive Bayes

## Libraries

In [406]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

## Functions

In [407]:
def find_components(X_train, X_test, y_train, y_test):
    """Pick the PCA dimensionality that gives the fewest GaussianNB errors.

    For every candidate number of components from 2 to 50 (inclusive), the
    training data is projected with PCA, a Gaussian Naive Bayes classifier is
    fitted on the projection, and the number of mislabeled test points is
    counted. Each candidate is fitted 10 times and the error counts are
    averaged (presumably to smooth out any randomness in the PCA solver).

    NOTE(review): the candidate is selected using the test split itself, so
    an error rate later reported on the same split is optimistically biased.

    Args:
        X_train: Training feature matrix, one sample per row.
        X_test: Test feature matrix, one sample per row.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Returns:
        The number of components (an int in 2..50) with the lowest average
        misclassification count; ties keep the smallest candidate.
    """
    # Start from +inf so the first candidate is always accepted. A fixed
    # sentinel such as 1000 leaves `components` unassigned whenever every
    # candidate produces at least that many errors.
    best_errors = float("inf")
    components = None

    for n_components in range(2, 51):
        total_errors = 0

        for _ in range(10):
            print("Component: {}/50".format(n_components), end="\r")

            # Project into fresh locals so the caller's arrays (and the
            # function's own inputs) are reused untouched on every iteration.
            pca = PCA(n_components=n_components)
            reduced_train = pca.fit_transform(X_train)
            reduced_test = pca.transform(X_test)

            gnb = GaussianNB()
            y_pred = gnb.fit(reduced_train, y_train).predict(reduced_test)

            total_errors += (y_test != y_pred).sum()

        mean_errors = total_errors / 10

        # Strict comparison: on a tie the earlier (smaller) candidate wins.
        if mean_errors < best_errors:
            best_errors = mean_errors
            components = n_components

    return components

## Normal Data

### Load Data

In [408]:
# Load lines from mfeat-pix.txt: one sample per line, whitespace-separated
# values. The context manager closes the file handle as soon as parsing ends.
with open('mfeat-pix.txt') as f:
    features = np.array([line.split() for line in f], dtype=float)

# Normalize by dividing every value by 6
# NOTE(review): 6 is presumably the maximum pixel value in the file - confirm.
features = features / 6

# Create labels for each line, 0-9 for each 200 lines
# (the file is assumed to hold the classes in order, 200 consecutive lines each)
labels = np.repeat(np.arange(10), 200)

### Split

In [409]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=40,
)

### PCA

In [410]:
# Search for the number of principal components with the lowest error count.
n = find_components(X_train, X_test, y_train, y_test)
print("\nUsing {} components.\n".format(n))

# Fit the projection on the training split only, then apply the same
# projection to the test split (the right-hand side runs left to right,
# so the fit happens before the test transform).
pca = PCA(n_components=n)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_

Component: 50/50
Using 40 components.



### Model

In [411]:
# Train Gaussian Naive Bayes on the projected training data and predict
# labels for the projected test data.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Count the disagreements once and reuse the count in both report lines.
total_points = X_test.shape[0]
mislabeled = (y_test != y_pred).sum()

print("Number of mislabeled points out of a total %d points: %d" % (total_points, mislabeled))
print("Error precentage: %.2f%%" % (mislabeled / total_points * 100))

Number of mislabeled points out of a total 400 points: 22
Error precentage: 5.50%


## Augmented Data

### Load Data

In [412]:
# Load the pre-computed augmented train/test splits from disk.
X_train, X_test, y_train, y_test = (
    np.load(path)
    for path in (
        "X_train_augmented.npy",
        "X_test_augmented.npy",
        "y_train_augmented.npy",
        "y_test_augmented.npy",
    )
)

# Flatten every training sample into a 240-value row vector.
# NOTE(review): X_test is not reshaped here - presumably it is already stored
# as (n_samples, 240); confirm against the script that wrote the .npy files.
X_train = X_train.reshape(len(X_train), 240)

### PCA

In [413]:
# Search for the number of principal components with the lowest error count
# on the augmented data.
n = find_components(X_train, X_test, y_train, y_test)
print("\nUsing {} components.\n".format(n))

# Fit the projection on the training split only, then apply the same
# projection to the test split (the right-hand side runs left to right,
# so the fit happens before the test transform).
pca = PCA(n_components=n)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_

Component: 50/50
Using 25 components.



### Model

In [414]:
# Train Gaussian Naive Bayes on the projected augmented training data and
# predict labels for the projected test data.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Count the disagreements once and reuse the count in both report lines.
total_points = X_test.shape[0]
mislabeled = (y_test != y_pred).sum()

print("Number of mislabeled points out of a total %d points: %d" % (total_points, mislabeled))
print("Error precentage: %.2f%%" % (float(mislabeled / total_points * 100)))

Number of mislabeled points out of a total 400 points: 23
Error precentage: 5.75%
