# Naive Bayes

## Libraries

In [277]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

## Normal Data

### Load Data

In [278]:
# Load the pixel features from mfeat-pix.txt: one sample per line, values
# separated by whitespace. np.loadtxt opens and closes the file itself, so no
# file handle is left dangling (a bare open(...).readlines() never closes it).
# ndmin=2 keeps the result 2-D even if the file held a single line.
features = np.loadtxt('mfeat-pix.txt', dtype=float, ndmin=2)

# Normalize. NOTE(review): 6 is presumably the largest pixel value in this
# file, which would scale every feature into [0, 1] - confirm against the
# dataset description.
features = features / 6

# The file has no label column; labels are assigned purely by line position:
# classes 0-9, 200 consecutive lines per class.
labels = np.repeat(np.arange(10), 200)

# Fail here, with a clear message, if the file does not match that layout,
# instead of later inside train_test_split.
if features.shape[0] != labels.shape[0]:
    raise ValueError(
        "mfeat-pix.txt has %d samples but %d positional labels were generated"
        % (features.shape[0], labels.shape[0])
    )

### Split

In [279]:
# Hold out 20% of the samples as the test split; the fixed random_state keeps
# the shuffle (and therefore the reported error) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=40,
)

### PCA

In [280]:
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (here 99%) of the variance.
pca = PCA(n_components=0.99)

# Fit on the training split only, then project both splits with the fitted
# model so no information from the test split leaks into the projection.
# NOTE: this overwrites X_train / X_test, so re-running the cell without
# re-running the split would apply PCA to already-projected data.
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

# Fraction of the total variance explained by each retained component.
explained_variance = pca.explained_variance_ratio_

### Model

In [281]:
# Fit Gaussian Naive Bayes on the training split and predict the test split.
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# Count the misclassified test samples once and reuse the count for both
# report lines.
n_test = X_test.shape[0]
n_mislabeled = int((y_test != y_pred).sum())

print("Number of mislabeled points out of a total %d points: %d" % (n_test, n_mislabeled))
print("Error percentage: %.2f%%" % (n_mislabeled / n_test * 100))

Number of mislabeled points out of a total 400 points: 21
Error precentage: 5.25%


## Augmented Data

### Load Data

In [282]:
# Load the pre-computed augmented train/test splits from .npy files.
X_train = np.load("X_train_augmented.npy")
X_test = np.load("X_test_augmented.npy")
y_train = np.load("y_train_augmented.npy")
y_test = np.load("y_test_augmented.npy")

# Flatten every sample to a 240-long feature vector. Both splits get the same
# treatment so they always agree on the feature layout handed to PCA; for an
# array that is already (n_samples, 240) the reshape is a no-op.
X_train = X_train.reshape((X_train.shape[0], 240))
X_test = X_test.reshape((X_test.shape[0], 240))

### PCA

In [283]:
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (here 95%) of the variance.
pca = PCA(n_components=0.95)

# Fit on the training split only, then project both splits with the fitted
# model so no information from the test split leaks into the projection.
# NOTE: this overwrites X_train / X_test, so re-running the cell without
# reloading the data would apply PCA to already-projected data.
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

# Fraction of the total variance explained by each retained component.
explained_variance = pca.explained_variance_ratio_

### Model

In [284]:
# Fit Gaussian Naive Bayes on the training split and predict the test split.
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# Count the misclassified test samples once and reuse the count for both
# report lines. The division already yields a float, so no float() wrapper
# is needed around the percentage.
n_test = X_test.shape[0]
n_mislabeled = int((y_test != y_pred).sum())

print("Number of mislabeled points out of a total %d points: %d" % (n_test, n_mislabeled))
print("Error percentage: %.2f%%" % (n_mislabeled / n_test * 100))

Number of mislabeled points out of a total 400 points: 24
Error precentage: 6.00%
