# SKLearn Multinomial Naive Bayes Classifier

In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

result = pd.read_csv("balanced_dataset.csv")
RANDOM_STATE = 42

### K-fold cross validation
---

Split the data into k sections or 'folds'. The model runs k times. Each fold is used once as testing while the others form the training set. An average accuracy is caluclated from the accurracies from the k runs.

---
We will run the k-fold cross validation 3 times for a total of 12 runs.

K-folds | Number of Runs | Train Data | Test Data
-----|-----|-----|-----
3 | 3 | 66% | 33%
4 | 4 | 75% | 25%
5 | 5 | 80% | 20%

In [2]:
split_list = [3, 4, 5]

### Set X and y
- X is the text reviews
- y is the positive, neutral, or negative labels

In [3]:
X = result['review']
y = result['label']

### Create the Vectorizer

In [4]:
vectorizer = CountVectorizer(stop_words='english')

### Run the model
- It will run for a total of 12 times
- Each time will compute an accuracy
- For each value of k, an average accuracy will be computed
- It will produce a graph of the average accuracies along with the standard deviation for each set of k runs

In [5]:
fig, ax = plt.subplots()
ax.set_xlabel("Values of K")
ax.set_ylabel("AUC")
ax.set_title("Mean Accuracy and Standard Deviation per K")
for s in split_list:
    auc_list = []
    print("K = ", s)
    kf = KFold(n_splits=s, shuffle=True, random_state=RANDOM_STATE)
    for train_ind, test_ind in kf.split(X):
        # Split X and y on the index given by the KFold
        X_train, X_test = X[train_ind], X[test_ind]
        y_train, y_test = y[train_ind], y[test_ind]
        # Generate the text counts using the vectorizer
        train_features = vectorizer.fit_transform([r for r in X_train])
        test_features = vectorizer.transform([r for r in X_test])
        # Fit model to the training data
        nb = MultinomialNB()
        nb.fit(train_features, [int(r) for r in y_train])
        # Predict classifications for test features
        predictions = nb.predict(test_features)
        # Compute the accuracy.
        actual = [int(r) for r in y_test]
        fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
        current_auc = metrics.auc(fpr, tpr)
        # Add current auc to list
        auc_list.append(current_auc)
        print("Area Under Curve: {0}".format(current_auc))
    np_out = np.array(auc_list)
    avg = np.mean(np_out)
    std = np.std(np_out)
    ax.scatter(s, avg)
    ax.errorbar(s, avg, yerr=std)
    print("Average accuracy with ", s, " splits: ", avg, "\n")
plt.show()

K =  3
Area Under Curve: 0.7133880842009334
Area Under Curve: 0.7098642317118737
Area Under Curve: 0.707854104104104
Average accuracy with  3  splits:  0.710368806672 

K =  4
Area Under Curve: 0.7098791785540476
Area Under Curve: 0.726765503530861
Area Under Curve: 0.7095319110006744
Area Under Curve: 0.7068167027716117
Average accuracy with  4  splits:  0.713248323964 

K =  5
Area Under Curve: 0.7177976443876826
Area Under Curve: 0.7246980089543956
Area Under Curve: 0.7144189833072725
Area Under Curve: 0.7146984954837026
Area Under Curve: 0.7183999981358478
Average accuracy with  5  splits:  0.718002626054 

