In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [2]:
# Load the dataset from a CSV file (path is relative to the working directory)
# into a DataFrame; later cells read its 'review' and 'label' columns.
result = pd.read_csv("balanced_dataset.csv")

# K-fold cross validation

---

Idea: Split the data into k sections or 'folds'. The model runs k times. Each fold is used once as the validation set while the remaining k-1 folds form the training set. The final score is the average of the scores from all k runs.

---

## SKLearn Multinomial Naive Bayes

In [3]:
# Fixed seed passed to KFold(random_state=...) so the shuffled fold assignment
# is the same on every run.
RANDOM_STATE = 42

### We will run the k-fold cross validation 3 times

K-folds | Number of Runs | Train Data | Test Data
-----|-----|-----|-----
3 | 3 | 67% | 33%
4 | 4 | 75% | 25%
5 | 5 | 80% | 20%

In [4]:
# Values of K (number of folds) to evaluate: 3, 4 and 5.
split_list = list(range(3, 6))

### Set X and y
- X is the text reviews
- y holds the label for each review (positive, neutral, or negative)

In [5]:
# Features (the 'review' column) and targets (the 'label' column), taken from
# the loaded DataFrame in a single unpacking assignment.
X, y = result['review'], result['label']

### Create the Vectorizer

In [6]:
# Token-count vectorizer configured with scikit-learn's built-in English
# stop-word list. It is re-fitted on each training fold in the loop below.
vectorizer = CountVectorizer(stop_words='english')

### We run the model in the loop

In [7]:
# For every K in split_list, run K-fold cross validation of a Multinomial
# Naive Bayes classifier on the vectorized reviews, print the AUC of each fold,
# and plot the mean AUC per K with a one-standard-deviation error bar.
fig, ax = plt.subplots()
ax.set_xlabel("K")
ax.set_ylabel("AUC")
# The plotted quantity is AUC (see the y-axis label), so the title names AUC
# rather than accuracy.
ax.set_title("AUC Mean and Standard Deviation")
for s in split_list:
    auc_list = []
    print("K = ", s)
    kf = KFold(n_splits=s, shuffle=True, random_state=RANDOM_STATE)
    for train_ind, test_ind in kf.split(X):
        # KFold yields *positional* indices, so select rows with .iloc.
        # Plain X[train_ind] is label-based and only happens to work when the
        # Series carries a default 0..n-1 index.
        X_train, X_test = X.iloc[train_ind], X.iloc[test_ind]
        y_train, y_test = y.iloc[train_ind], y.iloc[test_ind]
        # Fit the vocabulary on the training fold only, then reuse it to
        # encode the held-out fold.
        train_features = vectorizer.fit_transform(X_train)
        test_features = vectorizer.transform(X_test)
        # Fit a fresh model to the training data (labels cast to int)
        nb = MultinomialNB()
        nb.fit(train_features, y_train.astype(int))
        # Predict classifications for test features
        predictions = nb.predict(test_features)
        # Compute the AUC of the ROC curve, treating label 1 as the positive class.
        # NOTE(review): the curve is built from hard class predictions, not
        # scores; passing nb.predict_proba(...) for class 1 would give a
        # finer-grained ROC curve. Left unchanged so reported numbers are
        # comparable with earlier runs -- confirm which is intended.
        actual = y_test.astype(int)
        fpr, tpr, _ = metrics.roc_curve(actual, predictions, pos_label=1)
        current_auc = metrics.auc(fpr, tpr)
        # Add current auc to list
        auc_list.append(current_auc)
        print("Area Under Curve: {0}".format(current_auc))
    # Mean and (population) standard deviation of the per-fold AUCs for this K
    avg = np.mean(auc_list)
    std = np.std(auc_list)
    ax.scatter(s, avg)
    ax.errorbar(s, avg, yerr=std)
    # The message text is kept as-is; the value printed is the mean AUC.
    print("Average accuracy with ", s, " splits: ", avg, "\n")
plt.show()

K =  3
Area Under Curve: 0.7000706744418634
Area Under Curve: 0.7015808465295192
Area Under Curve: 0.722141168910934
Average accuracy with  3  splits:  0.707930896627 

K =  4
Area Under Curve: 0.7007299140394527
Area Under Curve: 0.7053729775569522
Area Under Curve: 0.7141166994809379
Area Under Curve: 0.7235998717514764
Average accuracy with  4  splits:  0.710954865707 

K =  5
Area Under Curve: 0.6984328315748425
Area Under Curve: 0.7213039171443699
Area Under Curve: 0.7048253470667264
Area Under Curve: 0.7221744636202467
Area Under Curve: 0.7215223100720008
Average accuracy with  5  splits:  0.713651773896 

