In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [2]:
result = pd.read_csv("balanced_dataset.csv")

# Task: K-fold cross validation

---

Idea: Split the data into k sections or 'folds'. The model runs k times. Each fold is used once as validation while the others form the training set. The accuracy is the average of all the tests.

---

In [3]:
RANDOM_STATE = 42

In [4]:
SPLITS = 3

In [5]:
kf = KFold(n_splits=SPLITS, shuffle=True, random_state=RANDOM_STATE)

In [6]:
X = result['review']
y = result['label']

## For the Hard Coded Naive Bayes

In [7]:
running_avg = 0
for train_ind, test_ind in kf.split(X):
    train_df = result.iloc[train_ind, :]
    test_df = result.iloc[test_ind, :]
    train_df.to_csv('train.csv', encoding='utf-8', index=False)
    test_df.to_csv('test.csv', encoding='utf-8', index=False)
    #MODEL GOES HERE

## For the SKLearn Multi Nomial Naive Bayes

### Create the Vectorizer

In [8]:
vectorizer = CountVectorizer(stop_words='english')

### We run the model in the loop
- Note: this model is in progress

In [9]:
running_avg = 0
for train_ind, test_ind in kf.split(X):
    # Split X and y on the index given by the KFold
    X_train, X_test = X[train_ind], X[test_ind]
    y_train, y_test = y[train_ind], y[test_ind]
    # Generate the text counts using the vectorizer
    train_features = vectorizer.fit_transform([r for r in X_train])
    test_features = vectorizer.transform([r for r in X_test])
    # Fit a naive bayes model to the training data.
    nb = MultinomialNB()
    nb.fit(train_features, [int(r) for r in y_train])
    # Now we can use the model to predict classifications for our test features.
    predictions = nb.predict(test_features)
    # Compute the accuracy.
    actual = [int(r) for r in y_test]
    fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
    running_avg = running_avg + metrics.auc(fpr, tpr)
    print("Multinomial naive bayes AUC: {0}".format(metrics.auc(fpr, tpr)))
avg_accuracy = running_avg/SPLITS
print("Average accuracy: ", avg_accuracy)

Multinomial naive bayes AUC: 0.7000706744418634
Multinomial naive bayes AUC: 0.7015808465295192
Multinomial naive bayes AUC: 0.722141168910934
Average accuracy:  0.707930896627
