In [2]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

# Task:
- Sub-sample the dataset so that you have equal entries for positive, neutral and negative. 

---
Then either
- Split the dataset into training and testing: This is a traditional split
- Perform K-Fold cross validation: This split happens multiple times.
  There is no dedicated 'train' and 'test' file

In [3]:
RANDOM_STATE = 42

## Import CSV
- This is the same dataset.csv on the google drive
- except the file was saved under encoding UTF-8

In [4]:
df = pd.read_csv("dataset_enc.csv")

- As we can see, it has 102,647 rows and 10 columns

In [5]:
df.shape

(102647, 10)

## Data stats
- 74k positive
- 22k neutral
- 5.5k negative

In [23]:
df.groupby('label').count()

Unnamed: 0_level_0,bookID,title,author,rating,ratingsCount,reviewsCount,reviewerName,reviewerRatings,review
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1,5565,5535,5560,5565,5565,5565,5521,5565,5565
0,22704,22636,22651,22704,22704,22704,22432,22704,22704
1,74378,74045,74117,74378,74378,74378,73733,74378,74378


## Subsample the data 
- Idea: each label needs to be represented equally
- Create seperate dataframes filled with pos, neg, and neutral values

In [25]:
df_pos = df[df['label']==1]
df_neutral = df[df['label']==0]
df_neg = df[df['label']==-1]

- Take 5,500 randomly selected values of each data frame

In [26]:
df_neg_sampled = df_neg.sample(5500, random_state=RANDOM_STATE)
df_pos_sampled = df_pos.sample(5500, random_state=RANDOM_STATE)
df_neutral_sampled = df_neutral.sample(5500, random_state=RANDOM_STATE)

In [27]:
df_neg_sampled.shape

(5500, 10)

- Combine them into one dataframe where each label is represented equally

In [28]:
frames = [df_neg_sampled, df_pos_sampled, df_neutral_sampled]
result = pd.concat(frames)
result.shape

(16500, 10)

In [29]:
result.groupby('label').count()

Unnamed: 0_level_0,bookID,title,author,rating,ratingsCount,reviewsCount,reviewerName,reviewerRatings,review
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1,5500,5470,5495,5500,5500,5500,5457,5500,5500
0,5500,5487,5482,5500,5500,5500,5445,5500,5500
1,5500,5475,5479,5500,5500,5500,5458,5500,5500


- Write to file

In [42]:
result.to_csv('balanced_dataset.csv', encoding='utf-8')

# Split

In [6]:
result = pd.read_csv("balanced_dataset.csv")

In [7]:
X = result['review']
y = result['label']

## Option 1. The Traditional Train-Test Split
- This is the first option, not ideal

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE)

## Option 2. K-fold cross validation

---

Idea: Split the data into k sections or 'folds'. The model runs k times. Each fold is used once as validation while the others form the training set. The accuracy is the average of all the tests.

---

In [8]:
SPLITS = 3

In [9]:
kf = KFold(n_splits=SPLITS, shuffle=True, random_state=RANDOM_STATE)

In [10]:
# Vectorizer is for the 'fast' Naive Bayes
vectorizer = CountVectorizer(stop_words='english')

## We run the model in the loop
- Note: this model is in progress

In [16]:
running_avg = 0
for train_ind, test_ind in kf.split(X):
    # Split X and y on the index given by the KFold
    X_train, X_test = X[train_ind], X[test_ind]
    y_train, y_test = y[train_ind], y[test_ind]
    # HERE IS WHERE WE RUN THE MODEL
    # below is my starter code on running the 'fast' Naive Bayes
    # Generate the text counts using the vectorizer
    train_features = vectorizer.fit_transform([r for r in X_train])
    test_features = vectorizer.transform([r for r in X_test])
    # Fit a naive bayes model to the training data.
    nb = MultinomialNB()
    nb.fit(train_features, [int(r) for r in y_train])
    # Now we can use the model to predict classifications for our test features.
    predictions = nb.predict(test_features)
    # Compute the error.
    actual = [int(r) for r in y_test]
    fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
    running_avg = running_avg + metrics.auc(fpr, tpr)
    print("Multinomial naive bayes AUC: {0}".format(metrics.auc(fpr, tpr)))
avg_accuracy = running_avg/SPLITS
print("Average accuracy: ", avg_accuracy)

Multinomial naive bayes AUC: 0.7547413187308184
Multinomial naive bayes AUC: 0.7540694619018724
Multinomial naive bayes AUC: 0.7509482437724274
Average accuracy:  0.753253008135


## MNB example from blog

Just for reference, don't run

In [None]:
vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform([r[0] for r in reviews])
test_features = vectorizer.transform([r[0] for r in test])

# Fit a naive bayes model to the training data.
nb = MultinomialNB()
nb.fit(train_features, [int(r[1]) for r in reviews])

# Now we can use the model to predict classifications for our test features.
predictions = nb.predict(test_features)

# Compute the error.  
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
print("Multinomial naive bayes AUC: {0}".format(metrics.auc(fpr, tpr)))