# K-fold Cross-Validation
Idea: Split the data into k sections, or 'folds'. The model is trained and evaluated k times: each fold is used exactly once as the validation set while the remaining k-1 folds form the training set. The reported accuracy is the average of the k per-fold accuracies.

In [5]:
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import os

# Seed for the shuffled K-fold split, so the folds are reproducible.
RANDOM_STATE = 42

#Split
result = pd.read_csv("balanced_dataset.csv")
X = result['review']
y = result['label']

SPLITS = 3
kf = KFold(n_splits=SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Folds are numbered 1..SPLITS; each one gets its own SPLIT_<n> directory
# holding the training rows (train.csv) and the held-out rows (test.csv).
for counter, (train_ind, test_ind) in enumerate(kf.split(X), start=1):
    split_dir = "SPLIT_" + str(counter)
    # exist_ok=True makes re-running the cell safe without a separate
    # "does it exist?" check.
    os.makedirs(split_dir, exist_ok=True)

    train_df = result.iloc[train_ind, :]
    test_df = result.iloc[test_ind, :]
    #Print to the training set
    train_df.to_csv(split_dir + '/train.csv', encoding='utf-8', index=False)
    #Print the validation to the test set
    test_df.to_csv(split_dir + '/test.csv', encoding='utf-8', index=False)

Before we start training the classifier, we need to declare the list of stop words we are using:

In [6]:
#This is a list with words that we may encounter in the reviews that just add noise to our classifier
#E.g.: pronouns, articles, quantifiers, etc.
#Each word is listed exactly once; entries are compared against lower-cased review words.
stop_words = ["you", "your", "got", "she", "they", "him", "her", "them", "what", "where", "who", "have", "does",
              "whom", "mine", "yours", "his", "hers", "ours", "theirs", "this", "that", "these", "those", "did",
              "there", "about", "which", "whose", "whoever", "whatever", "whichever", "whomever", "wherever",
              "myself", "yourself", "because", "for", "himself", "herself", "itself", "ourselves", "themselves",
              "anything", "everybody", "another", "each", "few", "many", "none", "some", "all", "any", "it", "our",
              "anybody", "anyone", "everyone", "everything", "nobody", "other", "others", "somebody", "someone",
              "something", "one", "the", "before", "after", "through", "just", "could", "but", "however", "how",
              "can", "with", "went", "are", "were", "was", "and", "from", "would"]

# Train the Classifier & Perform the Prediction

Training: Build a dictionary of all the words that appear in the reviews in train.csv (ignoring stop words and words shorter than three characters), then count how many times each word appears in positive, negative, and neutral reviews. The counts are saved to a file, dictionary.csv.

Predicting: We classify every record in test.csv as either positive (1), negative (-1), or neutral (0) using a supervised naive-Bayes algorithm. If two or more classes tie for the highest probability, the prediction is written as "N/A".

In [None]:
import math

# Class names, in the order used for the columns of dictionary.csv.
CLASS_NAMES = ("positive", "negative", "neutral")
# Value of the label column in train.csv/test.csv for each class.
LABEL_TO_CLASS = {"1": "positive", "-1": "negative", "0": "neutral"}
CLASS_TO_LABEL = {name: label for label, name in LABEL_TO_CLASS.items()}


def _feature_words(text, ignored_words):
    """Return the lower-cased words of `text` that are used as features.

    The text is split on whitespace. Words shorter than three characters and
    words contained in `ignored_words` are dropped.
    """
    lowered = (token.lower() for token in text.split())
    return [word for word in lowered if len(word) > 2 and word not in ignored_words]


def _build_lexicon(train_path, ignored_words):
    """Count reviews and word occurrences per class in a training CSV.

    The first row is treated as the header. The review text is read from the
    second-to-last column and the label from the last column. Rows with fewer
    than two columns, or with a label other than "1", "0" or "-1", are skipped.

    Returns a tuple (lexicon, review_counts):
      lexicon       -- dict word -> {class name: occurrences}, in order of
                       first appearance
      review_counts -- dict class name -> number of reviews with that label
    """
    lexicon = {}
    review_counts = dict.fromkeys(CLASS_NAMES, 0)
    with open(train_path, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # header row
        for row in reader:
            if len(row) < 2:
                continue
            print("Creating dictionary. Currently working on line with bookID", row[0])
            review_type = LABEL_TO_CLASS.get(row[-1])
            if review_type is None:
                # Unknown label: it cannot count towards any class prior, so
                # its words must not count towards any class either.
                continue
            review_counts[review_type] += 1
            for word in _feature_words(row[-2], ignored_words):
                counts = lexicon.setdefault(word, dict.fromkeys(CLASS_NAMES, 0))
                counts[review_type] += 1
    return lexicon, review_counts


def _predict_label(text, ignored_words, lexicon, log_priors, log_denominators):
    """Return the naive-Bayes label ("1", "0", "-1") for `text`, or "N/A" on a tie.

    Scores are sums of log-probabilities rather than products of
    probabilities, so long reviews do not underflow to 0.0. Each word
    contributes log((count + 1) / (class word total + vocabulary size)),
    i.e. add-one smoothing; a word missing from the lexicon has count 0.
    """
    scores = dict(log_priors)
    if lexicon:  # with an empty lexicon there is no word evidence to add
        for word in _feature_words(text, ignored_words):
            counts = lexicon.get(word)
            for name in CLASS_NAMES:
                seen = counts[name] if counts is not None else 0
                scores[name] += math.log(seen + 1) - log_denominators[name]
    best = max(scores.values())
    winners = [name for name in CLASS_NAMES if scores[name] == best]
    #When there are two or three classes with the same probability
    if len(winners) != 1:
        return "N/A"
    return CLASS_TO_LABEL[winners[0]]


# Set membership is O(1); stop_words itself is left untouched.
ignored_words = frozenset(stop_words)

for counter in range(1, SPLITS+1):
    split_dir = 'SPLIT_' + str(counter)
    print("Processing train and test files for SPLIT =", counter)

    #Create a dictionary with all the words that appear in the reviews
    lexicon, review_counts = _build_lexicon(split_dir + '/train.csv', ignored_words)

    #Place our new lexicon in a csv file
    df = pd.DataFrame(
        [dict(counts, word=word) for word, counts in lexicon.items()],
        columns=["word", *CLASS_NAMES],
    )
    df.to_csv(split_dir + '/dictionary.csv', index=False)

    counter_positive_reviews = review_counts["positive"]
    counter_negative_reviews = review_counts["negative"]
    counter_neutral_reviews = review_counts["neutral"]
    counter_total_reviews = counter_positive_reviews + counter_negative_reviews + counter_neutral_reviews
    print("Total number of positive reviews:", counter_positive_reviews)
    print("Total number of negative reviews:", counter_negative_reviews)
    print("Total number of neutral reviews:", counter_neutral_reviews)
    print("Total number of reviews:", counter_total_reviews)

    # log P(class). A class without training reviews gets -inf, so it can
    # never win; this also covers an empty training set (all -inf -> "N/A").
    log_priors = {
        name: math.log(review_counts[name] / counter_total_reviews)
        if review_counts[name] else float("-inf")
        for name in CLASS_NAMES
    }
    # log(total word occurrences in the class + vocabulary size), computed
    # once per split instead of once per word.
    log_denominators = {}
    if lexicon:
        for name in CLASS_NAMES:
            class_total = sum(counts[name] for counts in lexicon.values())
            log_denominators[name] = math.log(class_total + len(lexicon))

    with open(split_dir + '/test.csv', encoding='utf-8', newline='') as f:
        with open(split_dir + '/prediction.csv', "w", encoding='utf-8', newline='') as w:
            reader = csv.reader(f)
            # csv.writer quotes fields containing commas, quotes or newlines,
            # so prediction.csv can be parsed back with csv.reader.
            writer = csv.writer(w, lineterminator="\n")
            #The first line gets a new column, "NB_predictedLabel"
            header = next(reader, None)
            if header is not None:
                writer.writerow(header + ["NB_predictedLabel"])
            for row in reader:
                if len(row) < 2:
                    continue
                print("Predicting the review type of row with bookID", row[0])
                #Copy the whole row and put the prediction into the last column
                prediction = _predict_label(row[-2], ignored_words, lexicon,
                                            log_priors, log_denominators)
                writer.writerow(row + [prediction])
    print("Done.")

# Calculate the Error Rate

In [None]:
import csv

SPLITS = 3


def _count_predictions(prediction_path):
    """Return (correct, incorrect) prediction counts for one prediction.csv.

    The true label is read from the second-to-last column and the predicted
    label from the last column. The header row (first cell "bookID") and rows
    with fewer than two columns are ignored. Any prediction that differs from
    the label, including "N/A", counts as incorrect.
    """
    correct = 0
    incorrect = 0
    with open(prediction_path, encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if len(row) < 2 or row[0] == "bookID":
                continue
            if row[-2] == row[-1]:
                correct += 1
            else:
                incorrect += 1
    return correct, incorrect


split_accuracies = []
for counter in range(1, SPLITS+1):
    correct_predictions, incorrect_predictions = _count_predictions(
        'SPLIT_' + str(counter) + '/prediction.csv')
    total_predictions = correct_predictions + incorrect_predictions
    if total_predictions == 0:
        # Nothing to score; leave this split out instead of dividing by zero.
        print("No predictions found for SPLIT =", counter)
        continue
    accuracy = correct_predictions / total_predictions
    split_accuracies.append(accuracy)
    print("Accuracy for SPLIT =", counter, "is", accuracy)

# The cross-validation accuracy is the mean over the splits, not the value of
# whichever split happened to be processed last.
if split_accuracies:
    print("Classifier Accuracy:", sum(split_accuracies) / len(split_accuracies))
else:
    print("Classifier Accuracy: N/A (no predictions found)")