# Naive Bayes Classifier -- Homework 4

### Basic Information:
IMDb Database used. There are two files from a separate repository: Positive Reviews and Negative Reviews 

##### Split the dataset into 
- 70% for the training set
- 15% for the development set
- 15% for the test set

Build a binary classifier to perform the movie review classification automatically.

In [1]:
# Mary B. Makarious
# Homework 4 -- IMDb Classification

In [15]:
# Import Packages 

import string
import re
from collections import Counter
import numpy as np
import pandas as pd
import random
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Keep everything in Jupyter
%matplotlib inline 

# Ignore several specific Pandas warnings
import warnings
warnings.filterwarnings("ignore")

## Read in the Data
- Split them by line
- Add to a list 

In [8]:
# Read in the data

positiveFile = open("rt-polarity.pos","r")
positiveReviews = positiveFile.read()
positiveReviews = positiveReviews.lower()
positiveReviews = positiveReviews.split("\n")

negativeFile = open("rt-polarity.neg","r")
negativeReviews = negativeFile.read()
negativeReviews = negativeReviews.lower()
negativeReviews = negativeReviews.split("\n")

### Merging
- Merge all the reviews into 2 lists

In [12]:
movie_ratings = []
for i in range(0, len(positiveReviews)):
    movie_ratings.append('pos')
for i in range(0, len(negativeReviews)):
    movie_ratings.append('neg')

movie_reviews = []
movie_reviews = positiveReviews + negativeReviews
reviews = []
for i in range(0, len(movie_reviews)):   
    if movie_ratings[i] == 'pos':
        category = "pos"
        value = movie_reviews[i]
    else: 
        category = "neg"
        value = movie_reviews[i]
        
    reviews.append((category, value))

print ("Length of Reviews: ", len(reviews))

Length of Reviews:  10664


### Shuffling
- Shuffle the index values 
- Split the data into the train, development, and test sets

In [13]:
random.shuffle(reviews)

split = round(len(reviews)*.70)
split2 = round(len(reviews)*.15)

train_set = reviews[0:split]
del reviews[0:split]

development_set = reviews[0:split2]
del reviews[0:split2]

test_set = reviews[0:split2]

print ("Length of Training Set: ", len(train_set))
print ("Length of Development Set: ",len(development_set))
print ("Length of Test Set: ",len(test_set))

Length of Training Set:  7465
Length of Development Set:  1600
Length of Test Set:  1599


## Helper Functions
These serve the following purposes: 
- Removing punctuation
- Removing the whitespace

In [24]:
def removePunctuation(data):
    table = data.translate(str.maketrans("", "", string.punctuation)) 
    return table

def postPunctuationTrimming(data):
    data = removePunctuation(data)
    data = data.lower()
    return re.split("\W+", data)

# Testing out the Helper Functions
# Optional 

trimFunction = postPunctuationTrimming(positiveReviews[0])
counter = Counter(trimFunction)


print(counter)

Counter({'to': 2, 'the': 2, '': 1, 'jeanclaud': 1, 'hes': 1, 'damme': 1, 'that': 1, 'than': 1, 'van': 1, 'centurys': 1, 'and': 1, 'arnold': 1, 'is': 1, 'splash': 1, 'make': 1, 'schwarzenegger': 1, 'conan': 1, 'segal': 1, 'a': 1, 'going': 1, 'rock': 1, '21st': 1, 'or': 1, 'new': 1, 'steven': 1, 'greater': 1, 'even': 1, 'be': 1, 'destined': 1})


## Training the Model
- Count the words
- Assosciating the words with their prior probabilities and labels 

In [17]:
wordcollection = {}
word_count = {"pos": {}, "neg": {}}
priorProb = {"pos" : 0., "neg" : 0.}
documents = []

for i in range(0, len(train_set)):   
    category = train_set[i][0]
    indexValue = train_set[i][1]
        
    documents.append((category, indexValue))
    
    priorProb[category] += 1
    
    words = postPunctuationTrimming(indexValue)
    
    counts = Counter(words)
    
    for word, count in counts.items():
        if word not in wordcollection:
            wordcollection[word] = 0.0 
        if word not in word_count[category]:
            word_count[category][word] = 0.0
        wordcollection[word] += count
        word_count[category][word] += count

## Label Predictions 
##### This predict function takes in the reviews one at a time and implements the Naive Bayes algorithm to predict the labels based on the words

In [20]:
def Predict(data):
    movie_scores = []
    for i in range(0, len(data)):
        words = postPunctuationTrimming(data[i][1])
        counts = Counter(words)
    
# Probability of the word being positive or negative 
        priorProb_pos = (priorProb["pos"] / sum(priorProb.values()))
        priorProb_neg = (priorProb["neg"] / sum(priorProb.values()))

        logProb_positive = 0.0
        logProb_negative = 0.0

        for words, count in counts.items():
            if not words in wordcollection or len(words) <=2:
                continue
            probWord = wordcollection[words] / sum(wordcollection.values())
            probWord_givePositive = word_count["pos"].get(words, 0.0) / sum(word_count["pos"].values())
            probWord_giveNegative = word_count["neg"].get(words, 0.0) / sum(word_count["neg"].values())
    
            if probWord_givePositive > 0:
                logProb_positive += math.log(count * probWord_givePositive / probWord)
            if probWord_giveNegative > 0:
                logProb_negative += math.log(count * probWord_giveNegative / probWord)
        
        positiveScore = math.exp(logProb_positive + math.log(priorProb_pos))
        negativeScore = math.exp(logProb_negative + math.log(priorProb_neg))                  
    
        if positiveScore > negativeScore:
            movie_scores.append("pos")
        else:
            movie_scores.append("neg")
            
# Return the movie scores
    return movie_scores

## Accuracy and f1 Scores -- for Development Set
Initial labels are taken in, then a 0 or 1 is given in order to be used to find f1 scores

In [26]:
initialLabel = []
binaryLabel = []
binaryPredictions = []
for l in range(0, len(development_set)):
    initialLabel.append(development_set[l][0])
predictions = Predict(development_set)

for l in range(0, len(development_set)):
    if initialLabel[l] == 'pos':
        binaryLabel.append(1)
    else:
        binaryLabel.append(0)

for l in range(0, len(development_set)):
    if predictions[l] == 'pos':
        binaryPredictions.append(1)
    else:
        binaryPredictions.append(0)        

print("Accuracy for Development Set: " , accuracy_score(initialLabel, predictions))
print("f1 score for Development Set: " , f1_score(binaryLabel, binaryPredictions, average = 'binary'))

Accuracy for Development Set:  0.764375
f1 score for Development Set:  0.762743864065


## Accuracy and f1 Scores -- for Test Set
Initial labels are taken in, then a 0 or 1 is given in order to be used to find f1 scores

In [29]:
initialLabel = []
binaryLabel = []
binaryPredictions = []
for l in range(0, len(test_set)):
    initialLabel.append(test_set[l][0])

predictions = Predict(test_set)

for l in range(0, len(test_set)):
    if initialLabel[l] == 'pos':
        binaryLabel.append(1)
    else:
        binaryLabel.append(0)

for l in range(0, len(test_set)):
    if predictions[l] == 'pos':
        binaryPredictions.append(1)
    else:
        binaryPredictions.append(0)        

print("Accuracy for Test Set: " , accuracy_score(initialLabel, predictions))
print("f1 score for Test Set: " , f1_score(binaryLabel, binaryPredictions, average = 'binary'))

Accuracy for Test Set:  0.759849906191
f1 score for Test Set:  0.754161331626
