In [1]:
import os
import nltk
import pandas
import collections
import itertools
import numpy as np
import math

In [2]:
K = 0.1    # pseudo-count (alpha) for the smoothing; passed to getLaplaseSmooth
# Directories of training files, one directory per class.
POS_PATH = './trainData/posTrain/'
NEG_PATH = './trainData/negTrain/'
# Directories of test files, one directory per class.
POS_TEST_PATH = './TestData/posTest/'
NEG_TEST_PATH = './TestData/negTest/'

In [3]:
# Token pattern: either a word of two or more letters, optionally with
# apostrophes between letters (e.g. don't), or a single one of ; ! ? $.
# The pattern has no branch for digits, other punctuation, or a lone letter.
tokenizer = nltk.tokenize.RegexpTokenizer(r'[a-zA-Z]+[\']*[a-zA-Z]+|[;!?$]')

In [4]:
# Get all the training files.. this will take a sec
# os.path.join yields a valid path whether or not the directory constant
# ends with a path separator (plain concatenation requires that it does).
posFileList = [os.path.join(POS_PATH, f) for f in os.listdir(POS_PATH)]
negFileList = [os.path.join(NEG_PATH, f) for f in os.listdir(NEG_PATH)]

In [5]:
# Get all the test files, one list per class.
# os.path.join yields a valid path whether or not the directory constant
# ends with a path separator (plain concatenation requires that it does).
posTestList = [os.path.join(POS_TEST_PATH, f) for f in os.listdir(POS_TEST_PATH)]
negTestList = [os.path.join(NEG_TEST_PATH, f) for f in os.listdir(NEG_TEST_PATH)]

In [6]:
"""
getTokens
    Creates list of tokens from multiple files; the tokenizer keeps words and the symbols ; ! ? $ and does not lower case
Input
    fileList = list of files in the directory with all tokens of one class
Returns
    tokens = list of words in all files of given fileList
"""
def getTokens(fileList):
    """Tokenize every file in fileList and return all tokens in one flat list.

    Each file is read in full as ISO-8859-1 text and split with the
    module-level `tokenizer`. Tokens are appended in the order the files
    are given and are returned exactly as the tokenizer produced them.
    """
    collected = []
    for path in fileList:
        with open(path, 'r', encoding='ISO-8859-1') as handle:
            text = handle.read()
        # The whole text is already in memory, so tokenize after closing.
        collected.extend(tokenizer.tokenize(text))
    return collected


In [7]:
"""
getFreq
    Counts the number of times a word is in a list
Input
    tokens = list of words
Output
    d = dictionary with key=word and value = count of word in given list
"""
def getFreq(tokens):
    """Tally how often each distinct token occurs in tokens.

    Returns a plain dict mapping token -> number of occurrences.
    """
    counts = {}
    for token in tokens:
        # dict.get supplies 0 for a token that has not been seen yet.
        counts[token] = counts.get(token, 0) + 1
    return counts


In [8]:
"""
calcLaplaseSmooth
    Calculates the probability of the word given the class with Laplace smoothing
    Probability is (xi + a)/(N + a*d)
        xi = the number of appearances of the word w in positive (or neg) documents
        a = the pseudo-count
        N = the total number of word appearances in positive (or neg) documents
        d = the number of unique words across both negative and positive documents
Inputs
    word = word we are looking at
    dic = dictionary of word counts for specific class (pos or neg)
    a = alpha value (the pseudo-count)
    d = number of unique words across both negative and positive documents
    N = total number of word appearances in positive (or neg) documents
Outputs
    Probability of word given class, with Laplace smoothing.
"""
def calcLaplaseSmooth(word,dic,a,d,N):
    """Return the smoothed estimate (count + a) / (N + a*d) for word.

    word: token being scored
    dic:  per-class token counts; a token missing from dic counts as 0
    a:    pseudo-count added to the token's count
    d:    vocabulary size used in the denominator
    N:    total token count for the class
    """
    count = dic.get(word, 0)
    numerator = count + a
    denominator = N + a*d
    return numerator/denominator

In [9]:
"""
getLaplaseSmooth
    Calculates the probability of the word given the class with Laplace smoothing
        for all words in reviews.
Inputs
    pfl = positive file list
    nfl = negative file list
    a = alpha value
Outputs
    cpp, cpn = dictionaries of every word's conditional probability - positive and negative
"""
def getLaplaseSmooth(pfl,nfl,a):
    """Build smoothed P(word | class) tables for the positive and negative classes.

    pfl, nfl: lists of file paths for the positive / negative training files
    a:        pseudo-count handed to calcLaplaseSmooth
    Returns (cpp, cpn): two dicts keyed by the same vocabulary (every token
    seen in either class), mapping token -> smoothed conditional probability.
    """
    posTokens = getTokens(pfl)
    negTokens = getTokens(nfl)

    posCounts = getFreq(posTokens)
    negCounts = getFreq(negTokens)

    # Vocabulary is the union of both classes; its size is the d term.
    vocabulary = set(posCounts) | set(negCounts)
    vocabSize = len(vocabulary)

    # Total token count per class is the N term.
    posTotal = len(posTokens)
    negTotal = len(negTokens)

    cpp = {w: calcLaplaseSmooth(w, posCounts, a, vocabSize, posTotal)
           for w in vocabulary}
    cpn = {w: calcLaplaseSmooth(w, negCounts, a, vocabSize, negTotal)
           for w in vocabulary}
    return cpp, cpn

In [10]:
"""
test
    Predicts if a review is positive or negative based on the calculated probabilities.
        To avoid the risk of the product of many small probabilities underflowing to zero, use log space:
            we add the logs of the word probabilities P(wi | c) instead of multiplying them
Inputs
    fileList = list of files to test with
    actualClass = correct class of the review
    cpp = conditional probability positive dictionary
    cpn = conditional probability negative dictionary
Output
    r = dictionary of file names and if classified correctly or not (1 or 0)
"""
def test(fileList, actualClass, cpp, cpn):
    """Score each file under both classes and record whether it was classified correctly.

    fileList:    paths of the files to classify
    actualClass: true class of every file in fileList, 'p' or 'n'
    cpp, cpn:    token -> probability tables for the positive / negative class

    Returns a dict mapping each path to 1 when the class with the strictly
    larger log-probability sum equals actualClass, and 0 otherwise. Equal
    sums, or an actualClass other than 'p' / 'n', therefore give 0.
    """
    outcome = {}

    for path in fileList:
        logPos = 0
        logNeg = 0

        # Sum log-probabilities token by token; a token absent from a table
        # contributes nothing to that class's score.
        for token in getTokens([path]):
            if token in cpp:
                logPos += math.log(cpp[token])
            if token in cpn:
                logNeg += math.log(cpn[token])

        correctPositive = actualClass == 'p' and logPos > logNeg
        correctNegative = actualClass == 'n' and logNeg > logPos
        outcome[path] = 1 if (correctPositive or correctNegative) else 0

    return outcome
    

In [11]:
# Estimate the smoothed P(word | class) tables from the training files,
# using K as the pseudo-count. No class prior is computed here.
conditionalProbPos, conditionalProbNeg = getLaplaseSmooth(posFileList, negFileList,K)

In [12]:
# Test: classify the test files of each class. Each result dict maps
# filename -> 1 if the file was classified as its true class, else 0.
resultsP = test(posTestList, 'p', conditionalProbPos, conditionalProbNeg)
resultsN = test(negTestList, 'n', conditionalProbPos, conditionalProbNeg)

In [13]:
# Confusion-matrix counts. Each results dict holds 1 for a correctly
# classified file and 0 otherwise, so summing the values counts the hits.
TP = sum(resultsP.values())      # positive files classified as positive
FN = len(resultsP) - TP          # positive files not classified as positive
TN = sum(resultsN.values())      # negative files classified as negative
FP = len(resultsN) - TN          # negative files not classified as negative
acc = (TP+TN)/(TP+FN+FP+TN)
# '\\' prints one backslash; a bare backslash before a space is an invalid
# escape sequence and triggers a warning on recent Python versions.
print('Pred \\ Gold \t P \t N')
print('P \t \t '+str(TP)+' \t '+str(FP))
print('N \t \t '+str(FN)+' \t '+str(TN))

Pred \ Gold 	 P 	 N
P 	 	 73 	 21
N 	 	 16 	 69


In [14]:
# Accuracy = correctly classified files / all test files.
# float() makes the division explicitly floating-point.
acc = (TP+TN)/float(TP+FN+FP+TN)
print('accuracy is: ')
print(acc)

accuracy is: 
0.7932960893854749
