In [16]:
import os
import nltk
import pandas
import collections
import itertools
import numpy as np
import math

In [4]:
# presumably the add-k (Laplace) smoothing pseudo-count — TODO confirm
K = 0.1
# Directories with the positive / negative training reviews.
# The trailing slash matters: file names are appended by plain string concatenation.
POS_PATH = './trainData/posTrain/'
NEG_PATH = './trainData/negTrain/'
# Directories with the positive / negative test reviews.
POS_TEST_PATH = './TestData/posTest/'
NEG_TEST_PATH = './TestData/negTest/'

In [5]:
# Shared tokenizer. A token is either
#   * a run of letters, optionally with apostrophes inside it (e.g. "can't"), or
#   * one of the single characters ; ! ? $
# The word alternative needs at least two letters, so one-letter words are
# dropped, as are digits and every other punctuation mark.
tokenizer = nltk.tokenize.RegexpTokenizer(r'[a-zA-Z]+[\']*[a-zA-Z]+|[;!?$]')

In [6]:
# Build the path of every training file, one list per class
# (listing the directories can take a moment).
posFileList, negFileList = (
    [folder + entry for entry in os.listdir(folder)]
    for folder in (POS_PATH, NEG_PATH)
)

In [7]:
# Same as the training lists, but for the held-out test directories.
posTestList, negTestList = (
    [folder + entry for entry in os.listdir(folder)]
    for folder in (POS_TEST_PATH, NEG_TEST_PATH)
)

In [8]:
def getTokens(fileList):
    """Tokenize every file in ``fileList`` into one flat list.

    Args:
        fileList: iterable of file paths; each file is read as ISO-8859-1 text.

    Returns:
        list of tokens produced by the module-level ``tokenizer``, in the
        order the files are given and the tokens appear in each file.
    """
    collected = []
    for path in fileList:
        with open(path, 'r', encoding='ISO-8859-1') as handle:
            contents = handle.read()
        # The file is already closed here; tokenizing only needs the text.
        collected.extend(tokenizer.tokenize(contents))

    return collected


In [9]:
# Tokenize both training corpora (positive first, then negative).
posTokens, negTokens = getTokens(posFileList), getTokens(negFileList)

In [30]:
# Inspect the tokens extracted from the positive training files.
posTokens

['the',
 'farrelly',
 'brothers',
 'third',
 'film',
 'after',
 'dumb',
 'and',
 'dumber',
 'and',
 'kingpin',
 'brings',
 'together',
 'the',
 'real',
 'life',
 'couple',
 'of',
 'cameron',
 'diaz',
 'and',
 'matt',
 'dillon',
 'some',
 'nasty',
 'humour',
 'cute',
 'dog',
 'and',
 'mix',
 'of',
 'love',
 'fate',
 'and',
 'romance',
 'plot',
 'high',
 'school',
 'nerd',
 'ted',
 'gets',
 'lucky',
 'when',
 'the',
 'cutest',
 'girl',
 'in',
 'his',
 'class',
 'asks',
 'him',
 'to',
 'the',
 'prom',
 'unfortunately',
 'for',
 'ted',
 'he',
 'accidentally',
 'gets',
 'part',
 'of',
 'his',
 'male',
 'anatomy',
 'ouch',
 '!',
 'caught',
 'in',
 'his',
 'zipper',
 'and',
 'misses',
 'the',
 'big',
 'night',
 'thirteen',
 'years',
 'later',
 'ted',
 'still',
 "can't",
 'get',
 'mary',
 'out',
 'of',
 'his',
 'mind',
 'and',
 'hires',
 'private',
 'detective',
 'dillon',
 'to',
 'find',
 'her',
 'once',
 'found',
 'it',
 'turns',
 'out',
 'that',
 'mary',
 'has',
 'more',
 'than',
 'one',
 '

In [10]:
def getFreq(tokens):
    """Count how often each token occurs.

    Args:
        tokens: iterable of hashable tokens (words).

    Returns:
        dict mapping each distinct token to its number of occurrences,
        keyed in order of first appearance. Empty input gives ``{}``.
    """
    # Counter does the tallying; convert back so callers still get a plain
    # dict (a missing key raises KeyError rather than returning 0).
    return dict(collections.Counter(tokens))


In [11]:
def calcLaplaseSmooth(word,dic,a,d,N):
    """Return the additively smoothed probability of ``word`` for one class.

    Computes ``(x + a) / (N + a*d)`` where ``x`` is the count of ``word`` in
    ``dic`` (0 when the word is absent).

    Args:
        word: the word being scored.
        dic: word -> count dictionary for one class.
        a: smoothing pseudo-count (alpha).
        d: vocabulary size used in the denominator.
        N: total number of word occurrences for the class.
    """
    count = dic.get(word, 0)
    numerator = count + a
    denominator = N + a*d
    return numerator/denominator

In [34]:
def getLaplaseSmooth(pfl,nfl,a):
    """Estimate smoothed per-class word probabilities from training files.

    Each probability is ``(x_i + a) / (N + a*d)`` where
      * ``x_i`` is the count of the word in that class's files,
      * ``a`` is the pseudo-count,
      * ``N`` is the total number of tokens in that class's files,
      * ``d`` is the number of distinct words across both classes.

    Args:
        pfl: list of positive-class file paths.
        nfl: list of negative-class file paths.
        a: smoothing pseudo-count (alpha).

    Returns:
        ``(cpp, cpn)`` — two dicts with the same keys (every word seen in
        either class), holding the positive and negative probabilities.
    """
    posTokenList = getTokens(pfl)
    negTokenList = getTokens(nfl)

    posCounts = getFreq(posTokenList)
    negCounts = getFreq(negTokenList)

    # Vocabulary = union of the words seen in either class.
    vocabulary = set(posCounts.keys())
    vocabulary.update(set(negCounts.keys()))
    vocabSize = len(vocabulary)

    posTotal = len(posTokenList)
    negTotal = len(negTokenList)

    posProb = {
        w: calcLaplaseSmooth(w, posCounts, a, vocabSize, posTotal)
        for w in vocabulary
    }
    negProb = {
        w: calcLaplaseSmooth(w, negCounts, a, vocabSize, negTotal)
        for w in vocabulary
    }

    return posProb, negProb

In [35]:
# Train on the full training set, taking the pseudo-count from the config
# constant K instead of repeating its value as a literal.
conPos, conNeg = getLaplaseSmooth(posFileList, negFileList, K)

In [14]:
# Inspect the smoothed positive-class probabilities (word -> probability).
conPos

{'gregson': 1.0139946095057293e-05,
 'boris': 7.66678851089698e-06,
 'unequipped': 2.473157584160316e-07,
 'moreover': 7.66678851089698e-06,
 'trophies': 5.193630926736663e-06,
 "letterman's": 2.473157584160316e-07,
 'advertising': 1.5086261263377925e-05,
 'requiring': 5.193630926736663e-06,
 'yea': 2.473157584160316e-07,
 'tantoo': 5.193630926736663e-06,
 'resides': 1.261310367921761e-05,
 'grifters': 5.193630926736663e-06,
 'comraderie': 2.473157584160316e-07,
 'ravishing': 5.193630926736663e-06,
 'wrapped': 3.4871521936660455e-05,
 'servicable': 2.473157584160316e-07,
 'primarily': 4.723730985746204e-05,
 'princess': 1.5086261263377925e-05,
 'squeaking': 2.473157584160316e-07,
 'datalink': 2.7204733425763476e-06,
 'fireball': 2.7204733425763476e-06,
 'drama': 0.00022035834074868413,
 'novikov': 2.497889160001919e-05,
 'synthesizer': 2.7204733425763476e-06,
 'peer': 2.473157584160316e-07,
 'comeuppance': 2.473157584160316e-07,
 'cancellation': 2.7204733425763476e-06,
 'restoration': 

In [36]:
# Inspect the smoothed positive-class probabilities (word -> probability).
conPos

{'gregson': 1.0139946095057293e-05,
 'boris': 7.66678851089698e-06,
 'unequipped': 2.473157584160316e-07,
 'moreover': 7.66678851089698e-06,
 'trophies': 5.193630926736663e-06,
 "letterman's": 2.473157584160316e-07,
 'advertising': 1.5086261263377925e-05,
 'requiring': 5.193630926736663e-06,
 'yea': 2.473157584160316e-07,
 'tantoo': 5.193630926736663e-06,
 'resides': 1.261310367921761e-05,
 'grifters': 5.193630926736663e-06,
 'comraderie': 2.473157584160316e-07,
 'ravishing': 5.193630926736663e-06,
 'wrapped': 3.4871521936660455e-05,
 'servicable': 2.473157584160316e-07,
 'primarily': 4.723730985746204e-05,
 'princess': 1.5086261263377925e-05,
 'squeaking': 2.473157584160316e-07,
 'datalink': 2.7204733425763476e-06,
 'fireball': 2.7204733425763476e-06,
 'drama': 0.00022035834074868413,
 'novikov': 2.497889160001919e-05,
 'synthesizer': 2.7204733425763476e-06,
 'peer': 2.473157584160316e-07,
 'comeuppance': 2.473157584160316e-07,
 'cancellation': 2.7204733425763476e-06,
 'restoration': 

In [19]:
def test(fileList, actualClass, cpp, cpn):
    """Classify each file and record whether the prediction was correct.

    For every file the log-probabilities of its tokens are summed under the
    positive and the negative model, and the larger sum wins. No class prior
    is added to either sum.

    Args:
        fileList: list of file paths to classify.
        actualClass: true class of every file in ``fileList``; ``'p'`` for
            positive or ``'n'`` for negative.
        cpp: word -> probability dictionary for the positive class.
        cpn: word -> probability dictionary for the negative class.

    Returns:
        dict mapping each file name to 1 if it was classified correctly and
        0 otherwise. A tie between the two sums counts as incorrect.
    """
    r = {}                                     # result dictionary

    for filename in fileList:

        tokens = getTokens([filename])         # tokens of this single file
        pp = 0                                 # log-score under the positive model
        pn = 0                                 # log-score under the negative model

        for word in tokens:
            # Score a word only when BOTH models know it. Adding its (negative)
            # log-probability to just one side would penalise that class for
            # no reason and skew the comparison.
            if word in cpp and word in cpn:
                pp += math.log(cpp[word])
                pn += math.log(cpn[word])

        if pp > pn:
            predicted = 'p'
        elif pn > pp:
            predicted = 'n'
        else:
            predicted = None                   # tie: no prediction, scored as a miss

        r[filename] = int(predicted is not None and predicted == actualClass)

    return r
    

In [20]:
# Score the positive test files: 1 = classified correctly, 0 = missed.
rp = test(fileList=posTestList, actualClass='p', cpp=conPos, cpn=conNeg)

In [21]:
# Per-file results for the positive test set (1 = correct, 0 = incorrect).
rp

{'./TestData/posTest/cv619_tok-19600.txt': 1,
 './TestData/posTest/cv625_tok-0573.txt': 1,
 './TestData/posTest/cv699_tok-10425.txt': 1,
 './TestData/posTest/cv623_tok-22849.txt': 1,
 './TestData/posTest/cv671_tok-10077.txt': 1,
 './TestData/posTest/cv620_tok-13475.txt': 1,
 './TestData/posTest/cv643_tok-26988.txt': 0,
 './TestData/posTest/cv629_tok-6997.txt': 0,
 './TestData/posTest/cv680_tok-18142.txt': 0,
 './TestData/posTest/cv698_tok-27735.txt': 1,
 './TestData/posTest/cv647_tok-9974.txt': 1,
 './TestData/posTest/cv657_tok-7984.txt': 1,
 './TestData/posTest/cv616_tok-10844.txt': 1,
 './TestData/posTest/cv654_tok-17990.txt': 1,
 './TestData/posTest/cv645_tok-21150.txt': 0,
 './TestData/posTest/cv617_tok-10917.txt': 1,
 './TestData/posTest/cv665_tok-19873.txt': 1,
 './TestData/posTest/cv685_tok-11187.txt': 0,
 './TestData/posTest/cv687_tok-20347.txt': 0,
 './TestData/posTest/cv669_tok-10965.txt': 1,
 './TestData/posTest/cv634_tok-28807.txt': 1,
 './TestData/posTest/cv644_tok-23616.t

In [22]:
# Number of positive test files classified correctly (true positives).
sum(rp.values())

73

In [24]:
# Number of positive test files classified incorrectly (false negatives).
len(rp) - sum(rp.values())

16

In [25]:
# Score the negative test files: 1 = classified correctly, 0 = missed.
rn = test(fileList=negTestList, actualClass='n', cpp=conPos, cpn=conNeg)

In [26]:
# Number of negative test files classified correctly (true negatives).
sum(rn.values())

69

In [27]:
# Number of negative test files classified incorrectly (false positives).
len(rn) - sum(rn.values())

21

In [8]:
# NOTE(review): neither dict is read or filled by any other visible cell;
# this looks like leftover scaffolding — confirm before removing.
conditionalProbPos = {}
conditionalProbNeg = {}
# Calculate smoothing and prior

In [20]:
# Confusion matrix from the per-file results returned by test():
# rp / rn map each positive / negative test file to 1 (correct) or 0 (wrong).
TP = sum(rp.values())            # positives predicted positive
FN = len(rp) - TP                # positives predicted negative
TN = sum(rn.values())            # negatives predicted negative
FP = len(rn) - TN                # negatives predicted positive
# float() keeps this a true division on any Python version.
acc = (TP+TN)/float(TP+FN+FP+TN)
# Backslash is escaped explicitly; the printed text is unchanged.
print('Pred \\ Gold \t P \t N')
print('P \t \t '+str(TP)+' \t '+str(FP))
print('N \t \t '+str(FN)+' \t '+str(TN))

Pred \ Gold 	 P 	 N
P 	 	 73 	 21
N 	 	 16 	 69


In [19]:
# Accuracy = correctly classified files / all test files.
# float() forces true division even where / on two ints would truncate.
acc = (TP+TN)/float(TP+FN+FP+TN)
print('accuracy is: ')
print(acc)

accuracy is: 
0.793296089385
