In [1]:
import os
import nltk
import pandas
import collections
import itertools
import numpy as np

In [2]:
# Notebook-wide configuration.
# NOTE(review): presumably the Laplace / add-k smoothing pseudo-count -- TODO confirm.
K = 0.1
# Class directories for training and test documents. The trailing slash matters:
# file names are appended to these strings by plain concatenation.
POS_PATH = './trainData/posTrain/'
NEG_PATH = './trainData/negTrain/'
POS_TEST_PATH = './TestData/posTest/'
NEG_TEST_PATH = './TestData/negTest/'

In [3]:
# Token pattern, two alternatives:
#   1. a run of letters, any number of apostrophes, then another run of letters
#      (so a word needs at least two letters; one-letter words such as "a" or "I" never match);
#   2. a single one of the characters ; ! ? $
# Digits and all other punctuation are not matched by the pattern.
tokenizer = nltk.tokenize.RegexpTokenizer(r'[a-zA-Z]+[\']*[a-zA-Z]+|[;!?$]')

In [4]:
# Get all the training files.. this will take a sec
# Each entry is the class directory string followed by a name returned by os.listdir,
# so every directory entry is included, in whatever order os.listdir yields them.
posFileList = [POS_PATH+f for f in os.listdir(POS_PATH)]
negFileList = [NEG_PATH+f for f in os.listdir(NEG_PATH)]

In [5]:
# Paths of the held-out test documents, built the same way as the training lists:
# class directory string + each name returned by os.listdir.
posTestList = [POS_TEST_PATH+f for f in os.listdir(POS_TEST_PATH)]
negTestList = [NEG_TEST_PATH+f for f in os.listdir(NEG_TEST_PATH)]

In [6]:
def getTokens(fileList):
    """Tokenize every file in ``fileList`` and pool the results.

    fileList -- iterable of file paths, all belonging to one class.
    Returns one flat list holding the tokens of all files, in file order.

    Files are decoded as ISO-8859-1 and split with the module-level
    ``tokenizer``.
    """
    pooled = []
    for path in fileList:
        with open(path, 'r', encoding='ISO-8859-1') as handle:
            pooled.extend(tokenizer.tokenize(handle.read()))
    return pooled


In [7]:
# Pool the tokens of every training document, one flat list per class.
posTokens = getTokens(posFileList)
negTokens = getTokens(negFileList)

In [8]:
def getFreq(tokens):
    """Count how often each word occurs in ``tokens``.

    tokens -- iterable of hashable items (words).
    Returns a plain dict mapping word -> number of occurrences, with keys in
    first-seen order. Empty input gives an empty dict.
    """
    # Counter does the tallying; convert back to dict so that looking up an
    # unseen word still raises KeyError, as it did with the hand-rolled loop.
    return dict(collections.Counter(tokens))


In [12]:
def calcLaplaseSmooth(word,dic,a,d,N):
    """Return the smoothed probability (x + a) / (N + a*d) of ``word`` in one class.

    word -- the word being scored
    dic  -- word -> count dictionary for the class (positive or negative)
    a    -- alpha, the pseudo-count added to every word
    d    -- number of distinct words across the positive and negative documents
    N    -- total number of word occurrences in this class's documents

    A word missing from ``dic`` is treated as having count 0.
    """
    count = dic.get(word, 0)
    return (count+a)/(N + a*d)

In [13]:
def getLaplaseSmooth(ptl,ntl,a):
    """Build smoothed conditional word probabilities for both classes.

    ptl -- positive token list
    ntl -- negative token list
    a   -- the pseudo-count

    Every probability is (x + a) / (N + a*d), where
      x is the number of times the word occurs in the class's token list,
      N is the total number of tokens in that class's list,
      d is the number of distinct words across both lists.

    Returns (cpp, cpn): the positive and negative dictionaries, each keyed by
    every word of the joint vocabulary.
    """
    posCounts = getFreq(ptl)
    negCounts = getFreq(ntl)

    # Joint vocabulary: a word seen in only one class still gets an entry in both tables.
    vocabulary = set(posCounts).union(negCounts)
    vocabSize = len(vocabulary)
    posTotal, negTotal = len(ptl), len(ntl)

    cpp = {w: calcLaplaseSmooth(w, posCounts, a, vocabSize, posTotal) for w in vocabulary}
    cpn = {w: calcLaplaseSmooth(w, negCounts, a, vocabSize, negTotal) for w in vocabulary}
    return cpp, cpn

In [14]:
# Smoothed per-class word probabilities for the training data.
# The pseudo-count comes from the config constant K (0.1) instead of a repeated literal,
# so the smoothing strength is tuned in one place.
conPos, conNeg = getLaplaseSmooth(posTokens, negTokens, K)

In [18]:
# Display the positive-class probability table. It holds one entry per vocabulary word,
# so the rendered output is very long.
conPos

{'geiger': 2.473157584160316e-07,
 'servicing': 2.7204733425763476e-06,
 'classify': 2.7204733425763476e-06,
 'teen': 0.00011153940704563025,
 'doubtless': 2.7204733425763476e-06,
 'ryne': 5.193630926736663e-06,
 'preconception': 2.7204733425763476e-06,
 'tiara': 2.473157584160316e-07,
 'johnstone': 2.473157584160316e-07,
 'unmistakably': 5.193630926736663e-06,
 'feather': 2.473157584160316e-07,
 'bettis': 2.473157584160316e-07,
 "everett's": 2.473157584160316e-07,
 'intention': 1.7559418847538242e-05,
 'shuler': 5.193630926736663e-06,
 'forgo': 2.473157584160316e-07,
 'sterotypically': 2.7204733425763476e-06,
 'fences': 5.193630926736663e-06,
 'loitering': 2.473157584160316e-07,
 'juggler': 2.7204733425763476e-06,
 'clandestine': 2.473157584160316e-07,
 'piano': 2.497889160001919e-05,
 'wollter': 2.473157584160316e-07,
 'flagging': 2.7204733425763476e-06,
 'belonging': 1.261310367921761e-05,
 'viv': 2.7204733425763476e-06,
 'inhabited': 5.193630926736663e-06,
 'nemesis': 7.66678851089

In [33]:
# Total number of tokens collected from the positive training files.
len(posTokens)

400926

In [34]:
# Sanity check: the per-word counts must add up to the total number of positive tokens.
# `d` was not defined by any earlier cell, so this failed on a fresh kernel. The recorded
# result (equal to len(posTokens)) indicates it was the positive-class frequency table,
# so it is rebuilt here -- TODO confirm.
d = getFreq(posTokens)
s = sum(d.values())
s

400926

In [35]:
# Total number of tokens collected from the negative training files.
len(negTokens)

353925

In [37]:
# Number of distinct words in the negative class.
# `dneg` was not defined by any earlier cell, so this failed on a fresh kernel; it is
# presumably the negative-class frequency table and is rebuilt here -- TODO confirm.
dneg = getFreq(negTokens)
len(dneg)

23882

In [38]:
# Number of keys in `d` (presumably the positive-class frequency table -- confirm).
len(d)

25554

In [42]:
# Combined vocabulary: every key of `d` together with every key of `dneg`.
s = set(d.keys()).union(dneg)


In [43]:
# Size of the combined key set `s`.
len(s)

34154

In [49]:
# Cross-check of the combined vocabulary size, this time using the set | operator.
s2 = set(d.keys()) | set(dneg.keys())
len(s2)

34154

In [8]:
# Empty per-class tables; two separate dict objects, one for each class.
conditionalProbPos, conditionalProbNeg = {}, {}
# Calculate smoothing and prior

In [20]:
# Confusion-matrix counts derived from the per-document outcomes.
# NOTE(review): resultsP / resultsN are not defined in any visible cell. From their use
# here, each presumably maps a test document of that class to 1/True when it was
# classified correctly -- confirm against the cell that builds them.
TP = sum(resultsP.values())      # positive documents predicted positive
FN = len(resultsP) - TP          # positive documents predicted negative
TN = sum(resultsN.values())      # negative documents predicted negative
FP = len(resultsN) - TN          # negative documents predicted positive
# float() forces true division even where / on two ints would truncate (Python 2).
acc = (TP+TN)/float(TP+FN+FP+TN)
# The backslash is escaped: a bare '\ ' is an invalid escape sequence and triggers a
# warning on current Python versions. The printed text is unchanged.
print('Pred \\ Gold \t P \t N')
print('P \t \t '+str(TP)+' \t '+str(FP))
print('N \t \t '+str(FN)+' \t '+str(TN))

Pred \ Gold 	 P 	 N
P 	 	 73 	 21
N 	 	 16 	 69


In [19]:
# Accuracy = correctly classified documents / all classified documents.
# Uses TP, FN, FP and TN from the confusion-matrix cell; float() forces true division
# even where / on two ints would truncate (Python 2).
acc = (TP+TN)/float(TP+FN+FP+TN)
print('accuracy is: ')
print(acc)

accuracy is: 
0.793296089385
