In [43]:
#a. Divide the dataset as train, development and test.
import os, string, re
import numpy as np
from math import log
from random import randrange, seed, shuffle

paths = ['aclImdb\\train\\neg\\',  'aclImdb\\train\\pos\\', 'aclImdb\\test\\neg\\', 'aclImdb\\test\\pos\\']


In [44]:
def getData(path, files):
    tempData = list()
    for file in files:
        with open(path + file, 'r', encoding = 'utf-8') as currFile: 
            for line in currFile.readlines():
                tempData.append(line.replace('<br />', '').translate(str.maketrans('', '', string.punctuation)).lower())
    return tempData

In [45]:
actualData = [[], [], [], []]

for path in paths:
    actualData[paths.index(path)] = getData(path, os.listdir(path))

trainNegData, trainPosData, testNegData, testNegData = actualData[0], actualData[1], actualData[2], actualData[3]


In [46]:
trainNegDWL, trainPosDWL = [], []
testNegDWL, testPosDWL = [], []

labeledData = [trainNegDWL, trainPosDWL, testNegDWL, testPosDWL]

In [47]:
def applyLabel(data, outputList):
    for d in data:
        for x in d:
            if data.index(d) % 2 == 0:
                outputList[data.index(d)].append([x, 0])
            elif data.index(d) % 2 != 0:
                outputList[data.index(d)].append([x, 1])

In [48]:
applyLabel(actualData, labeledData)
print(len(trainNegDWL))

12500


In [49]:
trainData = trainNegDWL + trainPosDWL
testData = testNegDWL + testPosDWL

In [50]:
def devideData(data, k):
    train, dev = data, []
    #random.seed(1)
    shuffle(data)
    train = data[:int(len(data)*k)]
    dev = data[int(len(data)*k):]
    
    '''for z in range(int(len(data)*(1-k))):
        dev.append(train.pop(random.randrange(len(train))))'''
    
    return train, dev
    

In [51]:
trainData, devData = devideData(trainData, 0.75)

print('Train dataset size: ', len(trainData))
print('Dev dataset size: ', len(devData))
print('Test dataset size: ', len(testData))

Train dataset size:  18750
Dev dataset size:  6250
Test dataset size:  25000


In [52]:
#b.Build a vocabulary as list. 
def buildVocab(data, numOfOccurr):
    vocab = {} #words have 2 counts in order neg, pos
    
    for d in data:
        line = set()
        line = line.union(set([z for z in re.split(r'[\s|,|;|.|/|\[|\]|;|\!|?|\'|\\|\)|\(|\"|@|&|#|-|*|%|>|<|^|-]\s*',str(d[0]).replace('<br />', '').strip()) if z]))
        for word in line:
            if word not in vocab:
                vocab[word] = [0, 0]
            if d[1] == 0:
                vocab[word][0] += 1
            elif d[1] == 1:
                vocab[word][1] += 1
        
    for key in list(vocab.keys()):
            if vocab[key][0] + vocab[key][1] < numOfOccurr:
                del vocab[key]
                
    return vocab

In [53]:
trainVocaDict = buildVocab(trainData, 5)
trainVocaDict

{'best': [1276, 2297],
 'in': [8221, 8268],
 'lowbudget': [111, 68],
 'had': [3037, 2535],
 'some': [3756, 3390],
 'and': [9045, 9060],
 'just': [4431, 3390],
 'babes': [18, 12],
 'so': [4619, 3844],
 'how': [2475, 2189],
 'jc': [4, 6],
 'went': [552, 438],
 'someone': [869, 535],
 'strictly': [61, 39],
 'characters': [1958, 1929],
 'black': [509, 508],
 'checked': [36, 19],
 'anyhow': [20, 18],
 'never': [1864, 1847],
 'mostly': [313, 310],
 'later': [535, 753],
 'capture': [94, 121],
 'from': [4308, 4397],
 'time': [3113, 3056],
 'would': [3324, 2641],
 'sort': [526, 397],
 'years': [1085, 1622],
 'local': [283, 284],
 'there': [3839, 3110],
 'cable': [92, 78],
 'what': [3686, 3331],
 'people': [2344, 2183],
 'ever': [1853, 1627],
 'again': [1054, 1232],
 'it': [7996, 7830],
 'men': [404, 562],
 'tell': [627, 514],
 'rest': [709, 529],
 'is': [8334, 8468],
 'imdb': [264, 139],
 'be': [5596, 4955],
 'these': [1545, 1494],
 'hanging': [94, 59],
 'two': [1647, 1877],
 'managed': [160, 1

In [54]:
#c.Calculate the following probability

#Probability of the occurrence
def calWordOccurProbability(w, vocaDict, totalCount):
    if w not in vocaDict:
        return 0
    else:
        return ((vocaDict[w][0] + vocaDict[w][1])/totalCount)

# Conditional probability based on the sentiment
def calCondWordProbability(w, vocaDict, label):
    countList = [0, 0]
    
    for d in trainData:
        if d[1] == 0:
            countList[0] += 1
        else:
            countList[1] += 1
    
    if w not in vocaDict:
        return 0
    else:
        return (vocaDict[w][label]/countList[label])
    

In [55]:
print('P["the"] = ', calWordOccurProbability('the', trainVocaDict, len(trainData)))
print('P["the"|Negative] = ', calCondWordProbability('the', trainVocaDict, 0))
print('P["the"|Positive] = ', calCondWordProbability('the', trainVocaDict, 1))

P["the"] =  0.9917866666666667
P["the"|Negative] =  0.9930997876857749
P["the"|Positive] =  0.9904608788853162


In [56]:
#d. Calculate accuracy using dev dataset 
def predict(rev, vocab, c, y, lenOfData):
    probList = list()
    
    for label in [0, 1]:
        p = log(c[label]/lenOfData)
        
        for w in rev[0]:
            if w not in vocab:
                continue
            if y == 0 and vocab[w][label] == 0:
                p = 0
                break
            p += log((vocab[w][label] + y) / (c[label] + y*len(vocab)))
        
        probList.append(p)
        
    return 0 if probList[0] > probList[1] else 1


def calKfoldDataAccu(givenData, y, k, numOfOccur):
    accuList = list()
    foldList = list()
    
    foldLen = len(givenData)//k
    duplData = list(givenData)
    
    for u in range(k):
        shuffle(duplData)
        singleFold = duplData[:foldLen]
        duplData = duplData[foldLen:]
    
        foldList.append(singleFold)
    
    for fold in foldList:
        trainFoldList = list(foldList)
        list(foldList).remove(fold)
        trainFoldList = sum(trainFoldList, [])
        testFoldList = fold
        
        vocab = buildVocab(trainFoldList, numOfOccur)
        
        countList = [0, 0]
        
        for d in trainFoldList:
            if d[1] == 0:
                countList[0] += 1
            else:
                countList[1] += 1
        
        matchCounter = 0
        
        for rev in testFoldList:
            if predict(rev, vocab, countList, y, len(trainFoldList)) == rev[1]:
                matchCounter += 1
        
        accuList.append(matchCounter / len(testFoldList))
        
    return accuList
        

In [57]:
seed(1)

#dev data accuracy without smoothing
print('5-Fold accuracy: ', calKfoldDataAccu(devData, 0, 5, 5))

5-Fold accuracy:  [0.5152, 0.4888, 0.4952, 0.5184, 0.5184]


In [58]:
#e. Do following experiments

#Compare the effect of Smoothing
def accuFunc(trainDataset, testDataset, y, numOfOccur):
    vocab = buildVocab(trainDataset, numOfOccur)
    
    countList = [0, 0]
        
    for d in trainDataset:
        if d[1] == 0:
            countList[0] += 1
        else:
            countList[1] += 1
            
    matchCounter = 0
    
    for rev in testDataset:
        if predict(rev, vocab, countList, y, len(trainDataset)) == rev[1]:
            matchCounter += 1
            
    return (matchCounter / len(testDataset))

In [59]:
print('Without smoothing: ', accuFunc(trainData, devData, 0, 5))

Without smoothing:  0.50176


In [60]:
print('With laplace estimate: ', accuFunc(trainData, devData, 1, 5))

With laplace estimate:  0.48096


In [61]:
#Derive Top 10 words that predict positive and negative class

def getTop10(data, l, y, numOfOccur):
    vocab = buildVocab(data, numOfOccur)
    
    wordList = []
    
    for w in list(vocab.keys()):
        v = (vocab[w][l] + y) / (vocab[w][1] + y + vocab[w][0] + y)
        wordList.append([w, v])
    
    wordList = np.array(wordList)
    wordList = wordList[np.lexsort(wordList.T)]
    
    return wordList[:10]
    
        

In [62]:
topNeg = getTop10(devData, 0, 1, 5)
topPos = getTop10(devData, 1, 1, 5)
print("Top 10 words(negative):\n ", topNeg)
print("\nTop 10 words(positive):\n ", topPos)

Top 10 words(negative):
  [['excellently' '0.045454545454545456']
 ['910' '0.05263157894736842']
 ['enchanting' '0.05555555555555555']
 ['planets' '0.058823529411764705']
 ['cushing' '0.0625']
 ['quintessential' '0.06666666666666667']
 ['wang' '0.06666666666666667']
 ['forties' '0.07142857142857142']
 ['symbols' '0.07142857142857142']
 ['grayson' '0.07692307692307693']]

Top 10 words(positive):
  [['stinker' '0.037037037037037035']
 ['unwatchable' '0.037037037037037035']
 ['unfunny' '0.04285714285714286']
 ['210' '0.05']
 ['lousy' '0.05']
 ['wretched' '0.0625']
 ['310' '0.06666666666666667']
 ['410' '0.06666666666666667']
 ['waste' '0.0670926517571885']
 ['godawful' '0.07142857142857142']]


In [63]:
#f. Using the test dataset

finalAccu = accuFunc(trainData, testData, 1, 5)

In [64]:
finalAccu

0.4846