In [18]:
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline
import random
import math
import pandas as pd

In [118]:
# Load the raw training reviews and their ratings as (cleaned_text, rating) pairs.
# Review i in the text file is paired with line i of the labels file.
# NOTE(review): only doubled "<br /><br />" tags are removed up front; a lone
# "<br />" would survive as the token "br" after the character filter below.
train_list = []
with open("imdb_train_text.txt","r") as text, open("imdb_train_labels.txt","r") as labels:
    for raw_review in text:
        rating_field = labels.readline()
        without_breaks = raw_review.replace("<br /><br />", " ")
        # Lowercase alphanumerics; every other character becomes a space.
        cleaned_chars = (ch.lower() if ch.isalnum() else " " for ch in without_breaks)
        train_list.append(("".join(cleaned_chars), int(rating_field)))

In [113]:
# Load the raw test reviews and their ratings as (cleaned_text, rating) pairs.
# Review i in the text file is paired with line i of the labels file.
# NOTE(review): only doubled "<br /><br />" tags are removed up front; a lone
# "<br />" would survive as the token "br" after the character filter below.
test_list = []
with open("imdb_test_text.txt","r") as text, open("imdb_test_labels.txt","r") as labels:
    for raw_review in text:
        rating_field = labels.readline()
        without_breaks = raw_review.replace("<br /><br />", " ")
        # Lowercase alphanumerics; every other character becomes a space.
        cleaned_chars = (ch.lower() if ch.isalnum() else " " for ch in without_breaks)
        test_list.append(("".join(cleaned_chars), int(rating_field)))

In [119]:
# Count how many training reviews carry each rating.
# try_dic maps rating -> number of training documents; classify() reads it
# both for the list of classes and for the (unnormalised) class prior.
# dict.get replaces the former bare `except:`, which caught every exception
# type rather than just the missing-key case.
try_dic = {}
for el in train_list:
    label = el[1]
    try_dic[label] = try_dic.get(label, 0) + 1

In [120]:
# Display the rating -> training-review count mapping.
try_dic

{1: 5100, 2: 2284, 3: 2420, 4: 2696, 7: 2496, 8: 3009, 9: 2263, 10: 4732}

In [23]:
# Token statistics for the Naive Bayes model:
#   vocab[word]                     -> occurrences of word across all training reviews
#   word_given_class[rating][word]  -> occurrences of word in reviews with that rating
vocab = {}
word_given_class = {}
for clas in try_dic:
    word_given_class[clas] = {}
for text, rating in train_list:
    for word in text.split():
        vocab[word] = vocab.get(word, 0) + 1
        per_class = word_given_class[rating]
        per_class[word] = per_class.get(word, 0) + 1

In [24]:
def size_d(dic):
    """Return the sum of all values stored in ``dic``.

    For a word -> count dictionary this is the total number of counted
    tokens. An empty dictionary gives 0.
    """
    return sum(dic.values())

In [25]:
# Total number of counted tokens per rating class; classify() uses it in the
# denominator of the smoothed word likelihood.
size_of_class = {clas: size_d(word_given_class[clas]) for clas in try_dic}
# Total token count over the whole training set (not the number of distinct
# words; classify() uses len(vocab) for that).
voc_size = size_d(vocab)

In [26]:
def classify(line):
    """Score a review against every rating class with multinomial Naive Bayes.

    ``line`` is only split on whitespace here; no cleaning is applied, so it
    should already be normalised the same way as the training text.

    For each class in ``try_dic`` the score starts at the log of that class's
    training-document count (an unnormalised prior) and then gains, per token,
    the log of the add-one-smoothed likelihood
    ``(count(word, class) + 1) / (tokens_in_class + len(vocab))``.

    Returns:
        ``(best_class, scores)`` where ``scores`` maps each class to its
        log-score and ``best_class`` is the key with the highest score.
    """
    tokens = line.split()
    res = {}
    for clas, doc_count in try_dic.items():
        counts = word_given_class[clas]
        denominator = size_of_class[clas] + len(vocab)
        score = 0
        score += math.log(doc_count)
        for token in tokens:
            # A word unseen in this class counts as 0, giving 1/denominator.
            score += math.log((counts.get(token, 0) + 1) / denominator)
        res[clas] = score
    return (max(res, key=res.get), res)

In [27]:
# Training-set accuracy: share of training reviews whose predicted rating
# equals the true rating.
correct = 0
wrong = 0
for text, rating in train_list:
    hit = classify(text)[0] == rating
    correct += hit
    wrong += not hit

print( " Train accuracy is ", correct/(correct+wrong))

 Train accuracy is  0.6844


In [28]:
# Test-set accuracy: share of test reviews whose predicted rating equals the
# true rating.
correct = 0
wrong = 0
for text, rating in test_list:
    hit = classify(text)[0] == rating
    correct += hit
    wrong += not hit

print( " Test accuracy is ", correct/(correct+wrong))

 Test accuracy is  0.38476


Random guessing baseline

In [29]:
# Baseline: guess a rating uniformly at random from the classes seen in training.
# A dedicated, seeded generator makes the reported figure reproducible without
# touching the global `random` state; the candidate list is built once instead
# of on every iteration.
rng = random.Random(0)
candidate_ratings = list(try_dic.keys())
correct = 0
wrong = 0
for text, rating in test_list:
    if rng.choice(candidate_ratings) == rating:
        correct += 1
    else:
        wrong += 1
print( " Random class accuracy is ", correct/(correct+wrong))

 Random class accuracy is  0.12328


Majority-class baseline

In [30]:
# Baseline: always predict the rating that is most frequent in the training set.
# The class is derived from the training counts instead of being hard-coded,
# so the baseline stays correct if the label distribution changes.
majority_class = max(try_dic, key=try_dic.get)
correct = 0
wrong = 0
for text, rating in test_list:
    if rating == majority_class:
        correct += 1
    else:
        wrong += 1
print( " Majority class accuracy is ", correct/(correct+wrong))

 Majority class accuracy is  0.20088


## Confusion Matrix

In [31]:
# Confusion matrix on the test set: rows are actual ratings, columns are
# predicted ratings, and margins=True appends the 'All' totals.
pred = [classify(text)[0] for text, rating in test_list]
act = [rating for text, rating in test_list]
y_actu = pd.Series(act, name='Actual')
y_pred = pd.Series(pred, name='Predicted')
df_confusion = pd.crosstab(
    y_actu,
    y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [32]:
# Render the confusion matrix (rows: actual rating, columns: predicted rating).
df_confusion

Predicted,1,2,3,4,7,8,9,10,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,4272,87,155,258,36,61,20,133,5022
2,1587,50,184,273,55,54,6,93,2302
3,1362,56,230,491,127,120,13,142,2541
4,1036,46,210,673,225,229,28,188,2635
7,400,9,80,262,424,521,73,538,2307
8,423,13,64,167,311,720,131,1021,2850
9,332,6,23,95,153,470,123,1142,2344
10,797,11,43,103,181,559,178,3127,4999
All,10209,278,989,2322,1512,2734,572,6384,25000


## Stemming and Stop Words

In [87]:
# Load the stop-word/stemming-processed training reviews and their ratings as
# (cleaned_text, rating) pairs. Review i is paired with line i of the labels file.
# NOTE(review): only doubled "<br /><br />" tags are removed up front; a lone
# "<br />" would survive as the token "br" after the character filter below.
train_list = []
with open("stop_train_text.txt","r") as text, open("imdb_train_labels.txt","r") as labels:
    for raw_review in text:
        rating_field = labels.readline()
        without_breaks = raw_review.replace("<br /><br />", " ")
        # Lowercase alphanumerics; every other character becomes a space.
        cleaned_chars = (ch.lower() if ch.isalnum() else " " for ch in without_breaks)
        train_list.append(("".join(cleaned_chars), int(rating_field)))

In [88]:
# Load the stop-word/stemming-processed test reviews and their ratings as
# (cleaned_text, rating) pairs. Review i is paired with line i of the labels file.
# NOTE(review): only doubled "<br /><br />" tags are removed up front; a lone
# "<br />" would survive as the token "br" after the character filter below.
test_list = []
with open("stop_test_text.txt","r") as text, open("imdb_test_labels.txt","r") as labels:
    for raw_review in text:
        rating_field = labels.readline()
        without_breaks = raw_review.replace("<br /><br />", " ")
        # Lowercase alphanumerics; every other character becomes a space.
        cleaned_chars = (ch.lower() if ch.isalnum() else " " for ch in without_breaks)
        test_list.append(("".join(cleaned_chars), int(rating_field)))

In [89]:
# Count how many training reviews carry each rating.
# try_dic maps rating -> number of training documents; classify() reads it
# both for the list of classes and for the (unnormalised) class prior.
# dict.get replaces the former bare `except:`, which caught every exception
# type rather than just the missing-key case.
try_dic = {}
for el in train_list:
    label = el[1]
    try_dic[label] = try_dic.get(label, 0) + 1

In [90]:
# Display the rating -> training-review count mapping.
try_dic

{1: 5100, 2: 2284, 3: 2420, 4: 2696, 7: 2496, 8: 3009, 9: 2263, 10: 4732}

In [91]:
# Token statistics for the Naive Bayes model:
#   vocab[word]                     -> occurrences of word across all training reviews
#   word_given_class[rating][word]  -> occurrences of word in reviews with that rating
vocab = {}
word_given_class = {}
for clas in try_dic:
    word_given_class[clas] = {}
for text, rating in train_list:
    for word in text.split():
        vocab[word] = vocab.get(word, 0) + 1
        per_class = word_given_class[rating]
        per_class[word] = per_class.get(word, 0) + 1

In [92]:
def size_d(dic):
    """Return the sum of all values stored in ``dic``.

    For a word -> count dictionary this is the total number of counted
    tokens. An empty dictionary gives 0.
    """
    return sum(dic.values())

In [93]:
# Total number of counted tokens per rating class; classify() uses it in the
# denominator of the smoothed word likelihood.
size_of_class = {clas: size_d(word_given_class[clas]) for clas in try_dic}
# Total token count over the whole training set (not the number of distinct
# words; classify() uses len(vocab) for that).
voc_size = size_d(vocab)

In [94]:
def classify(line):
    """Score a review against every rating class with multinomial Naive Bayes.

    ``line`` is only split on whitespace here; no cleaning is applied, so it
    should already be normalised the same way as the training text.

    For each class in ``try_dic`` the score starts at the log of that class's
    training-document count (an unnormalised prior) and then gains, per token,
    the log of the add-one-smoothed likelihood
    ``(count(word, class) + 1) / (tokens_in_class + len(vocab))``.

    Returns:
        ``(best_class, scores)`` where ``scores`` maps each class to its
        log-score and ``best_class`` is the key with the highest score.
    """
    tokens = line.split()
    res = {}
    for clas, doc_count in try_dic.items():
        counts = word_given_class[clas]
        denominator = size_of_class[clas] + len(vocab)
        score = 0
        score += math.log(doc_count)
        for token in tokens:
            # A word unseen in this class counts as 0, giving 1/denominator.
            score += math.log((counts.get(token, 0) + 1) / denominator)
        res[clas] = score
    return (max(res, key=res.get), res)

In [99]:
# Evaluate on the test set: collect predictions and true ratings, report the
# accuracy, then cross-tabulate predicted (rows) against actual (columns).
act = []
pred = []
for text, rating in test_list:
    p = classify(text)[0]
    pred.append(p)
    act.append(rating)
correct = sum(1 for guessed, truth in zip(pred, act) if guessed == truth)
wrong = len(act) - correct
print( " Test accuracy is ", correct/(correct+wrong))
y_actu = pd.Series(act, name='Actual')
y_pred = pd.Series(pred, name='Predicted')
df_confusion = pd.crosstab(
    y_pred,
    y_actu,
    rownames=['Predicted'],
    colnames=['Actual'],
    margins=True,
)
df_confusion

 Test accuracy is  0.38452


Actual,1,2,3,4,7,8,9,10,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,4155,1501,1266,925,340,356,281,595,9419
2,128,78,106,76,25,30,18,35,496
3,193,204,276,294,111,93,48,70,1289
4,249,284,501,666,288,208,116,138,2450
7,43,64,115,215,407,322,163,207,1536
8,70,44,106,206,497,692,432,534,2581
9,37,13,22,39,121,211,165,246,854
10,147,114,149,214,518,938,1121,3174,6375
All,5022,2302,2541,2635,2307,2850,2344,4999,25000


## Feature Engineering

In [100]:
# Load the positive and negative word lists (one entry per line) as sets so
# membership tests are fast.
# NOTE(review): every stripped line is kept, so any blank or header lines in
# these files end up in the sets as well — confirm the files are clean.
with open("positive-words.txt","r") as text:
    pos_words = {entry.strip() for entry in text}
with open("negative-words.txt","r") as text:
    neg_words = {entry.strip() for entry in text}

In [101]:
# Reload the raw training reviews and their ratings as (cleaned_text, rating)
# pairs. Review i in the text file is paired with line i of the labels file.
# NOTE(review): only doubled "<br /><br />" tags are removed up front; a lone
# "<br />" would survive as the token "br" after the character filter below.
train_list = []
with open("imdb_train_text.txt","r") as text, open("imdb_train_labels.txt","r") as labels:
    for raw_review in text:
        rating_field = labels.readline()
        without_breaks = raw_review.replace("<br /><br />", " ")
        # Lowercase alphanumerics; every other character becomes a space.
        cleaned_chars = (ch.lower() if ch.isalnum() else " " for ch in without_breaks)
        train_list.append(("".join(cleaned_chars), int(rating_field)))

In [102]:
# Reload the raw test reviews and their ratings as (cleaned_text, rating)
# pairs. Review i in the text file is paired with line i of the labels file.
# NOTE(review): only doubled "<br /><br />" tags are removed up front; a lone
# "<br />" would survive as the token "br" after the character filter below.
test_list = []
with open("imdb_test_text.txt","r") as text, open("imdb_test_labels.txt","r") as labels:
    for raw_review in text:
        rating_field = labels.readline()
        without_breaks = raw_review.replace("<br /><br />", " ")
        # Lowercase alphanumerics; every other character becomes a space.
        cleaned_chars = (ch.lower() if ch.isalnum() else " " for ch in without_breaks)
        test_list.append(("".join(cleaned_chars), int(rating_field)))

In [103]:
# Count how many training reviews carry each rating.
# try_dic maps rating -> number of training documents; classify() iterates
# its keys to enumerate the classes.
# dict.get replaces the former bare `except:`, which caught every exception
# type rather than just the missing-key case.
try_dic = {}
for el in train_list:
    label = el[1]
    try_dic[label] = try_dic.get(label, 0) + 1

In [104]:
# Display the rating -> training-review count mapping.
try_dic

{1: 5100, 2: 2284, 3: 2420, 4: 2696, 7: 2496, 8: 3009, 9: 2263, 10: 4732}

In [105]:
# Token statistics restricted to sentiment-lexicon words: a token is counted
# only if it appears in pos_words or neg_words.
#   vocab[word]                     -> occurrences of word across all training reviews
#   word_given_class[rating][word]  -> occurrences of word in reviews with that rating
vocab = {}
word_given_class = {}
for clas in try_dic:
    word_given_class[clas] = {}
for text, rating in train_list:
    for word in text.split():
        if word in pos_words or word in neg_words:
            vocab[word] = vocab.get(word, 0) + 1
            per_class = word_given_class[rating]
            per_class[word] = per_class.get(word, 0) + 1

In [106]:
def size_d(dic):
    """Return the sum of all values stored in ``dic``.

    For a word -> count dictionary this is the total number of counted
    tokens. An empty dictionary gives 0.
    """
    return sum(dic.values())

In [107]:
# Total number of counted tokens per rating class; classify() uses it in the
# denominator of the smoothed word likelihood.
size_of_class = {clas: size_d(word_given_class[clas]) for clas in try_dic}
# Total token count over the whole training set (not the number of distinct
# words; classify() uses len(vocab) for that).
voc_size = size_d(vocab)

In [108]:
def classify(line):
    """Score a review against every rating class using sentiment-lexicon words only.

    ``line`` is only split on whitespace here; no cleaning is applied, so it
    should already be normalised the same way as the training text. Tokens
    that are in neither ``pos_words`` nor ``neg_words`` are ignored.

    Each class in ``try_dic`` starts at a score of 0 (no class-prior term is
    added in this variant) and gains, per retained token, the log of the
    add-one-smoothed likelihood
    ``(count(word, class) + 1) / (tokens_in_class + len(vocab))``.

    Returns:
        ``(best_class, scores)`` where ``scores`` maps each class to its
        log-score and ``best_class`` is the key with the highest score.
    """
    # The lexicon filter does not depend on the class, so apply it once.
    sentiment_tokens = [
        token for token in line.split()
        if token in pos_words or token in neg_words
    ]
    res = {}
    for clas in try_dic:
        counts = word_given_class[clas]
        denominator = size_of_class[clas] + len(vocab)
        score = 0
        for token in sentiment_tokens:
            # A word unseen in this class counts as 0, giving 1/denominator.
            score += math.log((counts.get(token, 0) + 1) / denominator)
        res[clas] = score
    return (max(res, key=res.get), res)

In [109]:
# Test-set accuracy: share of test reviews whose predicted rating equals the
# true rating.
correct = 0
wrong = 0
for text, rating in test_list:
    hit = classify(text)[0] == rating
    correct += hit
    wrong += not hit

print( " Test accuracy is ", correct/(correct+wrong))

 Test accuracy is  0.36436


In [110]:
# Evaluate on the test set: collect predictions and true ratings, report the
# accuracy, then cross-tabulate predicted (rows) against actual (columns).
act = []
pred = []
for text, rating in test_list:
    p = classify(text)[0]
    pred.append(p)
    act.append(rating)
correct = sum(1 for guessed, truth in zip(pred, act) if guessed == truth)
wrong = len(act) - correct
print( " Test accuracy is ", correct/(correct+wrong))
y_actu = pd.Series(act, name='Actual')
y_pred = pd.Series(pred, name='Predicted')
df_confusion = pd.crosstab(
    y_pred,
    y_actu,
    rownames=['Predicted'],
    colnames=['Actual'],
    margins=True,
)
df_confusion

 Test accuracy is  0.36436


Actual,1,2,3,4,7,8,9,10,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3190,942,677,444,102,91,82,223,5751
2,616,328,356,308,70,53,46,91,1868
3,469,374,504,479,144,130,51,96,2247
4,362,360,546,679,306,217,113,161,2744
7,116,104,172,304,644,604,308,395,2647
8,64,67,95,185,407,538,387,568,2311
9,72,37,63,108,229,412,396,635,1952
10,133,90,128,128,405,805,961,2830,5480
All,5022,2302,2541,2635,2307,2850,2344,4999,25000


In [111]:
# Spot-check one test review (index 190): predicted rating plus per-class log-scores.
classify(test_list[190][0])

(10,
 {1: -117.98546666877958,
  2: -115.26586762348616,
  3: -114.79935725821286,
  4: -116.76894292448473,
  7: -115.28920533766437,
  8: -113.99675673461151,
  9: -112.85836912088479,
  10: -112.62042076362854})

In [112]:
# True rating of test review 190, for comparison with the prediction.
test_list[190][1]

7