In [1]:
############ Compulsory Standard Library #################
import glob
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
import re
import graphviz
from anytree import Node, RenderTree

############ Sklearn pre-processing Library #################
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score

############ Sklearn model Library #################
from sklearn.linear_model import LogisticRegression

In [2]:
def load_imdb(path):
    
    print("Loading the imdb data")
    
    train_neg_files = glob.glob(path+"/train/neg/*.txt")
    train_pos_files = glob.glob(path+"/train/pos/*.txt")
    
    X_train_corpus = []
    y_train = []
    
    for tnf in train_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(0)
        f.close()
    
    for tpf in train_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(1)
        f.close()
    
    print("Train Data loaded.")
    
    test_neg_files = glob.glob(path+"/test/neg/*.txt")
    test_pos_files = glob.glob(path+"/test/pos/*.txt")
    
    X_test_corpus = []
    y_test = []
    
    for tnf in test_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(0)
        f.close()
    
    for tpf in test_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(1)
        f.close()
    
    print("Test Data loaded.")
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train_corpus, y_train, X_test_corpus , y_test

In [3]:
def load_list(filename, split_delimiter):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip().split(split_delimiter))
    return np.asarray(vocabulary)

In [4]:
def replace_contraction(corpus, cont_list):
    for i in range(0, cont_list.shape[0]):
        corpus = corpus.lower().replace(cont_list[i,0], cont_list[i,1])
    return corpus

In [5]:
def update_corpus_contraction(X_corpus):
    cont_list = load_list("contraction_list.txt", ',')
    print(cont_list.shape)
    print('corpus update start')
    for i in range(0,len(X_corpus)):
        X_corpus[i] = cleanhtml(X_corpus[i])
        X_corpus[i] = replace_contraction(X_corpus[i], cont_list)
    print('corpus update end')
    print()
    return X_corpus

In [37]:
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

In [36]:
def log_ratio_positive_negative(X, y, word_index):
    neg_count, pos_count = negative_positive_counts(X,y, word_index)
    log_ratio = np.log(pos_count)-np.log(neg_count)
    return log_ratio, neg_count, pos_count

In [35]:
def sort_top_words_with_count(X, y, words,filename, top_k=10):
    log_ratio = []
    neg_count = []
    pos_count = []
    
    for i in range(0,len(words)):
        log_ratio_, neg_count_, pos_count_ = log_ratio_positive_negative(X, y, i)
        log_ratio.append(log_ratio_)
        neg_count.append(neg_count_)
        pos_count.append(pos_count_)
    
    sorted_indices_descending_abs = np.argsort(np.absolute(log_ratio))[::-1]
    
    filename = filename + '.txt'
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[: top_k]:
#             print("%s\t%0.2f" %(words[i], weights[i]))
#             n_p=negative_positive_counts(X, y, i)
            w.write("%s\t%0.2f\t%d\t%d" %(str(words[i]), log_ratio[i], neg_count[i], pos_count[i]))
            w.write('\n')
        w.close()

In [8]:
def cleanhtml(text):
    cleanr = re.compile('<.*?>')
    cleantag = re.sub(cleanr, '', text)
    cleantext = cleantag.replace('br', '')
    return cleantext

# Reference :
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

### Load the data

In [9]:
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb('../aclImdb')

Loading the imdb data
Train Data loaded.
Test Data loaded.


### Corpus Update (Remove the contraction)
Such as :<br>
[isn't $\rightarrow$ is not] <br>
[haven't $\rightarrow$ have not] <br>
ain't -> are not (despite it also could be a 'is not' or 'am not')

In [15]:
X_train_corpus_update = update_corpus_contraction(X_train_corpus)
# X_test_corpus_update = update_corpus_contraction(X_test_corpus)

(72, 2)
corpus update start
corpus update end



### top 100 words extraction (1-5 grams)

In [39]:
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=100, binary=True, token_pattern=token)

In [17]:
tf_vectorizer.set_params(ngram_range=(1,1))
X_train = tf_vectorizer.fit_transform(X_train_corpus_update)
words = tf_vectorizer.get_feature_names()

In [18]:
len(words)

3851

In [38]:
sort_top_words_with_count(X_train, y_train, words, '1gram', top_k=100)

In [58]:
for i in range(1,6):
    print('Processing',i,'grams')
    tf_vectorizer.set_params(ngram_range=(i,i))
    X_train = tf_vectorizer.fit_transform(X_train_corpus_update)
    words = tf_vectorizer.get_feature_names()
    
#     clf = LogisticRegression(penalty='l2', C=1, random_state=42)
#     clf.fit(X_train, y_train)

    filename=str(i)+"gram"
    sort_top_words_with_count(X_train, y_train, words, filename, top_k=100)
    
#     del clf
    del X_train
    del words

Processing 1 grams
Processing 2 grams
Processing 3 grams


  This is separate from the ipykernel package so we can avoid doing imports until


Processing 4 grams


  This is separate from the ipykernel package so we can avoid doing imports until


Processing 5 grams


### Generate tree relation

Reference : <br>
https://stackoverflow.com/questions/2358045/how-can-i-implement-a-tree-in-python-are-there-any-built-in-data-structures-in
http://anytree.readthedocs.io/en/latest/

In [60]:
one_gram = load_list("1gram.txt",'\t')
two_gram = load_list("2gram.txt",'\t')
three_gram = load_list("3gram.txt", '\t')
four_gram = load_list("4gram.txt", '\t')
five_gram = load_list("5gram.txt", '\t')

In [61]:
string_one = "hi"
string_two = "my"
string_three = "friend"
print(string_one + " " + string_two + " "+ string_three)

hi my friend


In [63]:
negated = []
amplifier = []
ind_words = 0
for i in one_gram:
    for j in two_gram:
        split_words = j[0].split()
        after_weight = float(j[1])
        previous_weight = float(i[1])
        for k in range(0, len(split_words)):
            if i[0] == split_words[k]:
                if k == 0:
                    ind_words = 1
                elif k == 1:
                    ind_words = 0
                   
                if np.sign(after_weight)!=np.sign(previous_weight) or np.absolute(after_weight) < np.absolute(previous_weight):
                    negated.append(split_words[ind_words])
                else:
                    amplifier.append(split_words[ind_words])
                print(i[0], i[1],'\t', j[0], j[1])
print()
print('negated')
print(np.unique(negated))
print('amplifier')
print(np.unique(amplifier))

waste -2.55 	 waste your -3.39
waste -2.55 	 waste of -3.05
waste -2.55 	 not waste -2.97
waste -2.55 	 a waste -2.50
waste -2.55 	 to waste -2.00
worst -2.20 	 worst films -4.65
worst -2.20 	 worst movies -3.90
worst -2.20 	 worst film -3.63
worst -2.20 	 worst movie -3.41
worst -2.20 	 the worst -2.32
worst -2.20 	 worst of -1.96
awful -2.14 	 awful the -3.30
awful -2.14 	 is awful -3.06
awful -2.14 	 was awful -3.02
awful -2.14 	 awful i -2.95
awful -2.14 	 awful and -2.65
poorly -2.11 	 poorly written -2.65
captures 2.06 	 captures the 2.65
existent -2.04 	 non existent -2.01
insult -1.94 	 an insult -2.36
insult -1.94 	 insult to -2.29
crap -1.78 	 this crap -4.26
crap -1.78 	 of crap -2.43
mess -1.72 	 this mess -3.65
horrible -1.71 	 is horrible -2.22
superb 1.69 	 is superb 1.99
superb 1.69 	 a superb 1.88
pile -1.65 	 pile of -2.05
terrible -1.64 	 was terrible -2.75
terrible -1.64 	 terrible the -2.61
terrible -1.64 	 is terrible -2.47
worse -1.61 	 is worse -3.01
worse -1.61

In [None]:
negated = []
amplifier = []
ind_words = 0
for i in one_gram:
    for j in three_gram:
        split_words = j[0].split()
        after_weight = float(j[1])
        previous_weight = float(i[1])
        for k in range(0, len(split_words)):
            if i[0] == split_words[k] and (np.absolute(after_weight)>np.absolute(previous_weight) or np.sign(after_weight)!=np.sign(previous_weight)):
                if k == 0:
                    ind_words = 1
                elif k == 1:
                    ind_words = 0
                   
                if np.sign(after_weight)!=np.sign(previous_weight):
                    negated.append(split_words[ind_words])
                else:
                    amplifier.append(split_words[ind_words])
                print(i[0], i[1],'\t', j[0], j[1])
print()
print(negated)
print(amplifier)

In [64]:
for i in two_gram:
    for j in three_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0] and np.absolute(after_weight)>np.absolute(previous_weight):
            print(i[0], i[1],'\t', j[0], j[1])

worst movies -3.90 	 worst movies i -inf
worst movies -3.90 	 the worst movies -4.41
worst film -3.63 	 the worst film -3.96
worst movie -3.41 	 worst movies i -inf
worst movie -3.41 	 the worst movies -4.41
worst movie -3.41 	 worst movie i -4.07
worst movie -3.41 	 the worst movie -3.66
waste your -3.39 	 not waste your -3.53
not waste -2.97 	 not waste your -3.53
not waste -2.97 	 do not waste -3.30
all costs -2.60 	 at all costs -2.77
a waste -2.50 	 a waste of -2.81
only good -2.48 	 the only good -2.91
loved this 2.45 	 i loved this 3.03
of crap -2.43 	 piece of crap -3.75
not worth -2.34 	 is not worth -2.63
this piece -2.33 	 this piece of -2.83
the worst -2.32 	 the worst movies -4.41
the worst -2.32 	 the worst film -3.96
the worst -2.32 	 the worst movie -3.66
the worst -2.32 	 of the worst -3.24
the worst -2.32 	 is the worst -2.70
a must 2.19 	 is a must 2.30
a must 2.19 	 a must see 2.28
so bad -2.14 	 was so bad -2.99
so bad -2.14 	 is so bad -2.72
so bad -2.14 	 so bad 

In [65]:
for i in three_gram:
    for j in four_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0] and np.absolute(after_weight)>np.absolute(previous_weight):
            print(i[0], i[1],'\t', j[0], j[1])

the worst movie -3.66 	 of the worst movies -4.36
the worst movie -3.66 	 the worst movie i -3.94
not waste your -3.53 	 do not waste your -3.63
do not waste -3.30 	 do not waste your -3.63
waste your time -3.28 	 not waste your time -3.32
of the worst -3.24 	 of the worst movies -4.36
of the worst -3.24 	 one of the worst -3.42
a waste of -2.81 	 a waste of time -2.97
is a must 2.30 	 is a must see 2.41
a must see 2.28 	 a must see for 2.67
a must see 2.28 	 is a must see 2.41
is not even -2.14 	 it is not even -2.70
bad it is -1.93 	 so bad it is -2.22
not watch this -1.87 	 do not watch this -2.20
of the best 1.82 	 one of the best 1.98
are supposed to -1.57 	 are supposed to be -1.68
is a great 1.45 	 this is a great 2.54
is a great 1.45 	 it is a great 1.88
i first saw 1.39 	 i first saw this 1.76
is supposed to -1.38 	 is supposed to be -1.53
supposed to be -1.37 	 supposed to be a -1.82
supposed to be -1.37 	 are supposed to be -1.68
supposed to be -1.37 	 is supposed to be -1.5

In [67]:
for i in four_gram:
    for j in five_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0] and np.absolute(after_weight)>np.absolute(previous_weight):
            print(i[0], i[1],'\t', j[0], j[1])

of the worst movies -4.36 	 one of the worst movies -4.32
worst movie i have -3.96 	 the worst movie i have -3.85
worst movie i have -3.96 	 worst movie i have ever -3.73
the worst movie i -3.94 	 the worst movie i have -3.85
do not waste your -3.63 	 do not waste your time -3.42
one of the worst -3.42 	 one of the worst movies -4.32
one of the worst -3.42 	 is one of the worst -3.73
not waste your time -3.32 	 do not waste your time -3.42
one of the best 1.98 	 is one of the best 2.35
supposed to be a -1.82 	 is supposed to be a -1.95
is supposed to be -1.53 	 is supposed to be a -1.95
movie i have ever -1.45 	 worst movie i have ever -3.73
movie i have ever -1.45 	 movie i have ever seen -1.41
do not get me -1.00 	 do not get me wrong -0.90
rest of the movie -0.97 	 the rest of the movie -1.00
you want to see -0.96 	 if you want to see -1.15
not get me wrong -0.90 	 do not get me wrong -0.90
if you want to -0.87 	 if you want to see -1.15
have ever seen i -0.86 	 i have ever seen i -

In [56]:


parent_index = 0
child_index = 0
parent_node = [Node("") for _ in range(len(three_gram))]
child_node = [Node("") for _ in range(len(five_gram))]

for i in three_gram:
    parent_node[parent_index] = Node(i[0])
    for j in five_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])
            child_node = Node(j[0], parent=parent_node[parent_index])
    parent_index = parent_index + 1



the worst movie -3.66 	 one of the worst movies -4.32
the worst movie -3.66 	 the worst movie i have -3.85
do not waste -3.30 	 do not waste your time -3.42
waste your time -3.28 	 do not waste your time -3.42
of the worst -3.24 	 one of the worst movies -4.32
of the worst -3.24 	 is one of the worst -3.73
of the best 1.82 	 is one of the best 2.35
is supposed to -1.38 	 is supposed to be a -1.95
supposed to be -1.37 	 is supposed to be a -1.95
ever seen and -0.78 	 i have ever seen and -0.88
movie i have -0.77 	 the worst movie i have -3.85
movie i have -0.77 	 worst movie i have ever -3.73
movie i have -0.77 	 movie i have ever seen -1.41
if you want -0.76 	 if you want to see -1.15
ever seen the -0.72 	 i have ever seen the -0.94
you want to -0.71 	 if you want to see -1.15
is one of 0.67 	 is one of the best 2.35
is one of 0.67 	 it is one of the 0.85
is one of 0.67 	 is one of the most 0.71
it is one 0.64 	 it is one of the 0.85
i have ever -0.61 	 worst movie i have ever -3.73
i 

In [57]:
for i in range(0,len(three_gram)):
    for pre, fill, node in RenderTree(parent_node[i]):
        if parent_node[i].height != 0:
            print("%s%s" % (pre, node.name))

the worst movie
├── one of the worst movies
└── the worst movie i have
do not waste
└── do not waste your time
waste your time
└── do not waste your time
of the worst
├── one of the worst movies
└── is one of the worst
of the best
└── is one of the best
is supposed to
└── is supposed to be a
supposed to be
└── is supposed to be a
ever seen and
└── i have ever seen and
movie i have
├── the worst movie i have
├── worst movie i have ever
└── movie i have ever seen
if you want
└── if you want to see
ever seen the
└── i have ever seen the
you want to
└── if you want to see
is one of
├── is one of the best
├── it is one of the
└── is one of the most
it is one
└── it is one of the
i have ever
├── worst movie i have ever
├── movie i have ever seen
├── i have ever seen the
├── i have ever seen and
├── movies i have ever seen
└── i have ever seen i
do not get
└── do not get me wrong
you have not
└── if you have not seen
want to see
└── if you want to see
have ever seen
├── movie i have ever seen