In [1]:
############ Compulsory Standard Library #################
import glob
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
import re
import graphviz

############ Sklearn pre-processing Library #################
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score

############ Sklearn model Library #################
from sklearn.linear_model import LogisticRegression

In [2]:
def load_imdb(path):
    
    print("Loading the imdb data")
    
    train_neg_files = glob.glob(path+"/train/neg/*.txt")
    train_pos_files = glob.glob(path+"/train/pos/*.txt")
    
    X_train_corpus = []
    y_train = []
    
    for tnf in train_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(0)
        f.close()
    
    for tpf in train_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(1)
        f.close()
    
    print("Train Data loaded.")
    
    test_neg_files = glob.glob(path+"/test/neg/*.txt")
    test_pos_files = glob.glob(path+"/test/pos/*.txt")
    
    X_test_corpus = []
    y_test = []
    
    for tnf in test_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(0)
        f.close()
    
    for tpf in test_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(1)
        f.close()
    
    print("Test Data loaded.")
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train_corpus, y_train, X_test_corpus , y_test

In [3]:
def load_list(filename, split_delimiter):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip().split(split_delimiter))
    return np.asarray(vocabulary)

In [4]:
def replace_contraction(corpus, cont_list):
    for i in range(0, cont_list.shape[0]):
        corpus = corpus.lower().replace(cont_list[i,0], cont_list[i,1])
    return corpus

In [12]:
def update_corpus_contraction(X_corpus):
    cont_list = load_list("contraction_list.txt", ',')
    print(cont_list.shape)
    print('corpus update start')
    for i in range(0,len(X_corpus)):
        X_corpus[i] = cleanhtml(X_corpus[i])
        X_corpus[i] = replace_contraction(X_corpus[i], cont_list)
    print('corpus update end')
    print()
    return X_corpus

In [6]:
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

In [7]:
def sort_top_words_with_count(X, y, words, weights, filename, top_k=10):
    sorted_indices_descending_abs = np.argsort(np.absolute(weights))[::-1]
    
    filename = filename + '.txt'
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[: top_k]:
#             print("%s\t%0.2f" %(words[i], weights[i]))
            n_p=negative_positive_counts(X, y, i)
            w.write("%s\t%0.2f\t%d\t%d" %(str(words[i]), weights[i], n_p[0], n_p[1]))
            w.write('\n')
        w.close()

In [8]:
def cleanhtml(text):
    cleanr = re.compile('<.*?>')
    cleantag = re.sub(cleanr, '', text)
    cleantext = cleantag.replace('br', '')
    return cleantext

# Reference :
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

### Load the data

In [9]:
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb('../aclImdb')

Loading the imdb data
Train Data loaded.
Test Data loaded.


### Corpus Update (Remove the contraction)
Such as :<br>
[isn't $\rightarrow$ is not] <br>
[haven't $\rightarrow$ have not] <br>

In [14]:
X_train_corpus_update = update_corpus_contraction(X_train_corpus)
# X_test_corpus_update = update_corpus_contraction(X_test_corpus)

(58, 2)
corpus update start
corpus update end



### top 100 words extraction (1-5 grams)

In [15]:
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True, token_pattern=token)

In [16]:
for i in range(1,6):
    print('Processing',i,'grams')
    tf_vectorizer.set_params(ngram_range=(i,i))
    X_train = tf_vectorizer.fit_transform(X_train_corpus_update)
    words = tf_vectorizer.get_feature_names()
    
    clf = LogisticRegression(penalty='l2', C=1, random_state=42)
    clf.fit(X_train, y_train)

    filename=str(i)+"gram"
    sort_top_words_with_count(X_train, y_train, words, clf.coef_.flatten(), filename, top_k=1000)
    
    del clf
    del X_train
    del words

Processing 1 grams
Processing 2 grams
Processing 3 grams
Processing 4 grams
Processing 5 grams


### Generate tree relation

Reference : <br>
https://stackoverflow.com/questions/2358045/how-can-i-implement-a-tree-in-python-are-there-any-built-in-data-structures-in
http://anytree.readthedocs.io/en/latest/

In [17]:
one_gram = load_list("1gram.txt",'\t')
two_gram = load_list("2gram.txt",'\t')
three_gram = load_list("3gram.txt", '\t')
four_gram = load_list("4gram.txt", '\t')
five_gram = load_list("5gram.txt", '\t')

In [18]:
for i in one_gram:
    for j in two_gram:
#         abs_weight = [np.absolute(j[1]), np.absolute(i[1])]
        split_words = j[0].split()
        for k in range(0, len(split_words)):
            if i[0] == split_words[k] and (j[1]>i[1]):
                print(i[0], i[1],'\t', j[0], j[1])

worst -2.37 	 the worst -2.53
supposed -1.08 	 not supposed 0.50
bad -1.05 	 bad thing 0.67
7 1.03 	 a 7 1.22
highly 1.03 	 highly recommended 1.28
favorite 1.00 	 my favorite 1.06
loved 0.99 	 loved this 1.04
great 0.94 	 is great 1.00
unless -0.94 	 unless you -1.01
disappointed -0.88 	 not disappointed 0.62
disappointed -0.88 	 be disappointed 0.56
recommended 0.87 	 highly recommended 1.28
slow -0.81 	 slow but 0.44
masterpiece 0.81 	 a masterpiece 0.88
best 0.81 	 the best 1.00
predictable -0.79 	 predictable but 0.47
grade -0.79 	 grade b 0.62
fun 0.75 	 fun and 0.91
entertaining 0.72 	 very entertaining 0.92
1 -0.70 	 a 1 -0.86
definitely 0.68 	 definitely worth 1.27
illiant 0.66 	 is illiant 0.72
simple 0.65 	 a simple 0.66
realistic 0.64 	 more realistic 0.68
pleasantly 0.63 	 pleasantly surprised 0.68
enjoy 0.63 	 and enjoy 0.73
bit 0.62 	 a bit 0.77
shame -0.61 	 shame on -0.63
focuses 0.61 	 focuses on 0.62
surprised 0.60 	 pleasantly surprised 0.68
sorry -0.60 	 am sorry -

In [19]:
for i in two_gram:
    for j in three_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])

the worst -2.53 	 of the worst -2.80
waste of -1.55 	 a waste of -1.68
waste of -1.55 	 waste of time -1.56
than this -1.23 	 better than this -1.37
a must 1.09 	 a must see 1.44
worst movie -1.08 	 the worst movie -1.35
loved this 1.04 	 i loved this 1.37
an excellent 1.03 	 is an excellent 1.06
unless you -1.01 	 unless you are -1.35
the best 1.00 	 of the best 1.54
very bad -0.94 	 a very bad -1.05
not even -0.94 	 is not even -1.27
sit through -0.91 	 to sit through -1.22
love this 0.90 	 i love this 1.52
love this 0.90 	 love this movie 0.91
a great 0.90 	 a great job 1.20
a great 0.90 	 is a great 1.06
a great 0.90 	 with a great 0.97
very good 0.90 	 very good and 0.91
so bad -0.89 	 is so bad -1.10
the funniest 0.88 	 of the funniest 0.91
must see 0.88 	 a must see 1.44
must see 0.88 	 must see for 0.89
very disappointed -0.87 	 was very disappointed -0.94
your time -0.87 	 waste your time -1.04
at all -0.86 	 at all costs -1.21
a 1 -0.86 	 it a 10 0.56
skip this -0.85 	 skip t

would not -0.45 	 they would not -0.53
although the 0.45 	 although the film 0.52
movie can -0.45 	 movie can be -0.53
not worthy -0.45 	 not worthy of -0.65
only to -0.45 	 if only to -0.57
do not -0.45 	 do not bother -0.93
do not -0.45 	 do not even -0.93
do not -0.45 	 do not waste -0.91
do not -0.45 	 do not miss 0.90
do not -0.45 	 do not recommend -0.73
do not -0.45 	 do not want 0.61
do not -0.45 	 really do not -0.61
do not -0.45 	 do not give -0.58
do not -0.45 	 do not go -0.55
do not -0.45 	 do not help -0.53
do not -0.45 	 do not watch -0.52
do not -0.45 	 do not believe -0.51
do not -0.45 	 i do not -0.50
not like 0.45 	 does not like 0.58
fell in 0.45 	 fell in love 0.63
my pare 0.44 	 my pare nots 0.60
after watching -0.44 	 after watching this -0.65
was pleasantly 0.44 	 was pleasantly surprised 0.84
was pleasantly 0.44 	 i was pleasantly 0.52
true story 0.44 	 a true story 0.99
makes you 0.44 	 makes you think 0.66
makes you 0.44 	 makes you feel 0.59
many years 0.44 

In [20]:
for i in three_gram:
    for j in four_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])

of the worst -2.80 	 one of the worst -2.95
a waste of -1.68 	 a waste of time -1.82
is the worst -1.56 	 this is the worst -1.74
waste of time -1.56 	 a waste of time -1.82
of the best 1.54 	 one of the best 1.81
i love this 1.52 	 i love this movie 1.80
a must see 1.44 	 a must see for 1.90
i loved this 1.37 	 i loved this movie 1.58
is not even -1.27 	 it is not even -1.41
to sit through -1.22 	 to sit through this -1.30
at all costs -1.21 	 avoid at all costs -1.74
i recommend this 1.20 	 i recommend this movie 1.29
a great job 1.20 	 does a great job 1.67
the only good -1.17 	 the only good thing -1.66
none of the -1.15 	 none of the characters -1.27
that is it -1.12 	 and that is it -1.39
i highly recommend 1.11 	 i highly recommend this 1.59
i highly recommend 1.11 	 i highly recommend it 1.22
an insult to -1.09 	 is an insult to -1.10
save your money -1.06 	 save your money and -1.30
is an excellent 1.06 	 this is an excellent 1.53
is a great 1.06 	 this is a great 1.64
a very 

is just a -0.57 	 it is just another -0.81
is just a -0.57 	 is just as good 0.74
not to like 0.57 	 what's not to like 0.82
is just bad -0.57 	 it is just bad -0.91
to new york 0.57 	 to new york city 0.76
it takes a 0.57 	 it takes a little 0.79
this film i -0.57 	 about this film i -1.04
this film i -0.57 	 this film is awful -0.96
this film i -0.57 	 bad this film is -0.83
this film i -0.57 	 this film is so -0.78
this film i -0.57 	 in this film it -0.77
this film i -0.57 	 this film is just -0.75
glued to the 0.57 	 glued to the screen 1.06
no more than -0.57 	 no more than a -0.81
and in my 0.57 	 and in my opinion 0.80
hard to be -0.57 	 so hard to be -0.82
in the theater -0.57 	 sat in the theater -0.74
the very worst -0.56 	 of the very worst -0.73
is not scary -0.56 	 it is not scary -0.73
make a film -0.56 	 to make a film -1.05
appeared to be -0.56 	 what appeared to be -0.80
we would have 0.56 	 we would have a 0.74
film is so -0.56 	 this film is so -0.78
do i have -0.56

In [21]:
for i in four_gram:
    for j in five_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])

do not waste your -1.83 	 do not waste your time -2.35
do not waste your -1.83 	 do not waste your money -1.99
i really enjoyed this 1.71 	 i really enjoyed this movie 2.06
of the most boring -1.43 	 one of the most boring -2.01
it is not even -1.41 	 and it is not even -1.59
this is a must 1.36 	 this is a must see 1.94
do not miss this 1.34 	 do not miss this one 1.79
is nothing more than -1.29 	 is nothing more than a -1.78
of the most awful -1.26 	 one of the most awful -1.59
there is no plot -1.25 	 there is no plot to -1.43
worst film i have -1.24 	 the worst film i have -1.55
a lot of fun 1.24 	 is a lot of fun 1.58
complete waste of time -1.23 	 a complete waste of time -1.78
does an excellent job 1.21 	 does an excellent job of 1.30
i do not recommend -1.20 	 i do not recommend this -1.32
you won't be disappointed 1.19 	 it you won't be disappointed 1.23
the worst i have -1.19 	 the worst i have seen -1.62
this is a wonderful 1.18 	 this is a wonderful film 1.39
i was not disa

In [22]:
from anytree import Node, RenderTree

parent_index = 0
child_index = 0
parent_node = [Node("") for _ in range(len(three_gram))]
child_node = [Node("") for _ in range(len(five_gram))]

for i in three_gram:
    parent_node[parent_index] = Node(i[0])
    for j in five_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])
            child_node = Node(j[0], parent=parent_node[parent_index])
    parent_index = parent_index + 1



waste of time -1.56 	 a complete waste of time -1.78
waste of time -1.56 	 a total waste of time -1.64
of the best 1.54 	 is one of the best 1.58
a must see 1.44 	 this is a must see 1.94
the worst film -1.42 	 the worst film i have -1.55
the worst movie -1.35 	 the worst movie ever made -1.85
the worst movie -1.35 	 one of the worst movies -1.61
is not even -1.27 	 and it is not even -1.59
is not worth -1.26 	 this movie is not worth -1.45
a great job 1.20 	 does a great job of 1.35
the only good -1.17 	 the only good thing about -1.36
none of the -1.15 	 none of the characters are -1.21
i highly recommend 1.11 	 i highly recommend this film 1.57
i highly recommend 1.11 	 i highly recommend this movie 1.27
is so bad -1.10 	 this movie is so bad -1.91
is so bad -1.10 	 this film is so bad -1.37
is so bad -1.10 	 is so bad that it -1.22
is an excellent 1.06 	 this is an excellent film 1.36
is an excellent 1.06 	 is an excellent example of 1.09
is a great 1.06 	 this is a great film 1.60

of my life -0.73 	 90 minutes of my life -1.13
of my life -0.73 	 two hours of my life -1.01
of my life -0.73 	 minutes of my life i -0.97
of my life -0.73 	 minutes of my life back -0.91
is worth watching 0.73 	 it is worth watching for 1.14
was pretty good 0.73 	 the movie was pretty good 1.34
on the edge 0.72 	 you on the edge of 0.99
on the edge 0.72 	 on the edge of the 0.97
more like a -0.71 	 it is more like a -1.21
be the worst -0.71 	 has to be the worst -0.93
what a waste -0.71 	 what a waste of time -1.35
it a 2 -0.70 	 i give it a 2 -1.45
it a 2 -0.70 	 i gave it a 2 -1.14
excuse for a -0.70 	 excuse for a motion picture -0.97
excuse for a -0.70 	 this poor excuse for a -0.95
excuse for a -0.70 	 this sorry excuse for a -0.93
as well as 0.70 	 as well as the other 0.93
was looking forward -0.70 	 i was looking forward to -1.74
not one of -0.69 	 is not one of them -1.29
not one of -0.69 	 this was not one of -1.01
this movie to 0.69 	 recommend this movie to everyone 1.19
t

this film i -0.57 	 i watched this film i -1.00
this film i -0.57 	 this film is not for 0.99
this film i -0.57 	 to watch this film i -0.95
this film i -0.57 	 everything about this film is -0.94
this film i -0.57 	 star of this film is 0.93
this film i -0.57 	 about this film is that 0.93
this film i -0.57 	 while watching this film i 0.92
for anyone who 0.57 	 must see for anyone who 0.96
at all i -0.57 	 at all it is a 0.95
the very worst -0.56 	 one of the very worst -0.98
other side of 0.56 	 the other side of the 0.95
the only problem 0.56 	 the only problem i had 0.93
film is so -0.56 	 this film is so bad -1.37
a total waste -0.56 	 a total waste of time -1.64
a total waste -0.56 	 is a total waste of -1.09
not being a 0.56 	 at not being able to 0.93
sci fi channel -0.56 	 on the sci fi channel -0.91
the man in 0.56 	 the man in the moon 1.24
he is one -0.56 	 he is one of the -1.09
scene where a -0.55 	 is a scene where a -1.02
does not matter 0.55 	 it does not matter the 0

In [24]:
for i in range(0,len(three_gram)):
    for pre, fill, node in RenderTree(parent_node[i]):
        if parent_node[i].height != 0:
            print("%s%s" % (pre, node.name))
        else:
            print() 




waste of time
├── a complete waste of time
└── a total waste of time
of the best
└── is one of the best

a must see
└── this is a must see
the worst film
└── the worst film i have


the worst movie
├── the worst movie ever made
└── one of the worst movies


is not even
└── and it is not even
is not worth
└── this movie is not worth





a great job
└── does a great job of
the only good
└── the only good thing about
none of the
└── none of the characters are


i highly recommend
├── i highly recommend this film
└── i highly recommend this movie
is so bad
├── this movie is so bad
├── this film is so bad
└── is so bad that it



is an excellent
├── this is an excellent film
└── is an excellent example of
is a great
├── this is a great film
├── this is a great movie
├── it is a great movie
└── it is a great story


waste your time
└── do not waste your time

one of my
├── this is one of my
├── it is one of my
├── is actually one of my
├── one of my favorite movies
├── one of my all time

was a very
└── this was a very good

out on dvd
├── it comes out on dvd
└── to come out on dvd

of his best
├── one of his best roles
└── one of his best performances






does a great
├── does a great job of
└── does a great job portraying
sorry for the
├── feel sorry for the actors
└── i felt sorry for the



characters and the
└── of the characters and their

the acting was
├── the acting was terrible the
└── the acting was bad the
it has a
└── it has a lot of


not a masterpiece
└── it is not a masterpiece



attack of the
└── attack of the killer tomatoes





recommend it to
├── i would recommend it to
├── i recommend it to everyone
├── recommend it to anyone who
└── i'd recommend it to anyone
you will love
└── you will love this film





in all a
├── all in all a good
└── all in all an excellent

it wants to
└── what it wants to be
the most overrated
└── one of the most overrated

not much of
├── not much of a fan
├── was not much of a
└── is not much of an
fell in love
├── i 