In [95]:
############ Compulsory Standard Library #################
import glob
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
import re
import graphviz
from anytree import Node, RenderTree
from textblob import TextBlob

############ Sklearn pre-processing Library #################
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score

############ Sklearn model Library #################
from sklearn.linear_model import LogisticRegression

In [2]:
def load_imdb(path):
    
    print("Loading the imdb data")
    
    train_neg_files = glob.glob(path+"/train/neg/*.txt")
    train_pos_files = glob.glob(path+"/train/pos/*.txt")
    
    X_train_corpus = []
    y_train = []
    
    for tnf in train_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(0)
        f.close()
    
    for tpf in train_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(1)
        f.close()
    
    print("Train Data loaded.")
    
    test_neg_files = glob.glob(path+"/test/neg/*.txt")
    test_pos_files = glob.glob(path+"/test/pos/*.txt")
    
    X_test_corpus = []
    y_test = []
    
    for tnf in test_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(0)
        f.close()
    
    for tpf in test_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(1)
        f.close()
    
    print("Test Data loaded.")
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train_corpus, y_train, X_test_corpus , y_test

In [3]:
'''
Read and load the contraction list (or any text files)
'''
def load_list(filename, split_delimiter):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip().split(split_delimiter))
    return np.asarray(vocabulary)

In [9]:
'''
Clean the HTML tags from the corpus
'''
def cleanhtml(text):
    cleanr = re.compile('<.*?>')
    cleantag = re.sub(cleanr, '', text)
    cleantext = cleantag.replace('br', '')
    return cleantext

# Reference :
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

In [4]:
'''
Replace the contraction words into two parts (by given contraction list)
'''
def replace_contraction(corpus, cont_list):
    for i in range(0, cont_list.shape[0]):
        corpus = corpus.lower().replace(cont_list[i,0], cont_list[i,1])
    return corpus

In [68]:
'''
Singularize the words by its POS-tag
'''
def word_singularize(corpus):
    text = TextBlob(corpus)
    for tag in text.tags:
        if tag[1] == 'NNS' and tag[0] != 'yes':
            corpus = corpus.replace(tag[0], tag[0].singularize())
    return corpus

In [65]:
'''
Update clean corpus
'''
def update_corpus_contraction(X_corpus):
    cont_list = load_list("contraction_list.txt", ',')
    print(cont_list.shape)
    print('corpus update start')
    for i in range(0,len(X_corpus)):
        X_corpus[i] = cleanhtml(X_corpus[i])
        X_corpus[i] = replace_contraction(X_corpus[i], cont_list)
        X_corpus[i] = word_singularize(X_corpus[i])
    print('corpus update end')
    print()
    return X_corpus

In [6]:
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

In [7]:
def log_ratio_positive_negative(X, y, word_index):
    neg_count, pos_count = negative_positive_counts(X,y, word_index)
    log_ratio = np.log(pos_count+1)-np.log(neg_count+1)
    return log_ratio, neg_count, pos_count

In [8]:
def sort_top_words_with_count(X, y, words,filename, top_k=10):
    log_ratio = []
    neg_count = []
    pos_count = []
    
    for i in range(0,len(words)):
        log_ratio_, neg_count_, pos_count_ = log_ratio_positive_negative(X, y, i)
        log_ratio.append(log_ratio_)
        neg_count.append(neg_count_)
        pos_count.append(pos_count_)
    
    sorted_indices_descending_abs = np.argsort(np.absolute(log_ratio))[::-1]
    
    filename = filename + '.txt'
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[: top_k]:
#             print("%s\t%0.2f" %(words[i], weights[i]))
#             n_p=negative_positive_counts(X, y, i)
            w.write("%s\t%0.2f\t%d\t%d" %(str(words[i]), log_ratio[i], neg_count[i], pos_count[i]))
            w.write('\n')
        w.close()

### Load the data

In [66]:
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb('../aclImdb')

Loading the imdb data
Train Data loaded.
Test Data loaded.


### Corpus Update 

Such as :<br>
[isn't $\rightarrow$ is not] <br>
[haven't $\rightarrow$ have not] <br>
ain't -> are not (despite it also could be a 'is not' or 'am not')
<br>

The steps are as follows :
1. Remove the HTML tags
2. Remove the contractions
3. Singularize nouns

In [69]:
X_train_corpus_update = update_corpus_contraction(X_train_corpus)
# X_test_corpus_update = update_corpus_contraction(X_test_corpus)

(72, 2)
corpus update start
corpus update end



In [94]:
text = TextBlob(X_train_corpus_update[2])
print(X_train_corpus[10])
print(text.tags)
print()
for sentences in text.sentences:
    print('->', sentences)

this film is one giant pant load. paul schrader is utterly lost in his own bad screenplay. and his directing is about as comatose as it can be without his actually having been sleepwalking during the process. the worst though is woody harrelson, whom i ordinarily like when he is properly cast. he plays "the walker", a homosexual man in d.c. who plays social companion to the bored wife of the washington elite. he could not have been more one dimensional if he had been cut out of a magazine and bounced around in front of the camera on a popsicle stick. his "southern accent" is that "off the rack" version that decrescendos from the beginning to the end of every line he delivers, as though the heat and humidity of the south is still draining him of every ounce of energy he has. it is monotonous. but, his is not the worst accent in the movie. his "boyfriend", played by moritz bleibtreau, attempt to affect some kind of a mid east accent that is so clumsy he can barely deliver the bad line wr

In [72]:
# print(text.tags)

# for tag in text.tags:
#     if tag[1] == 'NNS':
#         print(tag[0], ' ', tag[0].singularize())
#         X_train_corpus_update[2] = X_train_corpus_update[2].replace(tag[0], tag[0].singularize())

# print(X_train_corpus_update[2])

### top 100 words extraction (1-5 grams)

In [73]:
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=100, binary=True, token_pattern=token)

In [74]:
tf_vectorizer.set_params(ngram_range=(1,1))
X_train = tf_vectorizer.fit_transform(X_train_corpus_update)
words = tf_vectorizer.get_feature_names()

In [75]:
len(words)

3684

In [76]:
sort_top_words_with_count(X_train, y_train, words, '1gram', top_k=100)

In [77]:
def generate_grams_list():
    for i in range(1,6):
        print('Processing',i,'grams')
        tf_vectorizer.set_params(ngram_range=(i,i))
        X_train = tf_vectorizer.fit_transform(X_train_corpus_update)
        words = tf_vectorizer.get_feature_names()

    #     clf = LogisticRegression(penalty='l2', C=1, random_state=42)
    #     clf.fit(X_train, y_train)

        filename=str(i)+"gram"
        sort_top_words_with_count(X_train, y_train, words, filename, top_k=100)

    #     del clf
        del X_train
        del words

In [78]:
generate_grams_list()

Processing 1 grams
Processing 2 grams
Processing 3 grams
Processing 4 grams
Processing 5 grams


### Generate tree relation

Reference : <br>
https://stackoverflow.com/questions/2358045/how-can-i-implement-a-tree-in-python-are-there-any-built-in-data-structures-in
http://anytree.readthedocs.io/en/latest/ <br>
<br>
There is not sign changed when the top_words=100

In [79]:
one_gram = load_list("1gram.txt",'\t')
two_gram = load_list("2gram.txt",'\t')
three_gram = load_list("3gram.txt", '\t')
four_gram = load_list("4gram.txt", '\t')
five_gram = load_list("5gram.txt", '\t')

In [81]:
negated = []
amplifier = []
ind_words = 0
for i in one_gram:
    for j in two_gram:
        split_words = j[0].split()
        after_weight = float(j[1])
        previous_weight = float(i[1])
        for k in range(0, len(split_words)):
            if i[0] == split_words[k]:
                if k == 0:
                    ind_words = 1
                elif k == 1:
                    ind_words = 0
                   
                if np.sign(after_weight)!=np.sign(previous_weight) or np.absolute(after_weight) < np.absolute(previous_weight):
                    negated.append(split_words[ind_words])
                else:
                    amplifier.append(split_words[ind_words])
                print(i[0], i[1],'\t', j[0], j[1])
print()
print('negated')
print(np.unique(negated))
print('amplifier')
print(np.unique(amplifier))

#############################################
# top_words = 1000
# best 0.60 	 at best -1.80

# negated
# ['at']
# amplifier
# []

waste -2.52 	 waste your -3.31
waste -2.52 	 waste of -2.96
waste -2.52 	 not waste -2.92
waste -2.52 	 a waste -2.47
waste -2.52 	 to waste -1.95
redeeming -2.30 	 redeeming quality -2.24
worst -2.20 	 worst film -3.71
worst -2.20 	 worst movie -3.49
worst -2.20 	 the worst -2.31
worst -2.20 	 worst of -1.92
awful -2.13 	 awful the -3.08
awful -2.13 	 is awful -2.95
awful -2.13 	 was awful -2.84
awful -2.13 	 awful i -2.78
awful -2.13 	 awful and -2.54
poorly -2.10 	 poorly written -2.53
captures 2.01 	 captures the 2.51
existent -2.00 	 non existent -1.96
crap -1.74 	 this crap -3.86
crap -1.74 	 of crap -2.38
horrible -1.71 	 is horrible -2.15
mess -1.68 	 this mess -3.38
superb 1.68 	 is superb 1.95
superb 1.68 	 a superb 1.83
terrible -1.64 	 was terrible -2.65
terrible -1.64 	 terrible the -2.48
terrible -1.64 	 is terrible -2.41
insult -1.62 	 an insult -2.27
insult -1.62 	 insult to -2.21
worse -1.61 	 is worse -2.83
worse -1.61 	 even worse -2.39
worse -1.61 	 worse than -2.09

In [82]:
negated = []
amplifier = []
ind_words = 0
for i in one_gram:
    for j in three_gram:
        split_words = j[0].split()
        after_weight = float(j[1])
        previous_weight = float(i[1])
        for k in range(0, len(split_words)):
            if i[0] == split_words[k] and (np.absolute(after_weight)>np.absolute(previous_weight) or np.sign(after_weight)!=np.sign(previous_weight)):
                if k == 0:
                    ind_words = 1
                elif k == 1:
                    ind_words = 0
                   
                if np.sign(after_weight)!=np.sign(previous_weight):
                    negated.append(split_words[ind_words])
                else:
                    amplifier.append(split_words[ind_words])
                print(i[0], i[1],'\t', j[0], j[1])
print()
print('negated')
print(np.unique(negated))
print('amplifier')
print(np.unique(amplifier))

waste -2.52 	 not waste your -3.42
waste -2.52 	 do not waste -3.22
waste -2.52 	 waste your time -3.19
waste -2.52 	 waste of time -2.97
waste -2.52 	 a waste of -2.75
worst -2.20 	 worst movie i -4.26
worst -2.20 	 worst film i -4.24
worst -2.20 	 the worst film -3.85
worst -2.20 	 the worst movie -3.77
worst -2.20 	 of the worst -3.20
worst -2.20 	 worst movie ever -2.92
worst -2.20 	 is the worst -2.64
crap -1.74 	 piece of crap -3.50
wonderful 1.44 	 is a wonderful 1.54

negated
[]
amplifier
['a' 'do' 'film' 'movie' 'not' 'of' 'the' 'your']


In [83]:
for i in two_gram:
    for j in three_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0]:
            print(i[0], i[1],'\t', j[0], j[1])

worst film -3.71 	 worst film i -4.24
worst film -3.71 	 the worst film -3.85
worst movie -3.49 	 worst movie i -4.26
worst movie -3.49 	 the worst movie -3.77
worst movie -3.49 	 worst movie ever -2.92
waste your -3.31 	 not waste your -3.42
waste your -3.31 	 waste your time -3.19
waste of -2.96 	 waste of time -2.97
waste of -2.96 	 a waste of -2.75
not waste -2.92 	 not waste your -3.42
not waste -2.92 	 do not waste -3.22
not funny -2.54 	 is not funny -2.46
all cost -2.50 	 at all cost -2.58
a waste -2.47 	 a waste of -2.75
only good -2.40 	 the only good -2.79
of crap -2.38 	 piece of crap -3.50
loved this 2.38 	 i loved this 2.86
the worst -2.31 	 the worst film -3.85
the worst -2.31 	 the worst movie -3.77
the worst -2.31 	 of the worst -3.20
the worst -2.31 	 is the worst -2.64
the worst -2.31 	 not the worst -2.04
the worst -2.31 	 the worst of -1.81
not worth -2.30 	 is not worth -2.52
this piece -2.30 	 this piece of -2.76
a must 2.17 	 is a must 2.25
a must 2.17 	 a must 

In [84]:
for i in two_gram:
    for j in four_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0]:
            print(i[0], i[1],'\t', j[0], j[1])

worst film -3.71 	 worst film i have -4.89
worst film -3.71 	 the worst film i -4.16
worst movie -3.49 	 worst movie i have -4.18
worst movie -3.49 	 the worst movie i -4.15
worst movie -3.49 	 of the worst movie -3.98
worst movie -3.49 	 the worst movie ever -3.03
waste your -3.31 	 do not waste your -3.50
waste your -3.31 	 not waste your time -3.21
waste of -2.96 	 a waste of time -2.84
not waste -2.92 	 do not waste your -3.50
not waste -2.92 	 not waste your time -3.21
a waste -2.47 	 a waste of time -2.84
the worst -2.31 	 the worst film i -4.16
the worst -2.31 	 the worst movie i -4.15
the worst -2.31 	 of the worst movie -3.98
the worst -2.31 	 one of the worst -3.36
the worst -2.31 	 the worst movie ever -3.03
a must 2.17 	 a must see for 2.55
a must 2.17 	 is a must see 2.34
so bad -2.12 	 so bad it is -2.14
your time -1.97 	 not waste your time -3.21
bad i -1.88 	 so bad it is -2.14
must see 1.83 	 a must see for 2.55
must see 1.83 	 is a must see 2.34
bad it -1.82 	 so bad 

In [90]:
for i in one_gram:
    for j in five_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0] and np.sign(after_weight)!=np.sign(previous_weight):
            print(i[0], i[1],'\t', j[0], j[1])

In [86]:


parent_index = 0
child_index = 0
parent_node = [Node("") for _ in range(len(three_gram))]
child_node = [Node("") for _ in range(len(five_gram))]

for i in three_gram:
    parent_node[parent_index] = Node(i[0])
    for j in five_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])
            child_node = Node(j[0], parent=parent_node[parent_index])
    parent_index = parent_index + 1



worst film i -4.24 	 the worst film i have -4.80
the worst film -3.85 	 the worst film i have -4.80
the worst movie -3.77 	 the worst movie i have -4.08
the worst movie -3.77 	 one of the worst movie -3.93
do not waste -3.22 	 do not waste your time -3.29
of the worst -3.20 	 one of the worst movie -3.93
of the worst -3.20 	 is one of the worst -3.45
waste your time -3.19 	 do not waste your time -3.29
of the best 1.81 	 is one of the best 2.31
is supposed to -1.37 	 is supposed to be a -1.89
supposed to be -1.37 	 is supposed to be a -1.89


In [87]:
for i in range(0,len(three_gram)):
    for pre, fill, node in RenderTree(parent_node[i]):
        if parent_node[i].height != 0:
            print("%s%s" % (pre, node.name))

worst film i
└── the worst film i have
the worst film
└── the worst film i have
the worst movie
├── the worst movie i have
└── one of the worst movie
do not waste
└── do not waste your time
of the worst
├── one of the worst movie
└── is one of the worst
waste your time
└── do not waste your time
of the best
└── is one of the best
is supposed to
└── is supposed to be a
supposed to be
└── is supposed to be a


In [88]:
parent_index = 0
two_index = 0
three_index = 0
four_index = 0
parent_node = [Node("") for _ in range(len(one_gram))]
two_node = [Node("") for _ in range(len(two_gram))]
three_node = [Node("") for _ in range(len(three_gram))]
four_node = [Node("") for _ in range(len(four_gram))]

for one in one_gram:
    parent_node[parent_index] = Node(one[0])
    for two in two_gram:
        if one[0] in two[0]:
            two_node[two_index] = Node(two[0], parent=parent_node[parent_index])
            for three in three_gram:
                if two[0] in three[0]:
                    three_node[three_index] = Node(three[0], parent=two_node[two_index])
                    for four in four_gram:
                        if three[0] in four[0]:
                            four_node[four_index] = Node(four[0], parent=three_node[three_index])
                            four_index = four_index + 1
                            print(one[0], one[1],'\t', two[0], two[1], '\t', three[0], three[1], '\t', four[0], four[1])
                    three_index = three_index + 1
            two_index = two_index + 1
    parent_index = parent_index + 1

waste -2.52 	 waste your -3.31 	 not waste your -3.42 	 do not waste your -3.50
waste -2.52 	 waste your -3.31 	 not waste your -3.42 	 not waste your time -3.21
waste -2.52 	 waste your -3.31 	 waste your time -3.19 	 not waste your time -3.21
waste -2.52 	 waste of -2.96 	 waste of time -2.97 	 a waste of time -2.84
waste -2.52 	 waste of -2.96 	 a waste of -2.75 	 a waste of time -2.84
waste -2.52 	 not waste -2.92 	 not waste your -3.42 	 do not waste your -3.50
waste -2.52 	 not waste -2.92 	 not waste your -3.42 	 not waste your time -3.21
waste -2.52 	 not waste -2.92 	 do not waste -3.22 	 do not waste your -3.50
waste -2.52 	 a waste -2.47 	 a waste of -2.75 	 a waste of time -2.84
worst -2.20 	 worst film -3.71 	 worst film i -4.24 	 worst film i have -4.89
worst -2.20 	 worst film -3.71 	 worst film i -4.24 	 the worst film i -4.16
worst -2.20 	 worst film -3.71 	 the worst film -3.85 	 the worst film i -4.16
worst -2.20 	 worst movie -3.49 	 worst movie i -4.26 	 worst movi

In [89]:
for i in range(0,len(one_gram)):
    for pre, fill, node in RenderTree(parent_node[i]):
        if parent_node[i].height != 0:
            print("%s%s" % (pre, node.name))

waste
├── waste your
│   ├── not waste your
│   │   ├── do not waste your
│   │   └── not waste your time
│   └── waste your time
│       └── not waste your time
├── waste of
│   ├── waste of time
│   │   └── a waste of time
│   └── a waste of
│       └── a waste of time
├── not waste
│   ├── not waste your
│   │   ├── do not waste your
│   │   └── not waste your time
│   └── do not waste
│       └── do not waste your
├── a waste
│   └── a waste of
│       └── a waste of time
└── to waste
redeeming
└── redeeming quality
worst
├── worst film
│   ├── worst film i
│   │   ├── worst film i have
│   │   └── the worst film i
│   └── the worst film
│       └── the worst film i
├── worst movie
│   ├── worst movie i
│   │   ├── worst movie i have
│   │   └── the worst movie i
│   ├── the worst movie
│   │   ├── the worst movie i
│   │   ├── of the worst movie
│   │   └── the worst movie ever
│   └── worst movie ever
│       └── the worst movie ever
├── the worst
│   ├── the worst film
│   │   └