In [15]:
############ Compulsory Standard Library #################
import glob
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
import re
import graphviz
from anytree import Node, RenderTree
from textblob import TextBlob

############ Sklearn pre-processing Library #################
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score

############ Sklearn model Library #################
from sklearn.linear_model import LogisticRegression

In [16]:
def load_imdb(path, shuffle=True, random_state=42):
    
    print("Loading the imdb data")
    
    train_neg_files = glob.glob(path+"/train/neg/*.txt")
    train_pos_files = glob.glob(path+"/train/pos/*.txt")
    
    X_train_corpus = []
    y_train = []
    
    for tnf in train_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(0)
        f.close()
    
    for tpf in train_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(1)
        f.close()
    
    print("Train Data loaded.")
    
    test_neg_files = glob.glob(path+"/test/neg/*.txt")
    test_pos_files = glob.glob(path+"/test/pos/*.txt")
    
    X_test_corpus = []
    y_test = []
    
    for tnf in test_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(0)
        f.close()
    
    for tpf in test_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(1)
        f.close()
    
    print("Test Data loaded.")
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    if shuffle:
        np.random.seed(random_state)
        indices = np.random.permutation(len(y_train))       
        
        #X_train = X_train.tocsr()
        #X_train_corpus = X_train_corpus[indices]
        X_train_corpus = [X_train_corpus[i] for i in indices]
        y_train = y_train[indices]
        #train_corpus_shuffled = [train_corpus[i] for i in indices]
        
        indices = np.random.permutation(len(y_test))
        
        #X_test = X_test.tocsr()
        #X_test_corpus = X_test_corpus[indices]
        X_test_corpus = [X_test_corpus[i] for i in indices]
        y_test = y_test[indices]
        #test_corpus_shuffled = [test_corpus[i] for i in indices]
    #else:
        #train_corpus_shuffled = train_corpus
        #test_corpus_shuffled = test_corpus
    
    return X_train_corpus, y_train, X_test_corpus , y_test

In [17]:
'''
Read and load the contraction list (or any text files)
'''
def load_list(filename, split_delimiter):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip().split(split_delimiter))
    return np.asarray(vocabulary)

'''
Clean the HTML tags from the corpus
'''
def cleanhtml(text):
#     cleanr = re.compile('<.*?>')
#     cleantag = re.sub(cleanr, '', text)
    cleantag = re.sub(re.compile('<.*?>'), '', text)
    cleantext = cleantag.replace('br', '')
    return cleantext

# Reference :
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

'''
Replace the contraction words into two parts (by given contraction list)
'''
def replace_contraction(corpus, cont_list):
    for i in range(0, cont_list.shape[0]):
        corpus = corpus.lower().replace(cont_list[i,0], cont_list[i,1])
    return corpus

'''
Singularize the words by its POS-tag
'''
def word_singularize(corpus):
    from textblob import TextBlob
    
    text = TextBlob(corpus)
    for tag in text.tags:
        if tag[1] == 'NNS' and tag[0] != 'yes':
            corpus = corpus.replace(tag[0], tag[0].singularize())
    return corpus

'''
Update clean corpus
'''
def update_corpus_contraction(X_corpus):
    cont_list = load_list("contraction_list.txt", ',')
    print(cont_list.shape)
    print('corpus update start')
    for i in range(0,len(X_corpus)):
        X_corpus[i] = cleanhtml(X_corpus[i])
        X_corpus[i] = replace_contraction(X_corpus[i], cont_list)
        X_corpus[i] = word_singularize(X_corpus[i])
        X_corpus[i] = X_corpus[i].replace('&', 'and')
    print('corpus update end')
    print()
    return X_corpus

'''
Count the negative and positive frequency
'''
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

'''
Count the ratio : log(#pos/#neg)
'''
def log_ratio_positive_negative(X, y, word_index):
    neg_count, pos_count = negative_positive_counts(X,y, word_index)
    log_ratio = np.log(pos_count+1)-np.log(neg_count+1)
    return log_ratio, neg_count, pos_count

'''
Sort top words w.r.t log ratio and write into file
'''
def sort_top_words_with_count(X, y, words,filename, top_k=10):
    log_ratio = []
    neg_count = []
    pos_count = []
    
    for i in range(0,len(words)):
        log_ratio_, neg_count_, pos_count_ = log_ratio_positive_negative(X, y, i)
        log_ratio.append(log_ratio_)
        neg_count.append(neg_count_)
        pos_count.append(pos_count_)
    
    sorted_indices_descending_abs = np.argsort(np.absolute(log_ratio))[::-1]
    
    filename = filename + '.txt'
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[: top_k]:
#             print("%s\t%0.2f" %(words[i], weights[i]))
#             n_p=negative_positive_counts(X, y, i)
            w.write("%s\t%0.2f\t%d\t%d" %(str(words[i]), log_ratio[i], neg_count[i], pos_count[i]))
            w.write('\n')
        w.close()

### Load the data

In [20]:
path = r"C:\Users\Anne Soraya\Documents\IIT_resources\Python\aclImdb"
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb(path)

Loading the imdb data
Train Data loaded.
Test Data loaded.


### Corpus Update 

Such as :<br>
[isn't $\rightarrow$ is not] <br>
[haven't $\rightarrow$ have not] <br>
ain't -> are not (despite it also could be a 'is not' or 'am not')
<br>

The steps are as follows :
1. Remove the HTML tags
2. Remove the contractions
3. Singularize nouns

In [21]:
X_train_corpus_update = update_corpus_contraction(X_train_corpus)
# X_test_corpus_update = update_corpus_contraction(X_test_corpus)

(72, 2)
corpus update start
corpus update end



In [23]:
print(X_train_corpus_update[0])
sample_corpus = []

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=1, binary=False, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

sample_corpus.append(X_train_corpus_update[0])
X_sample = tf_vectorizer.fit_transform(sample_corpus)

word_feature = tf_vectorizer.get_feature_names()

print()
# for i in word_feature:
#     print(i)

print(sample_corpus)
print(len(word_feature))
print(X_sample.shape)
print()
print(X_sample[np.where(X_sample != 1)])
print("---")
# print(word_feature[np.where(X_sample[0,:] != 1)])

print(np.where(X_sample != 1))
'''
Here, we put the word frame to the sentences
'''

text = TextBlob(X_train_corpus_update[0])
# print(text.tags)
print()
# print(text.sentences)
sentence = []
for i in range(0, len(text.sentences)):
    sentence.append(text.sentences[i])
    
# print(sentence.shape)
print("Test")

silent night, deadly night 5 is the very last of the series, and like part 4, it is unrelated to the first three except by title and the fact that it is a christmas-themed horror flick.except to the oblivious, there is some obvious thing going on here...mickey rooney plays a toymaker named joe petto and his creepy son's name is pino. ring a bell, anyone? now, a little boy named derek heard a knock at the door one evening, and opened it to find a present on the doorstep for him. even though it said "do not open till christmas", he begins to open it anyway but is stopped by his dad, who scolds him and sends him to bed, and opens the gift himself. inside is a little red ball that sprouts santa arm and a head, and proceed to kill dad. oop, maybe he should have left well-enough alone. of course derek is then traumatized by the incident since he watched it from the stair, but he does not grow up to be some killer santa, he just stops talking.there is a mysterious stranger lurking around, who



ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

In [68]:
# text = TextBlob(X_train_corpus_update[2])
# print(X_train_corpus[2])
# print(text.tags)
# print()
# for sentences in text.sentences:
#     if "not" in sentences:
#         print('->', sentences)

In [91]:
sample_corpus = []

for i in range(30):
    sample_corpus.append(TextBlob(X_train_corpus_update[i]))
    print(i+1, ' label: ', y_train[i])
    for sentences in sample_corpus[i].sentences:
        if  "\"" in sentences:
            print('->', sentences)
    print()    

1  label:  0
-> even though it said "do not open till christmas", he begins to open it anyway but is stopped by his dad, who scolds him and sends him to bed, and opens the gift himself.
-> he does keep his landlord from evicting him by promising him to pay him in cash the next day and present him with a "larry the larvae" toy for his kid, but of course "larry" is not a good toy and gets out of the box in the car and of course, well, thing are not pretty.anyway, eventually what is going on with joe petto and pino is of course revealed, and as with the old story, pino is not a "real boy".
-> pino is probably even more agitated and naughty because he suffers from "kenitalium" (a smooth plastic crotch) so that could account for his evil way.

2  label:  1

3  label:  0

4  label:  1
-> even the codename that the bad guy use is dumb ("tango-tango").
-> oh, yes: and the unintentional humour.the film opens with some truly bad and unconvincing gay banter between our go-lucky and happy characte

In [70]:
for i in range(10):
    print(y_train[i])

0
1
0
1
1
0
0
1
0
0


In [14]:
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=100, binary=False, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=100,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern="(?u)\\b[\\w\\'/]+\\b",
        tokenizer=None, vocabulary=None)

In [15]:
X_matrix = tf_vectorizer.fit_transform(X_train_corpus_update)

In [50]:
words = tf_vectorizer.get_feature_names()
X_matrix_dense = X_matrix.todense()
indices = np.where(X_matrix_dense[0,:] != 0))

SyntaxError: invalid syntax (<ipython-input-50-87a7d5b5d9ed>, line 3)

In [None]:
# print(text.tags)

# for tag in text.tags:
#     if tag[1] == 'NNS':
#         print(tag[0], ' ', tag[0].singularize())
#         X_train_corpus_update[2] = X_train_corpus_update[2].replace(tag[0], tag[0].singularize())

# print(X_train_corpus_update[2])

### top 100 words extraction (1-5 grams)

In [85]:
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=100, binary=True, token_pattern=token)

In [86]:
tf_vectorizer.set_params(ngram_range=(1,1))
X_train = tf_vectorizer.fit_transform(X_train_corpus_update)
words = tf_vectorizer.get_feature_names()

In [None]:
len(words)

In [None]:
sort_top_words_with_count(X_train, y_train, words, '1gram', top_k=100)

In [None]:
def generate_grams_list():
    for i in range(1,6):
        print('Processing',i,'grams')
        tf_vectorizer.set_params(ngram_range=(i,i))
        X_train = tf_vectorizer.fit_transform(X_train_corpus_update)
        words = tf_vectorizer.get_feature_names()

    #     clf = LogisticRegression(penalty='l2', C=1, random_state=42)
    #     clf.fit(X_train, y_train)

        filename=str(i)+"gram"
        sort_top_words_with_count(X_train, y_train, words, filename, top_k=100)

    #     del clf
        del X_train
        del words

In [None]:
#generate_grams_list()

In [83]:
tf_vectorizer.set_params(ngram_range=(2,2))
X_train = tf_vectorizer.fit_transform(X_train_corpus_update)

In [89]:
words = tf_vectorizer.get_feature_names()

#print(words)

for i in range(0,len(words)):
    if 'not scary' in words[i]:
        index=i

#not_scary_slice = np.array(X_train[:,index])
#print(not_scary_slice)
corpus_indices = []

for i in range(0, X_train.shape[0]):
    if X_train[i, index] == 1:
        corpus_indices.append(i)
j=1
for i in corpus_indices:
    text = TextBlob(X_train_corpus_update[i])
    print(i, ' label: ',y_train[i])
    for sentences in text.sentences:
        if 'not scary' in sentences:
            print(' .', sentences)

3  label:  1
28  label:  1
61  label:  1
 . the cinematography enhances the story, keeping the mood dank and dense and primarily confined to the condemned building.
126  label:  1
127  label:  1
132  label:  0
139  label:  0
145  label:  1
153  label:  0
170  label:  1
218  label:  0
223  label:  1
 . the viewer may feel a lot better coming out of the cinema after part 1 than part 2 but that is the reality of che is life and not in my opinion any fault of the director.
227  label:  0
229  label:  1
 . we currently have in the cinemas the austen biopic "becoming jane", and itv have recently produced three tv movie based on austen novel.
249  label:  1
271  label:  1
278  label:  1
350  label:  0
355  label:  0
384  label:  0
476  label:  0
495  label:  0
499  label:  1
524  label:  0
529  label:  0
543  label:  1
557  label:  1
584  label:  0
 . to sum it up, this is pure cinema barf drenched in the chocolate syrup known as nudity, and topped with the cherry of horrible acting as only a

11800  label:  1
11812  label:  1
 . great viewing for art cinema lover.
11824  label:  1
11829  label:  1
 . the man behind 'seryozha, sovsem propashchiy (1972) which is an adaptation of mark twain the adventure of huckleberry finn, 33, ya shagayu po moskve (1964) aka walking the street of moscow, ne goryuy (1969) aka do not grieve, afonya (1975), mimino (1977), osenniy marafon (1979) aka autumn marathon or sad comedy which is a very fitting title for this movie as well as for the whole genre that danelium practically invented, and the cult favorite for over 20 year kin-dza-dza (1986), is illiant and deserves our true love and genuine gratitude for the unforgettable moment of cinematic happiness.
11872  label:  1
11877  label:  0
11959  label:  0
12001  label:  1
12068  label:  1
12107  label:  0
12150  label:  0
 . for some reason, director john boorman and cinematographer seamus deasy selected to film this movie in black-and-white while its style and presentation are clearly the ele

### Generate tree relation

Reference : <br>
https://stackoverflow.com/questions/2358045/how-can-i-implement-a-tree-in-python-are-there-any-built-in-data-structures-in
http://anytree.readthedocs.io/en/latest/ <br>
<br>
There is not sign changed when the top_words=100

In [None]:
one_gram = load_list("1gram.txt",'\t')
two_gram = load_list("2gram.txt",'\t')
three_gram = load_list("3gram.txt", '\t')
four_gram = load_list("4gram.txt", '\t')
five_gram = load_list("5gram.txt", '\t')

In [None]:
negated = []
amplifier = []
ind_words = 0
for i in one_gram:
    for j in two_gram:
        split_words = j[0].split()
        after_weight = float(j[1])
        previous_weight = float(i[1])
        for k in range(0, len(split_words)):
            if i[0] == split_words[k]:
                if k == 0:
                    ind_words = 1
                elif k == 1:
                    ind_words = 0
                   
                if np.sign(after_weight)!=np.sign(previous_weight) or np.absolute(after_weight) < np.absolute(previous_weight):
                    negated.append(split_words[ind_words])
                else:
                    amplifier.append(split_words[ind_words])
                print(i[0], i[1],'\t', j[0], j[1])
print()
print('negated')
print(np.unique(negated))
print('amplifier')
print(np.unique(amplifier))

#############################################
# top_words = 1000
# best 0.60 	 at best -1.80

# negated
# ['at']
# amplifier
# []

In [None]:
negated = []
amplifier = []
ind_words = 0
for i in one_gram:
    for j in three_gram:
        split_words = j[0].split()
        after_weight = float(j[1])
        previous_weight = float(i[1])
        for k in range(0, len(split_words)):
            if i[0] == split_words[k] and (np.absolute(after_weight)>np.absolute(previous_weight) or np.sign(after_weight)!=np.sign(previous_weight)):
                if k == 0:
                    ind_words = 1
                elif k == 1:
                    ind_words = 0
                   
                if np.sign(after_weight)!=np.sign(previous_weight):
                    negated.append(split_words[ind_words])
                else:
                    amplifier.append(split_words[ind_words])
                print(i[0], i[1],'\t', j[0], j[1])
print()
print('negated')
print(np.unique(negated))
print('amplifier')
print(np.unique(amplifier))

In [None]:
for i in two_gram:
    for j in three_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0]:
            print(i[0], i[1],'\t', j[0], j[1])

In [None]:
for i in two_gram:
    for j in four_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0]:
            print(i[0], i[1],'\t', j[0], j[1])

In [None]:
for i in one_gram:
    for j in five_gram:
        after_weight = float(j[1])
        previous_weight = float(i[1])
        if i[0] in j[0] and np.sign(after_weight)!=np.sign(previous_weight):
            print(i[0], i[1],'\t', j[0], j[1])

In [None]:


parent_index = 0
child_index = 0
parent_node = [Node("") for _ in range(len(three_gram))]
child_node = [Node("") for _ in range(len(five_gram))]

for i in three_gram:
    parent_node[parent_index] = Node(i[0])
    for j in five_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])
            child_node = Node(j[0], parent=parent_node[parent_index])
    parent_index = parent_index + 1



In [None]:
for i in range(0,len(three_gram)):
    for pre, fill, node in RenderTree(parent_node[i]):
        if parent_node[i].height != 0:
            print("%s%s" % (pre, node.name))

In [None]:
parent_index = 0
two_index = 0
three_index = 0
four_index = 0
parent_node = [Node("") for _ in range(len(one_gram))]
two_node = [Node("") for _ in range(len(two_gram))]
three_node = [Node("") for _ in range(len(three_gram))]
four_node = [Node("") for _ in range(len(four_gram))]

for one in one_gram:
    parent_node[parent_index] = Node(one[0])
    for two in two_gram:
        if one[0] in two[0]:
            two_node[two_index] = Node(two[0], parent=parent_node[parent_index])
            for three in three_gram:
                if two[0] in three[0]:
                    three_node[three_index] = Node(three[0], parent=two_node[two_index])
                    for four in four_gram:
                        if three[0] in four[0]:
                            four_node[four_index] = Node(four[0], parent=three_node[three_index])
                            four_index = four_index + 1
                            print(one[0], one[1],'\t', two[0], two[1], '\t', three[0], three[1], '\t', four[0], four[1])
                    three_index = three_index + 1
            two_index = two_index + 1
    parent_index = parent_index + 1

In [None]:
for i in range(0,len(one_gram)):
    for pre, fill, node in RenderTree(parent_node[i]):
        if parent_node[i].height != 0:
            print("%s%s" % (pre, node.name))