In [1]:
############ Compulsory Standard Library #################
import glob
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
import re

############ Sklearn pre-processing Library #################
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score

############ Sklearn model Library #################
from sklearn.linear_model import LogisticRegression

In [2]:
def load_imdb(path):
    
    print("Loading the imdb data")
    
    train_neg_files = glob.glob(path+"/train/neg/*.txt")
    train_pos_files = glob.glob(path+"/train/pos/*.txt")
    
    X_train_corpus = []
    y_train = []
    
    for tnf in train_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(0)
        f.close()
    
    for tpf in train_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(1)
        f.close()
    
    print("Train Data loaded.")
    
    test_neg_files = glob.glob(path+"/test/neg/*.txt")
    test_pos_files = glob.glob(path+"/test/pos/*.txt")
    
    X_test_corpus = []
    y_test = []
    
    for tnf in test_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(0)
        f.close()
    
    for tpf in test_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(1)
        f.close()
    
    print("Test Data loaded.")
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train_corpus, y_train, X_test_corpus , y_test

In [21]:
def load_list(filename, split_delimiter):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip().split(split_delimiter))
    return np.asarray(vocabulary)

In [4]:
def replace_contraction(corpus, cont_list):
    for i in range(0, cont_list.shape[0]):
        corpus = corpus.lower().replace(cont_list[i,0], cont_list[i,1])
    return corpus

In [22]:
def update_corpus_contraction(X_corpus):
    cont_list = load_list("contraction_list.txt", ',')

    print('corpus update start')
    for i in range(0,len(X_corpus)):
        X_corpus[i] = cleanhtml(X_corpus[i])
        X_corpus[i] = replace_contraction(X_corpus[i], cont_list)
    print('corpus update end')
    print()
    return X_corpus

In [6]:
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

In [14]:
def sort_top_words_with_count(X, y, words, weights, filename, top_k=10):
    sorted_indices_descending_abs = np.argsort(np.absolute(weights))[::-1]
    
    filename = filename + '.txt'
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[: top_k]:
#             print("%s\t%0.2f" %(words[i], weights[i]))
            n_p=negative_positive_counts(X, y, i)
            w.write("%s\t%0.2f\t%d\t%d" %(str(words[i]), weights[i], n_p[0], n_p[1]))
            w.write('\n')
        w.close()

In [8]:
def cleanhtml(text):
    cleanr = re.compile('<.*?>')
    cleantag = re.sub(cleanr, '', text)
    cleantext = cleantag.replace('br', '')
    return cleantext

# Reference :
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

### Load the data

In [9]:
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb('../aclImdb')

Loading the imdb data
Train Data loaded.
Test Data loaded.


### Corpus Update (Remove the contraction)
Such as :<br>
[isn't $\rightarrow$ is not] <br>
[haven't $\rightarrow$ have not] <br>

In [10]:
X_train_corpus_update = update_corpus_contraction(X_train_corpus)
# X_test_corpus_update = update_corpus_contraction(X_test_corpus)

corpus update start
corpus update end



### top 100 words extraction (1-5 grams)

In [11]:
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True, token_pattern=token)

In [12]:
for i in range(1,6):
    print('Processing',i,'grams')
    tf_vectorizer.set_params(ngram_range=(i,i))
    X_train = tf_vectorizer.fit_transform(X_train_corpus_update)
    words = tf_vectorizer.get_feature_names()
    
    clf = LogisticRegression(penalty='l2', C=1)
    clf.fit(X_train, y_train)

    filename=str(i)+"gram"
    sort_top_words_with_count(X_train, y_train, words, clf.coef_.flatten(), filename, top_k=100)
    
    del clf
    del X_train
    del words

Processing 1 grams
Processing 2 grams
Processing 3 grams
Processing 4 grams
Processing 5 grams


### Generate tree relation

Reference :
https://stackoverflow.com/questions/2358045/how-can-i-implement-a-tree-in-python-are-there-any-built-in-data-structures-in

In [40]:
one_gram = load_list("1gram.txt",'\t')
two_gram = load_list("2gram.txt",'\t')
three_gram = load_list("3gram.txt", '\t')
four_gram = load_list("4gram.txt", '\t')
five_gram = load_list("5gram.txt", '\t')

In [49]:
for i in one_gram:
    for j in two_gram:
#         abs_weight = [np.absolute(j[1]), np.absolute(i[1])]
        if i[0] in j[0] and (j[1]>i[1]):
            print(i[0], i[1],'\t', j[0], j[1])

worst -2.37 	 the worst -2.52
wonder -1.06 	 is wonderful 0.84
wonder -1.06 	 a wonderful 0.81
highly 1.03 	 highly recommended 1.27


In [39]:
for i in two_gram:
    for j in three_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])

the worst -2.52 	 of the worst -2.80
waste of -1.54 	 a waste of -1.68
than this -1.23 	 better than this -1.37
a must 1.09 	 a must see 1.45
worst movie -1.08 	 the worst movie -1.37
an excellent 1.05 	 is an excellent 1.06
loved this 1.04 	 i loved this 1.40
unless you -1.04 	 unless you are -1.36
the best 1.01 	 of the best 1.54
love this 0.95 	 i love this 1.51
not even -0.94 	 is not even -1.23
not even -0.94 	 does not even -0.95
very bad -0.92 	 a very bad -1.07
very good 0.90 	 very good and 0.92
so bad -0.90 	 is so bad -1.10
so bad -0.90 	 was so bad -0.91
sit through -0.90 	 to sit through -1.23
a great 0.89 	 a great job 1.20
a great 0.89 	 is a great 1.10
a great 0.89 	 with a great 0.97
your time -0.88 	 waste your time -1.05
the funniest 0.88 	 of the funniest 0.93
must see 0.87 	 a must see 1.45
must see 0.87 	 must see for 0.91
very disappointed -0.87 	 was very disappointed -0.93
at all -0.86 	 at all costs -1.20
highly recommend 0.86 	 i highly recommend 1.13
highly 

In [41]:
for i in three_gram:
    for j in four_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])

of the worst -2.80 	 one of the worst -2.95
a waste of -1.68 	 a waste of time -1.81
is the worst -1.57 	 this is the worst -1.73
waste of time -1.54 	 a waste of time -1.81
of the best 1.54 	 one of the best 1.81
i love this 1.51 	 i love this movie 1.81
a must see 1.45 	 a must see for 1.92
i loved this 1.40 	 i loved this movie 1.61
is not even -1.23 	 it is not even -1.42
to sit through -1.23 	 to sit through this -1.36
i recommend this 1.22 	 i recommend this movie 1.33
at all costs -1.20 	 avoid at all costs -1.77
a great job 1.20 	 does a great job 1.66
none of the -1.16 	 none of the characters -1.30
the only good -1.15 	 the only good thing -1.66
i highly recommend 1.13 	 i highly recommend this 1.59
i highly recommend 1.13 	 i highly recommend it 1.23
that is it -1.12 	 and that is it -1.42
is a great 1.10 	 this is a great 1.66
a very bad -1.07 	 is a very bad -1.13
is an excellent 1.06 	 this is an excellent 1.52
save your money -1.06 	 save your money and -1.27
is definite

In [42]:
for i in four_gram:
    for j in five_gram:
        if i[0] in j[0] and j[1]>i[1]:
            print(i[0], i[1],'\t', j[0], j[1])

do not waste your -1.82 	 do not waste your time -2.36
do not waste your -1.82 	 do not waste your money -2.01
i really enjoyed this 1.72 	 i really enjoyed this movie 2.07
i highly recommend this 1.59 	 i highly recommend this film 1.60
of the most boring -1.44 	 one of the most boring -1.98
it is not even -1.42 	 and it is not even -1.54
this is a must 1.36 	 this is a must see 1.94
do not miss this 1.33 	 do not miss this one 1.80
is nothing more than -1.31 	 is nothing more than a -1.78
of the most awful -1.30 	 one of the most awful -1.60
worst film i have -1.26 	 the worst film i have -1.56
a lot of fun 1.25 	 is a lot of fun 1.44
complete waste of time -1.23 	 a complete waste of time -1.79
the worst i have -1.20 	 the worst i have seen -1.63
this is a wonderful 1.18 	 this is a wonderful film 1.37
a total waste of -1.15 	 a total waste of time -1.66
fell in love with 1.13 	 i fell in love with 1.59
one of the greatest 1.13 	 is one of the greatest 1.40
