In [1]:
############ Compulsory Standard Library #################
import glob
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
import re

############ Sklearn pre-processing Library #################
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score

############ Sklearn model Library #################
from sklearn.linear_model import LogisticRegression

In [2]:
def _read_labeled_reviews(pattern, label):
    """Read every file matching ``pattern`` and pair each one with ``label``.

    Paths are visited in sorted order so repeated runs produce the same
    document order (``glob.glob`` itself returns paths in arbitrary order).

    Returns:
        (texts, labels): two parallel lists of equal length.
    """
    texts = []
    for file_path in sorted(glob.glob(pattern)):
        # `with` closes the handle even when reading/decoding raises.
        with open(file_path, 'r', encoding="utf8") as f:
            texts.append(f.read())
    return texts, [label] * len(texts)


def load_imdb(path):
    """Load the train and test review texts found under ``path``.

    Expects the layout ``<path>/{train,test}/{neg,pos}/*.txt``. Within each
    split the negative reviews come first (label 0), followed by the positive
    reviews (label 1).

    Args:
        path: root directory of the dataset.

    Returns:
        X_train_corpus: list of str, training review texts.
        y_train: np.ndarray of 0/1 labels aligned with ``X_train_corpus``.
        X_test_corpus: list of str, test review texts.
        y_test: np.ndarray of 0/1 labels aligned with ``X_test_corpus``.
    """
    print("Loading the imdb data")

    train_neg, train_neg_labels = _read_labeled_reviews(path + "/train/neg/*.txt", 0)
    train_pos, train_pos_labels = _read_labeled_reviews(path + "/train/pos/*.txt", 1)
    X_train_corpus = train_neg + train_pos
    y_train = train_neg_labels + train_pos_labels

    print("Train Data loaded.")

    test_neg, test_neg_labels = _read_labeled_reviews(path + "/test/neg/*.txt", 0)
    test_pos, test_pos_labels = _read_labeled_reviews(path + "/test/pos/*.txt", 1)
    X_test_corpus = test_neg + test_pos
    y_test = test_neg_labels + test_pos_labels

    print("Test Data loaded.")

    y_train = np.array(y_train)
    y_test = np.array(y_test)

    return X_train_corpus, y_train, X_test_corpus, y_test

In [3]:
def load_contraction_list(filename):
    """Read a comma-separated contraction table into a NumPy array.

    Each non-blank line is stripped and split on commas, producing one row.
    Blank lines (for example a trailing empty line) are skipped; otherwise
    they would yield a one-element row and make the result ragged.

    Args:
        filename: path of the UTF-8 text file to read.

    Returns:
        np.ndarray with one row per non-blank line. ``replace_contraction``
        reads column 0 as the text to find and column 1 as its replacement.
    """
    vocabulary = []
    # Same explicit encoding as the review files, instead of the platform default.
    with open(filename, 'r', encoding="utf8") as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue
            vocabulary.append(entry.split(','))
    return np.asarray(vocabulary)

In [4]:
def replace_contraction(corpus, cont_list):
    """Apply every row of ``cont_list`` to ``corpus`` as a find/replace pair.

    For each row, in order, the text is lower-cased and every occurrence of
    the row's column 0 is replaced by its column 1. Because lower-casing
    happens before each replacement, the text is returned untouched when
    ``cont_list`` has no rows.

    Args:
        corpus: the text to process.
        cont_list: 2-D array whose rows are (text to find, replacement).

    Returns:
        The processed text.
    """
    for row in range(cont_list.shape[0]):
        contraction = cont_list[row, 0]
        expansion = cont_list[row, 1]
        lowered = corpus.lower()
        corpus = lowered.replace(contraction, expansion)
    return corpus

In [5]:
def update_corpus_contraction(X_corpus, contraction_file="contraction_list.txt"):
    """Clean every document of ``X_corpus`` in place and return the same list.

    Each document is passed through ``cleanhtml`` and then through
    ``replace_contraction`` using the table loaded from ``contraction_file``.

    Note: the input list is modified in place; the returned object is that
    same list, not a copy.

    Args:
        X_corpus: mutable sequence of document strings.
        contraction_file: path of the contraction table. Defaults to the
            ``contraction_list.txt`` file this notebook originally hard-coded.

    Returns:
        ``X_corpus`` itself, with every element replaced by its cleaned text.
    """
    cont_list = load_contraction_list(contraction_file)

    print('corpus update start')
    for i, document in enumerate(X_corpus):
        X_corpus[i] = replace_contraction(cleanhtml(document), cont_list)
    print('corpus update end')
    print()
    return X_corpus

In [6]:
def negative_positive_counts(X, y, word_index):
    """Sum one feature column separately over the label-0 and label-1 rows.

    Args:
        X: 2-D feature matrix indexed as ``X[row_mask, column]``.
        y: label array aligned with the rows of ``X``.
        word_index: column of ``X`` to sum.

    Returns:
        (neg_count, pos_count): the column sum over rows where ``y == 0``
        and over rows where ``y == 1``.
    """
    is_negative = (y == 0)
    is_positive = (y == 1)
    return (
        np.sum(X[is_negative, word_index]),
        np.sum(X[is_positive, word_index]),
    )

In [14]:
def sort_top_words_with_count(X, y, words, weights, filename, top_k=10):
    """Write the ``top_k`` features with the largest absolute weight to a file.

    Features are ranked by ``abs(weights)`` in descending order. For each of
    the first ``top_k`` the output file ``<filename>.txt`` gets one
    tab-separated line: the feature name, its weight with two decimals, and
    the two counts returned by ``negative_positive_counts`` (label-0 sum,
    then label-1 sum).

    Args:
        X: feature matrix passed on to ``negative_positive_counts``.
        y: label array aligned with the rows of ``X``.
        words: feature names, indexable by column index.
        weights: 1-D array of per-feature weights.
        filename: output path without the ``.txt`` extension.
        top_k: number of features to write.
    """
    sorted_indices_descending_abs = np.argsort(np.absolute(weights))[::-1]

    # The context manager closes the file; no explicit close() is needed.
    with open(filename + '.txt', mode='w', encoding='utf8') as out:
        for i in sorted_indices_descending_abs[:top_k]:
            neg_count, pos_count = negative_positive_counts(X, y, i)
            out.write("%s\t%0.2f\t%d\t%d\n" % (str(words[i]), weights[i], neg_count, pos_count))

In [8]:
# Non-greedy match of anything between '<' and the next '>' on the same line.
_HTML_TAG_RE = re.compile(r'<.*?>')


def cleanhtml(text):
    """Replace every ``<...>`` tag in ``text`` with a single space.

    Tags are replaced by a space rather than deleted so that the words on
    either side of a tag (e.g. ``end.<br /><br />Next``) are not glued
    together. Text outside tags is left untouched: the earlier version also
    deleted every literal "br" substring, which corrupted ordinary words
    such as "brilliant".

    Args:
        text: the raw string, possibly containing HTML tags.

    Returns:
        The string with each tag replaced by one space.
    """
    return _HTML_TAG_RE.sub(' ', text)

# Reference :
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

### Load the data

In [9]:
# Load the raw review texts and their 0/1 labels (0 = neg folder, 1 = pos folder)
# for both the train and test splits found under ../aclImdb.
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb('../aclImdb')

Loading the imdb data
Train Data loaded.
Test Data loaded.


### Corpus update (strip HTML tags and expand contractions)
For example:<br>
[isn't $\rightarrow$ is not] <br>
[haven't $\rightarrow$ have not] <br>

In [10]:
# Clean the training texts. update_corpus_contraction rewrites the list it is
# given in place and returns it, so X_train_corpus_update and X_train_corpus
# refer to the same list after this cell runs.
X_train_corpus_update = update_corpus_contraction(X_train_corpus)
# The equivalent call for the test corpus is currently disabled:
# X_test_corpus_update = update_corpus_contraction(X_test_corpus)

corpus update start
corpus update end



### Top 100 words extraction (1- to 5-grams)

In [11]:
# Token pattern: a run of word characters, apostrophes and slashes delimited by
# word boundaries, so tokens such as "isn't" or "9/10" are kept whole.
token = r"(?u)\b[\w\'/]+\b"

# Binary bag-of-words vectorizer; ngram_range is set later, per n-gram order.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    max_df=1.0,
    min_df=5,
    binary=True,
    token_pattern=token,
)

In [12]:
# For each n-gram order from 1 to 5: vectorize the cleaned training corpus with
# exactly that order, fit an L2-regularised logistic regression, and write the
# 100 highest-|weight| n-grams (with per-class counts) to "<n>gram.txt".
for i in range(1, 6):
    print('Processing',i,'grams')
    tf_vectorizer.set_params(ngram_range=(i, i))
    X_train = tf_vectorizer.fit_transform(X_train_corpus_update)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on versions that predate it.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        words = tf_vectorizer.get_feature_names_out()
    else:
        words = tf_vectorizer.get_feature_names()

    clf = LogisticRegression(penalty='l2', C=1)
    clf.fit(X_train, y_train)

    filename = str(i) + "gram"
    sort_top_words_with_count(X_train, y_train, words, clf.coef_.flatten(), filename, top_k=100)

    # Release the per-order model, matrix and vocabulary before the next pass.
    del clf
    del X_train
    del words

Processing 1 grams
Processing 2 grams
Processing 3 grams
Processing 4 grams
Processing 5 grams
