In [1]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re

def load_imdb(path, shuffle=True, random_state=42):
    """Load the raw IMDB review texts and their sentiment labels.

    The directory at ``path`` is expected to contain ``train/neg``,
    ``train/pos``, ``test/neg`` and ``test/pos`` sub-folders holding one
    UTF-8 ``*.txt`` file per review.

    Parameters
    ----------
    path : str
        Root folder of the dataset.
    shuffle : bool, default True
        If True, the train and test sets are each permuted (texts and labels
        together).
    random_state : int, default 42
        Seed passed to ``np.random.seed`` before shuffling. Note that this
        reseeds NumPy's global random generator.

    Returns
    -------
    X_train_corpus : list of str
    y_train : numpy.ndarray
        0 for reviews read from ``neg``, 1 for reviews read from ``pos``.
    X_test_corpus : list of str
    y_test : numpy.ndarray
    """
    import glob

    def _read_reviews(split, sentiment):
        # Read every review text of one split/sentiment folder.
        pattern = path + "/" + split + "/" + sentiment + "/*.txt"
        reviews = []
        # glob returns files in arbitrary, filesystem-dependent order; sorting
        # makes the unshuffled order (and hence the seeded shuffle) repeatable.
        for file_name in sorted(glob.glob(pattern)):
            # `with` guarantees the handle is closed even if read() raises.
            with open(file_name, 'r', encoding="utf8") as f:
                reviews.append(f.read())
        return reviews

    def _load_split(split):
        # Negative reviews first (label 0), then positive reviews (label 1).
        negatives = _read_reviews(split, "neg")
        positives = _read_reviews(split, "pos")
        labels = np.array([0] * len(negatives) + [1] * len(positives))
        return negatives + positives, labels

    print("Loading the imdb data")

    X_train_corpus, y_train = _load_split("train")
    print("Train Data loaded.")

    X_test_corpus, y_test = _load_split("test")
    print("Test Data loaded.")

    if shuffle:
        np.random.seed(random_state)

        # Train permutation is drawn first, then the test permutation, so the
        # random stream is consumed in a fixed order.
        indices = np.random.permutation(len(y_train))
        X_train_corpus = [X_train_corpus[i] for i in indices]
        y_train = y_train[indices]

        indices = np.random.permutation(len(y_test))
        X_test_corpus = [X_test_corpus[i] for i in indices]
        y_test = y_test[indices]

    return X_train_corpus, y_train, X_test_corpus, y_test

In [2]:
def load_list(filename, split_delimiter):
    """Read a delimited text file (e.g. the contraction list) into an array.

    Each line is stripped of surrounding whitespace and split on
    ``split_delimiter``; the resulting per-line lists are stacked with
    ``np.asarray`` and returned.
    """
    with open(filename, 'r') as handle:
        rows = [raw_line.strip().split(split_delimiter) for raw_line in handle]
    return np.asarray(rows)

def cleanhtml(text):
    """Strip HTML tags from a review text.

    Runs of ``<br>`` / ``<br />`` tags (any letter case, with any whitespace
    that follows them) are replaced by a single space so the sentences on
    either side of a line break do not get glued together. Every other tag is
    removed without a replacement.

    Only markup is touched: the letters "br" inside ordinary words such as
    "brilliant" are left intact.

    Parameters
    ----------
    text : str
        Raw text possibly containing HTML tags.

    Returns
    -------
    str
        The text with tags removed.
    """
    # Line breaks act as separators, so keep one space in their place.
    without_breaks = re.sub(r'(?:<br\s*/?>\s*)+', ' ', text, flags=re.IGNORECASE)
    # Non-greedy match removes each remaining <...> tag individually.
    return re.sub(r'<.*?>', '', without_breaks)

# Reference :
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

def replace_contraction(corpus, cont_list):
    """Expand contractions in ``corpus`` using a two-column lookup array.

    For every row of ``cont_list`` the text is lower-cased and each occurrence
    of the row's first column is replaced by its second column. When
    ``cont_list`` has no rows the text is returned untouched.
    """
    for row in cont_list:
        contraction, expansion = row[0], row[1]
        lowered = corpus.lower()
        corpus = lowered.replace(contraction, expansion)
    return corpus

def word_singularize(corpus):
    """Singularize every word that TextBlob tags as a plural noun (``NNS``).

    The word ``'yes'`` is skipped even when it is tagged ``NNS``.
    Replacement is done on whole words only, so singularizing "cars" does not
    also rewrite the tail of a longer word such as "oscars".

    Parameters
    ----------
    corpus : str
        Text to process.

    Returns
    -------
    str
        The text with the tagged plural nouns replaced by their singular form.
    """
    from textblob import TextBlob

    already_replaced = set()
    # Tags are computed once on the input text; the replacements below do not
    # change which words are considered.
    for word, pos_tag in TextBlob(corpus).tags:
        if pos_tag != 'NNS' or word == 'yes' or word in already_replaced:
            continue
        already_replaced.add(word)
        singular = str(word.singularize())
        # Look-arounds instead of \b so the match is also correct when the
        # token itself starts or ends with a non-word character.
        whole_word = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        # A function replacement keeps backslashes in `singular` literal.
        corpus = re.sub(whole_word, lambda _match: singular, corpus)
    return corpus

def update_corpus_contraction(X_corpus):
    """Clean every document of ``X_corpus`` in place and return it.

    Per document, in this order: HTML tags are removed, contractions from
    ``contraction_list.txt`` are expanded, plural nouns are singularized and
    ``&`` is replaced by ``and``. The shape of the contraction list and
    start/end markers are printed along the way.
    """
    cont_list = load_list("contraction_list.txt", ',')
    print(cont_list.shape)
    print('corpus update start')
    for position, document in enumerate(X_corpus):
        without_tags = cleanhtml(document)
        expanded = replace_contraction(without_tags, cont_list)
        singular = word_singularize(expanded)
        # Entries are overwritten, so the caller's list is modified too.
        X_corpus[position] = singular.replace('&', 'and')
    print('corpus update end')
    print()
    return X_corpus

def negative_positive_counts(X, y, word_index):
    """Sum column ``word_index`` of ``X`` separately per class.

    Returns a ``(neg_count, pos_count)`` pair: the column total over the rows
    where ``y == 0`` and over the rows where ``y == 1``.
    """
    negative_rows = (y == 0)
    positive_rows = (y == 1)
    return np.sum(X[negative_rows, word_index]), np.sum(X[positive_rows, word_index])

def log_ratio_positive_negative(X, y, word_index):
    """Compute ``log((#pos + 1) / (#neg + 1))`` for one word column.

    Returns ``(log_ratio, neg_count, pos_count)``; the counts come from
    ``negative_positive_counts``.
    """
    negatives, positives = negative_positive_counts(X, y, word_index)
    # Add-one smoothing keeps the logarithm defined for zero counts.
    smoothed_ratio = np.log(positives + 1) - np.log(negatives + 1)
    return smoothed_ratio, negatives, positives

def sort_top_words_with_count(X, y, words, filename, top_k=10):
    """Rank words by |log((#pos + 1) / (#neg + 1))| and write the top ones to a file.

    Parameters
    ----------
    X : 2-D array or sparse matrix
        Document-by-word count matrix; column ``i`` belongs to ``words[i]``.
    y : array-like
        Class label per row of ``X``; 0 marks negative rows, 1 positive rows.
    words : sequence
        Vocabulary, one entry per column of ``X``.
    filename : str
        Output path without extension; ``'.txt'`` is appended.
    top_k : int, default 10
        Number of highest-ranked words to write.

    The output file is UTF-8 with one tab-separated line per word:
    word, log ratio (two decimals), negative count, positive count.
    """
    labels = np.asarray(y)
    vocabulary_size = len(words)

    def _column_totals(rows):
        # Per-column sums; .sum(axis=0) is 1-D for ndarrays and a 1xN matrix
        # for sparse/np.matrix input, so flatten both to a plain 1-D array.
        return np.asarray(rows.sum(axis=0)).ravel()[:vocabulary_size]

    # One vectorized pass per class instead of two column slices per word.
    neg_count = _column_totals(X[labels == 0])
    pos_count = _column_totals(X[labels == 1])
    # Add-one smoothing keeps the logarithm defined for zero counts.
    log_ratio = np.log(pos_count + 1) - np.log(neg_count + 1)

    sorted_indices_descending_abs = np.argsort(np.absolute(log_ratio))[::-1]

    filename = filename + '.txt'
    # The context manager closes the file; no explicit close() is needed.
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[:top_k]:
            w.write("%s\t%0.2f\t%d\t%d" % (str(words[i]), log_ratio[i], neg_count[i], pos_count[i]))
            w.write('\n')

In [7]:
# input the raw corpus from IMDB
# return the list of sentences. 

def get_sentences(corpus):
    """Split ``corpus`` into sentences with TextBlob.

    Returns a new list holding the entries of ``TextBlob(corpus).raw_sentences``
    in their original order.
    """
    from textblob import TextBlob

    blob = TextBlob(corpus)
    return [raw_sentence for raw_sentence in blob.raw_sentences]

In [4]:
# Loading the IMDB data
# NOTE: the dataset root below is an absolute, machine-specific Windows path;
# point it at the local copy of the aclImdb folder before running this cell.
# load_imdb is called with its defaults (shuffle=True, random_state=42).

X_train_corpus, y_train, X_test_corpus, y_test = load_imdb(r"C:\Users\Anneke\Documents\Dataset\aclImdb")

Loading the imdb data
Train Data loaded.
Test Data loaded.


In [24]:
'''
*** WARNING ***
This part takes a long time to execute. To use the pickle files in which the loaded and cleaned data is already stored,
please see the cell below this one.
'''

# preprocessing the corpus
# 1. clean HTML tags
# 2. Replace contraction we'll -> we will
# 3. singularize word. movies -> movie
# 4. Replace & -> 'and'

# X_train_corpus_update = update_corpus_contraction(X_train_corpus)
# X_test_corpus_update = update_corpus_contraction(X_test_corpus)

(75, 2)
corpus update start
corpus update end



In [27]:
def open_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Unpickling can execute arbitrary code, so only use this on files from a
    trusted source.
    """
    import pickle

    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Load four pickled objects from ./data (paths are relative to the working directory).
# Judging by the file names these are the preprocessed train/test texts and
# their labels -- presumably the output of the cleaning cell above; TODO confirm.
X_train_original = open_pickle('./data/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('./data/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('./data/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('./data/imdb_original_preprocessed_ytest.pickle')

In [28]:
# Extract sentence
# Demo: split the first entry of X_train_original into sentences.

sentences = get_sentences(X_train_original[0])

# Print each sentence together with its position in the list.
for i,sent in enumerate(sentences):
    print(i, sent)

0 silent night, deadly night 5 is the very last of the series, and like part 4, it is unrelated to the first three except by title and the fact that it is a christmas-themed horror flick.except to the oblivious, there is some obvious thing going on here...mickey rooney plays a toymaker named joe petto and his creepy son's name is pino.
1 ring a bell, anyone?
2 now, a little boy named derek heard a knock at the door one evening, and opened it to find a present on the doorstep for him.
3 even though it said "do not open till christmas", he begins to open it anyway but is stopped by his dad, who scolds him and sends him to bed, and opens the gift himself.
4 inside is a little red ball that sprouts santa arm and a head, and proceed to kill dad.
5 oop, maybe he should have left well-enough alone.
6 of course derek is then traumatized by the incident since he watched it from the stair, but he does not grow up to be some killer santa, he just stops talking.there is a mysterious stranger lurki