In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np

In [2]:
def load_imdb(path, shuffle=True, random_state=42):
    '''
    Load the raw IMDB review texts together with their binary labels.

    Every ``*.txt`` file under ``<path>/train/{neg,pos}`` and
    ``<path>/test/{neg,pos}`` is read as UTF-8. Reviews found in a ``neg``
    folder get label 0, reviews found in a ``pos`` folder get label 1.

    Parameters
    ----------
    path : str
        Directory that contains the ``train`` and ``test`` folders.
    shuffle : bool, default True
        When True, the train set and the test set are each permuted
        (texts and labels with the same permutation).
    random_state : int, default 42
        Seed of the permutation.

    Returns
    -------
    (X_train_corpus, y_train, X_test_corpus, y_test)
        The corpora are lists of str, the labels are numpy arrays.
    '''
    import glob
    import os

    print("Loading the imdb data")

    def _read_split(split):
        # Read one split: all negative reviews first, then all positive ones.
        texts, labels = [], []
        for label, sentiment in enumerate(("neg", "pos")):
            pattern = os.path.join(path, split, sentiment, "*.txt")
            # glob returns files in filesystem order; sorting makes the
            # result (and therefore the seeded shuffle) reproducible.
            for file_name in sorted(glob.glob(pattern)):
                # `with` guarantees the handle is closed even if read() fails.
                with open(file_name, 'r', encoding="utf8") as f:
                    texts.append(f.read())
                labels.append(label)
        return texts, np.array(labels)

    X_train_corpus, y_train = _read_split("train")
    print("Train Data loaded.")

    X_test_corpus, y_test = _read_split("test")
    print("Test Data loaded.")

    if shuffle:
        # A private RandomState produces the same permutations as
        # np.random.seed(random_state) followed by np.random.permutation,
        # but leaves the global numpy RNG untouched.
        rng = np.random.RandomState(random_state)

        order = rng.permutation(len(y_train))
        X_train_corpus = [X_train_corpus[i] for i in order]
        y_train = y_train[order]

        order = rng.permutation(len(y_test))
        X_test_corpus = [X_test_corpus[i] for i in order]
        y_test = y_test[order]

    return X_train_corpus, y_train, X_test_corpus, y_test

In [3]:
import re

def load_list(filename, split_delimiter):
    '''
    Read a delimited text file (e.g. the contraction list) into a 2-D array.

    Each non-blank line is stripped of surrounding whitespace and split on
    ``split_delimiter``; the resulting rows are returned as a numpy array.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    split_delimiter : str
        Separator between the fields of one line.

    Returns
    -------
    numpy.ndarray
        One row per non-blank line of the file.
    '''
    vocabulary = []
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise become part of the first entry.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        for l in f:
            stripped = l.strip()
            # A blank line would add a one-element row and make the array
            # ragged, breaking the [i, 0] / [i, 1] lookups done by callers.
            if not stripped:
                continue
            vocabulary.append(stripped.split(split_delimiter))
    return np.asarray(vocabulary)

def cleanhtml(text):
    '''
    Remove HTML tags from a piece of text.

    Everything between a ``<`` and the next ``>`` on the same line is
    deleted, which already covers ``<br />`` line breaks. The text between
    tags is returned untouched.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML tags.

    Returns
    -------
    str
        The text with the tags removed.
    '''
    # The previous implementation additionally stripped every literal "br",
    # which mangled ordinary words ("breaking" -> "eaking"). The tag regex
    # alone is sufficient to drop <br> elements.
    return re.sub(r'<.*?>', '', text)

# Reference :
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

def replace_contraction(corpus, cont_list):
    '''
    Expand contractions in a text using a two-column lookup array.

    For every row of ``cont_list`` the text is lower-cased and each
    occurrence of column 0 is replaced by column 1. Because lower-casing
    happens before each replacement, the expansion of the last row keeps its
    own casing, and an empty ``cont_list`` returns the text unchanged.

    Parameters
    ----------
    corpus : str
        Text to process.
    cont_list : numpy.ndarray
        2-D array indexed as ``[row, 0]`` (contraction) and ``[row, 1]``
        (expansion).

    Returns
    -------
    str
        The text after all replacements.
    '''
    n_pairs = cont_list.shape[0]
    for row in range(n_pairs):
        lowered = corpus.lower()
        contraction = cont_list[row, 0]
        expansion = cont_list[row, 1]
        corpus = lowered.replace(contraction, expansion)
    return corpus

def word_singularize(corpus):
    '''
    Singularize the words that TextBlob tags as plural nouns (``NNS``).

    The text is POS-tagged once; every distinct word tagged ``NNS`` (except
    the literal ``yes``) is replaced by ``word.singularize()``. Replacement
    is done on whole words only, so a plural that happens to be a substring
    of another word (e.g. "hats" inside "whatsoever") no longer corrupts
    that other word.

    Parameters
    ----------
    corpus : str
        Text to process.

    Returns
    -------
    str
        The text with tagged plural nouns singularized.
    '''
    import re
    from textblob import TextBlob

    handled = set()
    for word, pos_tag in TextBlob(corpus).tags:
        if pos_tag != 'NNS' or word == 'yes' or word in handled:
            continue
        # Each distinct word only needs one pass over the text.
        handled.add(word)
        singular = str(word.singularize())
        # Look-arounds instead of \b so the match is anchored correctly even
        # if the token starts or ends with a non-word character.
        pattern = r'(?<!\w)' + re.escape(str(word)) + r'(?!\w)'
        # A callable replacement keeps backslashes in `singular` literal.
        corpus = re.sub(pattern, lambda _match: singular, corpus)
    return corpus

def update_corpus_contraction(X_corpus, contraction_file="contraction_list.txt"):
    '''
    Build a cleaned copy of a corpus.

    Each document goes through, in order: ``cleanhtml``,
    ``replace_contraction`` (with the list loaded from ``contraction_file``),
    ``word_singularize`` and finally ``&`` -> ``and``.

    The input sequence is NOT modified; a new list is returned. This keeps
    the raw corpus available to the caller (e.g. for pickling it separately
    from the cleaned one).

    Parameters
    ----------
    X_corpus : sequence of str
        Documents to clean.
    contraction_file : str, default "contraction_list.txt"
        Comma-separated file of contraction/expansion pairs.

    Returns
    -------
    list of str
        The cleaned documents, in the same order as the input.
    '''
    cont_list = load_list(contraction_file, ',')
    print(cont_list.shape)
    print('corpus update start')
    cleaned = []
    for document in X_corpus:
        document = cleanhtml(document)
        document = replace_contraction(document, cont_list)
        document = word_singularize(document)
        cleaned.append(document.replace('&', 'and'))
    print('corpus update end')
    print()
    return cleaned

def negative_positive_counts(X, y, word_index):
    '''
    Sum one column of ``X`` separately over the rows of each class.

    Parameters
    ----------
    X : 2-D matrix indexable as ``X[row_mask, column]``
        Document-by-word values.
    y : numpy.ndarray
        Labels; rows with ``y == 0`` count as negative, ``y == 1`` as positive.
    word_index : int
        Column of ``X`` to sum.

    Returns
    -------
    (neg_count, pos_count)
        Column total over the negative rows and over the positive rows.
    '''
    is_negative = (y == 0)
    is_positive = (y == 1)
    negatives = np.sum(X[is_negative, word_index])
    positives = np.sum(X[is_positive, word_index])
    return negatives, positives

def log_ratio_positive_negative(X, y, word_index):
    '''
    Add-one smoothed log ratio of positive to negative counts for one word.

    The counts come from ``negative_positive_counts``; the ratio is
    ``log(pos_count + 1) - log(neg_count + 1)``.

    Returns
    -------
    (log_ratio, neg_count, pos_count)
    '''
    counts = negative_positive_counts(X, y, word_index)
    negatives, positives = counts
    log_pos = np.log(positives + 1)
    log_neg = np.log(negatives + 1)
    return log_pos - log_neg, negatives, positives

def sort_top_words_with_count(X, y, words, filename, top_k=10):
    '''
    Rank words by |log(pos+1) - log(neg+1)| and write the top ones to a file.

    For each of the first ``len(words)`` columns of ``X`` the column total is
    computed over the negative rows (``y == 0``) and over the positive rows
    (``y == 1``). Words are ordered by the absolute value of the smoothed log
    ratio, largest first, and the first ``top_k`` are written one per line as

        word <TAB> log_ratio (2 decimals) <TAB> neg_count <TAB> pos_count

    Parameters
    ----------
    X : 2-D matrix supporting ``X[row_mask]`` and ``.sum(axis=0)``
        Document-by-word values.
        NOTE(review): presumably a dense array or a scipy sparse matrix —
        confirm against the caller.
    y : numpy.ndarray
        Labels (0 = negative, 1 = positive), one per row of ``X``.
    words : sequence
        Word for each column of ``X``.
    filename : str
        Output path WITHOUT extension; ``'.txt'`` is always appended.
    top_k : int, default 10
        Number of words to write.

    Returns
    -------
    None
    '''
    n_words = len(words)

    # The class masks and row selections do not depend on the word, so they
    # are computed once instead of once per column. np.asarray(...).ravel()
    # flattens the (1, n) matrix that sparse / np.matrix sums return.
    neg_count = np.asarray(X[y == 0].sum(axis=0)).ravel()[:n_words]
    pos_count = np.asarray(X[y == 1].sum(axis=0)).ravel()[:n_words]
    log_ratio = np.log(pos_count + 1) - np.log(neg_count + 1)

    sorted_indices_descending_abs = np.argsort(np.absolute(log_ratio))[::-1]

    filename = filename + '.txt'
    # The context manager closes the file; no explicit close() is needed.
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[:top_k]:
            w.write("%s\t%0.2f\t%d\t%d" % (str(words[i]), log_ratio[i], neg_count[i], pos_count[i]))
            w.write('\n')

In [4]:
# Root folder of the extracted aclImdb dataset (the one holding train/ and test/).
# Set the IMDB_DATA_DIR environment variable to point at your own copy; when it
# is not set, the original author's location is used so existing runs still work.
path = os.environ.get("IMDB_DATA_DIR", r"C:\Users\Anne Soraya\Documents\IIT_resources\Python\aclImdb")
# X_train_corpus , y_train, X_test_corpus , y_test = load_imdb(path)

In [5]:
# X_train_corpus_update = update_corpus_contraction(X_train_corpus)
# X_test_corpus_update = update_corpus_contraction(X_test_corpus)

In [29]:
def save_pickle(path, X):
    '''Pickle ``X`` into the file at ``path`` (opened in binary write mode).'''
    with open(path, mode='wb') as sink:
        pickle.Pickler(sink).dump(X)
def open_pickle(path):
    '''Return the object unpickled from the file at ``path``.'''
    # NOTE: unpickling can execute arbitrary code; only open files you trust.
    with open(path, mode='rb') as source:
        return pickle.load(source)
    
# One-off caching step, kept for reference: the raw corpus, the labels and the
# cleaned corpus were written to ./pickles/ so the slow loading / cleaning
# cells above do not have to be re-run.
# NOTE(review): the label file is saved below as "imdb_y__tr_corpus.pickle" but
# loaded further down as "imdb_y_tr.pickle" — confirm which name is correct,
# otherwise a fresh run cannot reproduce the cached labels.
# save_pickle("./pickles/imdb_X_tr_corpus.pickle", X_train_corpus)
# save_pickle("./pickles/imdb_y__tr_corpus.pickle", y_train)
# save_pickle("./pickles/imdb_x_tr_clean.pickle", X_train_corpus_update)

# Load the cached cleaned training corpus and its labels.
X_tr_clean = open_pickle("./pickles/imdb_x_tr_clean.pickle")
y_tr = open_pickle("./pickles/imdb_y_tr.pickle")

In [84]:
# Record the corpus positions of the positive and of the negative reviews.
# np.flatnonzero returns the matching positions as a 1-D array whatever the
# class sizes are (the previous reshape(12500) only worked for exactly
# 12500 reviews per class).
y_pos_indices = np.flatnonzero(y_tr == 1)
y_neg_indices = np.flatnonzero(y_tr == 0)

# Draw a seeded random sample of ranks so the inspected reviews are spread
# over the corpus. A rank r refers to y_pos_indices[r] / y_neg_indices[r],
# so ranks must be valid for both classes.
import random
random.seed(42)
n_per_class = min(len(y_pos_indices), len(y_neg_indices))
# With 12500 reviews per class this yields exactly the same 100 ranks as before.
rand = random.sample(range(0, n_per_class), min(100, n_per_class))
rand_indices = sorted(rand)
print(rand_indices)

[106, 409, 434, 488, 520, 711, 750, 1139, 1169, 1291, 1307, 1424, 1519, 1535, 1584, 1654, 1674, 1679, 1824, 2045, 2286, 2547, 2615, 2664, 2677, 2803, 3150, 3257, 3432, 3527, 3582, 3611, 3657, 3733, 3811, 3814, 4010, 4012, 4333, 4374, 4506, 4552, 4554, 4557, 4741, 4803, 5514, 5574, 5635, 5820, 5881, 5925, 5977, 6065, 6201, 6216, 6224, 6227, 6873, 6912, 6924, 7359, 7428, 7527, 7573, 8279, 8751, 8785, 8928, 8935, 9044, 9195, 9459, 9654, 9674, 9863, 9891, 9980, 10133, 10299, 10403, 10415, 10476, 10617, 10647, 10834, 10980, 11087, 11199, 11438, 11490, 11498, 11543, 11731, 11946, 11955, 12066, 12135, 12149, 12432]


In [86]:
def print_sentence(corpus):
    '''
    Print every sentence of a text on its own line, prefixed by its index.

    The text is split into sentences with TextBlob; output lines have the
    form ``<index> : <sentence>`` with indices starting at 0.
    '''
    from textblob import TextBlob
    blob = TextBlob(corpus)
    for position, sent in enumerate(blob.sentences):
        print(position, ':', sent)

# Inspect the first sampled positive review: its rank among the positives,
# its position in the corpus, its label, and its sentences.
first_rank = rand_indices[0]
corpus_index = y_pos_indices[first_rank]
print(first_rank)
print('corpus index : ', corpus_index)
print('label : ', y_tr[corpus_index])
print_sentence(X_tr_clean[corpus_index])

106
216
label :  1
0 : if the lion king was a disney version of hamlet, then the lion king 3: hakuna matata is a disney version of guildenstern and rosencrantz are dead.
1 : just like tom stoppard's beguiling film, we get to view the action from the point of view of two of the minor character from the original: timon, the meerkat with a penchant for eaking into song at the drop of a hat, and pumbaa, the warthog with flatulence issue.
2 : by following their story - rather than simba's - we get to see why all the animal bowed down as simba was presented from pride rock.
3 : we find out what made timon and pumbaa decide to follow simba back to pride rock to oust scar.
4 : and we find out how they dealt with the hyena's once and for all.
5 : nathan lane as timon gets most of the best joke, but he is ably supported by ernie sabella as pumbaa.
6 : it is also good to hear matthew broderick and whoopi goldberg reprising their role.
7 : julie kavner and jerry stiller lend their distinctive voic