In [1]:
import glob
import pickle
import re
import numpy as np
import pandas as pd
from tensorflow.contrib import learn

In [21]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def get_reviews(path, clean = True):
    complete_path = path + '/*.txt'
    files = glob.glob(complete_path)    
    reviews = [str(open(rev).readlines()[0]).strip() for rev in files]
    # Removes the tag <br />
    reviews = [rev.replace('<br />',' ') for rev in reviews]
    if clean:
        reviews = [clean_str(rev) for rev in reviews]
    return reviews

In [22]:
# Gets all the reviews
train_positive_reviews = get_reviews("data/aclImdb/train/pos")
train_negative_reviews = get_reviews("data/aclImdb/train/neg")
test_positive_reviews = get_reviews("data/aclImdb/test/pos")
test_negative_reviews = get_reviews("data/aclImdb/test/neg")

# Divide The train set into train and validation

# Concat all train reviews and write it on a file
train_reviews = train_positive_reviews + train_negative_reviews
output_train = open('data/all_train.txt', 'w')
for rev in train_reviews:
    print>>output_train, rev
output_train.close()

In [23]:
# Saves the Train/Test lists into pickle objects
pickle.dump(train_positive_reviews, open( "data/train_pos.p", "wb" ))
pickle.dump(train_negative_reviews, open( "data/train_neg.p", "wb" ))
pickle.dump(test_positive_reviews, open( "data/test_pos.p", "wb" ))
pickle.dump(test_negative_reviews, open( "data/test_neg.p", "wb" ))

In [68]:
# Loads the Train/Test objects
train_positive_reviews = pickle.load(open("data/train_pos.p","rb"))
train_negative_reviews = pickle.load(open("data/train_neg.p","rb"))
test_positive_reviews = pickle.load(open("data/test_pos.p","rb"))
test_negative_reviews = pickle.load(open("data/test_neg.p","rb"))

In [75]:
def get_train_sets():
    train_positive_reviews = pickle.load(open("data/train_pos.p","rb"))
    train_negative_reviews = pickle.load(open("data/train_neg.p","rb"))
    return train_positive_reviews, train_negative_reviews

def get_test_sets():
    test_positive_reviews = pickle.load(open("data/test_pos.p","rb"))
    test_negative_reviews = pickle.load(open("data/test_neg.p","rb"))
    return test_positive_reviews, test_negative_reviews

def label_data(positive_revs, negative_revs):
    # Generate the labels
    positive_labels = [[0, 1] for _ in positive_revs]
    negative_labels = [[1, 0] for _ in negative_revs]
    
    # Concatenates the positive and negative labels for train and val
    y_labels = np.concatenate([positive_labels, negative_labels], 0)
    
    x_train = positive_revs + negative_revs
     
    return [x_train, y_labels]
    
def split_train_validation(x_train, y_train, amount_val=.25):
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    print (shuffle_indices)
    x_shuffled = x_train[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    
    total_reviews = len(x_shuffled)
    training_num = total_reviews - int(total_reviews * amount_val)
    
    x_t = x_shuffled[:training_num]
    y_t = y_shuffled[:training_num]
    
    x_dev = x_shuffled[training_num:]
    y_dev = y_shuffled[training_num:]
    
    return [x_t, y_t], [x_dev, y_dev]

def get_train_validation(train_pos, train_neg, amount_val=.25):
    # Divides the sets
    total_reviews = len(train_pos)
    print("Num Total Reviews in set:", total_reviews)
    training_num = total_reviews - int(total_reviews * amount_val)
    print("Num Training Reviews:", training_num)
    
    train_pos_reviews_t = train_pos[:training_num]
    train_neg_reviews_t = train_neg[:training_num]
    train_pos_reviews_v = train_pos[training_num:]
    train_neg_reviews_v = train_neg[training_num:]
    
    # Generate the labels
    train_positive_labels = [[0, 1] for _ in train_pos_reviews_t]
    val_positive_labels = [[0, 1] for _ in train_pos_reviews_v]
    
    train_negative_labels = [[1, 0] for _ in train_neg_reviews_t]
    val_negative_labels = [[1, 0] for _ in train_neg_reviews_v]
    
    # Concatenates the positive and negative labels for train and val
    y_train = np.concatenate([train_positive_labels, train_negative_labels], 0)
    y_val = np.concatenate([val_positive_labels, val_negative_labels], 0)
    
    # Creates one list for positive and negative reviews
    x_train = train_pos_reviews_t + train_neg_reviews_t
    x_val = train_pos_reviews_v + train_neg_reviews_v
    
    print("x_train:", len(x_train))
    print("y_train:", len(y_train))
    print("x_val:", len(x_val))
    print("y_val:", len(y_val))
    
    return [x_train, y_train],[x_val, y_val]

def get_test_labeled(test_pos, test_neg):
    # Generate the labels
    test_positive_labels = [[0, 1] for _ in test_pos]
    test_negative_labels = [[1, 0] for _ in test_neg]
    
    y = np.concatenate([test_positive_labels, test_negative_labels], 0)
    x_test = test_pos + test_neg
    
    return [x_test, y]
    
#train, validation = get_train_validation(train_positive_reviews, train_negative_reviews)
x_t, y_t = label_data(train_positive_reviews, train_negative_reviews)

In [76]:
# Label the data
x_train, y_train = label_data(train_positive_reviews, train_negative_reviews)
# Separates in Train and Dev
x_train_list, x_dev_list = split_train_validation(x_train, y_train)

[18634  1333 20315 ..., 17728  7293 17673]


In [15]:
# Loads the vocabulary
def load_vocabulary(file_path, num_words=10000):
    with open(file_path) as vocab:
        vocab_list = [next(vocab) for x in range(num_words)]
    vocab_list = [str(vocab).strip() for vocab in vocab_list]
    return vocab_list
#
#load_vocabulary("data/vocab_unigrams_no_counts/part-00000")

In [None]:
# Spark Unigrams
text_file = sc.textFile('all_train.txt')
counts = text_file.flatMap(lambda line: line.split(" ")).map(lambda word:(word, 1)).reduceByKey(lambda a, b: a+b).sortBy(lambda a: -a[1])
# Comment this line, if you want tuples
just_words = counts.map(lambda tuple: tuple[0])
just_words.saveAsTextFile("vocab_unigrams_no_counts")

# Spark Bi-grams
bigrams = text_file.map(lambda x:x.split()).flatMap(lambda x: [((x[i],x[i+1]),1) for i in range(0,len(x)-1)])
count_bigrams = bigrams.reduceByKey(lambda x, y: x+y).sortBy(lambda a: -a[1])
just_bigrams = count_bigrams.map(lambda tuple: tuple[0][0] + ' ' + tuple[0][1])
just_bigrams.saveAsTextFile("vocab_bigrams_no_counts")

In [16]:
# This is a test for the vocabulary

vocabulary = load_vocabulary("data/vocab_unigrams_no_counts/part-00000")
vocabulary = [str(vocab).strip() for vocab in vocabulary]
vocabulary[:5]
max_len_vocabulary = len(vocabulary)
print (max_len_vocabulary)

10000


In [5]:
train_reviews = train_positive_reviews + train_negative_reviews
print(len(train_reviews))
print(train_reviews[0])

25000
reda is a young frenchman of moroccan descent despite his muslim heritage , he is very french in attitudes and values out of the blue , his father announces that reda will be driving him to the hajj \( pilgrimage \) to mecca something that reda has no interest in doing but agrees only out of obligation as a result , from the start , reda is angry but being a traditional muslim man , his father is difficult to talk to or discuss his misgivings both father and son seem very rigid and inflexible and it 's very ironic when the dad tells his son that he should not be so stubborn when i read the summary , it talks about how much the characters grew and began to know each other however , i really do n't think they did and that is the fascinating and sad aspect of the film sure , there were times of understanding , but so often there was an undercurrent of hostility and repression i actually liked this and appreciated that there was n't complete resolution of this as it would have seemed

In [67]:
def set_oov(reviews, vocabulary):
    updated_reviews = []
    for review in reviews:
        up_review = []
        splitted_review = review.split(" ")
        for i, word in enumerate(splitted_review):
            if word not in vocabulary:
                splitted_review[i] = 'oov'
            else:
                splitted_review[i] = word
        new_review = (' ').join(splitted_review)
        updated_reviews.append(new_review)
        print(new_review)
        break
    return updated_reviews
            
def set_oov_tag(reviews, vocabulary):
    updated_reviews = []
    set_vocabulary = set(vocabulary)
    for review in reviews:
        set_review = set(review.split(" "))
        oov_words = set_review - set_vocabulary
        #print(list(oov_words))
        
        dic_oov_words = {k:'oov' for k in oov_words}
        #print(dic_oov_words)
        if len(dic_oov_words) >= 1:
            rep = dict((re.escape(k), v) for k, v in dic_oov_words.items())
            pattern = re.compile("|".join(rep.keys()))
            oov_review = pattern.sub(lambda m: rep[re.escape(m.group(0))], review)
            updated_reviews.append(oov_review)
        else:
            updated_reviews.append(review)
    return updated_reviews

set_oov(train_reviews, vocabulary)
#print(len(new_reviews))

oov is a young frenchman of oov descent despite his muslim oov , he is very french in attitudes and values out of the blue , his father oov that oov will be driving him to the oov \( oov \) to oov something that oov has no interest in doing but agrees only out of oov as a result , from the start , oov is angry but being a traditional muslim man , his father is difficult to talk to or discuss his oov both father and son seem very rigid and oov and it 's very ironic when the dad tells his son that he should not be so stubborn when i read the summary , it talks about how much the characters grew and began to know each other however , i really do n't think they did and that is the fascinating and sad aspect of the film sure , there were times of understanding , but so often there was an oov of oov and oov i actually liked this and appreciated that there was n't complete resolution of this as it would have seemed phony overall , the film is well acted and fascinating giving oov an unusual i

["oov is a young frenchman of oov descent despite his muslim oov , he is very french in attitudes and values out of the blue , his father oov that oov will be driving him to the oov \\( oov \\) to oov something that oov has no interest in doing but agrees only out of oov as a result , from the start , oov is angry but being a traditional muslim man , his father is difficult to talk to or discuss his oov both father and son seem very rigid and oov and it 's very ironic when the dad tells his son that he should not be so stubborn when i read the summary , it talks about how much the characters grew and began to know each other however , i really do n't think they did and that is the fascinating and sad aspect of the film sure , there were times of understanding , but so often there was an oov of oov and oov i actually liked this and appreciated that there was n't complete resolution of this as it would have seemed phony overall , the film is well acted and fascinating giving oov an unusu

In [46]:
new_reviews[1000]

"christopher smith is an obvious horror fan and this is made clear in his debut horror flick oov oov although a little bit loose on information , proves itself worthy of a true gory classic a little less glossy than recent us horrors \\( oov horror remake , house of wax remake \\) this dark and gruesome tale follows kate \\( oov oov \\) through the oov of underground oov and oov oov as she , and a number of others along the way , try and flee a murderous oov though some bad reviews have oov this film , i truly believe that on a tight budget and for a uk production from first time director smith that oov truly does live up to its name it delivers fast paced gory action more or less from the beginning , sometimes too fast as the story is oov in some areas , but with a perfect location and the best character reaction at the end i 've seen in a while , oov delivers some scenes that are definitely the stuff of nightmares"

In [26]:
train_data_features = train_data_features.toarray()
print(len(vectorizer.vocabulary_))
print("Review 1 Original:", len(train_reviews[0]))
print(""len(train_data_features[0]))
for i in train_data_features[0]:
    print(i)

10000
Review 1 Original: 1435
10000
15
0
15
0
9
5
6
6
2
0
2
5
0
2
2
0
0
0
3
6
0
0
0
0
0
2
0
2
5
1
2
0
0
0
0
1
0
2
0
2
1
1
3
0
1
0
1
3
1
2
0
0
0
2
0
0
0
2
3
1
0
1
0
0
0
0
0
1
1
0
0
0
1
0
0
0
1
1
0
0
1
0
0
0
1
0
1
1
1
1
0
0
0
1
1
0
1
0
0
0
0
0
0
0
0
0
2
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
3
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
1
0
0
2
0
0
0
0
0
0
0
0
0
0
0
0


In [50]:
MAX_SENTENCE = 200

vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length=MAX_SENTENCE, 
                                                          vocabulary=vocabulary)

In [51]:
list(vocab_processor.fit_transform(train_reviews[0]))

AttributeError: 'list' object has no attribute 'add'

In [27]:
print(x_vocab[0])

[  1   2   3   4   5   6   7   8   9  10  11  12  13   2  14  15  16  17
  18  19  20   6  21  22  10  23  24  25   1  26  27  28  29  30  21  31
  32  30  33  34  25   1  35  36  37  16  38  39  40  41  20   6  42  43
   3  44  45  21  46   1   2  47  39  48   3  49  11  50  10  23   2  51
  30  52  30  53  54  10  55  56  23  18  57  58  14  59  18  60  18  61
  62  14  63  64  21  65  66  10  57  25  13  67  68  27  69  70  64  71
  72  21  73  61  74  75  76  77  21  78  79  18  80  30  81  82  83  84
  71  85  86  87  88  89  90  18  25   2  21  91  18  92  93   6  21  94
  95  96  97  98   6  99  39  69 100  96 101 102 103   6 104  18 105  71
 106 107 108  18 109  25  96 101  87 110 111   6 108  43  61 112 113 114
 115 116  21  94   2 117 118  18  91 119 120 102 121 122 123 124  18  21
  31  61]
