In [26]:
import glob
import pickle
import re
import numpy as np
import pandas as pd

In [21]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character that is not a letter, digit, parenthesis, comma,
         exclamation mark, question mark, apostrophe or backtick becomes a space;
      2. common English contraction suffixes are split off with a leading space
         ("isn't" -> "is n't", "it's" -> "it 's");
      3. the punctuation marks , ! ( ) ? are surrounded by spaces so that they
         become stand-alone tokens;
      4. runs of whitespace are collapsed, the result is stripped and lower-cased.

    Unlike the original implementation, the parentheses and the question mark
    are emitted as plain "(", ")" and "?" tokens. The original replacement
    strings carried a stray backslash, which produced tokens such as "\\(" even
    though step 1 removes backslashes from the input.

    :param string: raw text to clean.
    :return: the cleaned, lower-cased, space-separated token string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # The order of the suffixes is kept from the original implementation.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    # Isolate each punctuation mark as its own token.
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def get_reviews(path, clean = True):
    """
    Reads one review from every ``*.txt`` file located directly inside ``path``.

    Only the first line of each file is used. Surrounding whitespace is
    stripped and the HTML tag ``<br />`` is replaced by a space. An empty file
    yields an empty review instead of raising an IndexError.

    Files are processed in sorted path order so that the returned list is
    reproducible between runs (``glob`` returns paths in arbitrary order).

    :param path: directory containing the review files.
    :param clean: when true, every review is additionally passed through
        ``clean_str``.
    :return: list of review strings, one per file.
    """
    review_files = sorted(glob.glob(path + '/*.txt'))

    reviews = []
    for review_file in review_files:
        # The context manager closes the handle even if reading fails.
        with open(review_file) as handle:
            first_line = handle.readline()
        # Removes the tag <br />
        reviews.append(first_line.strip().replace('<br />', ' '))

    if clean:
        reviews = [clean_str(rev) for rev in reviews]
    return reviews

In [22]:
# Gets all the reviews
train_positive_reviews = get_reviews("data/aclImdb/train/pos")
train_negative_reviews = get_reviews("data/aclImdb/train/neg")
test_positive_reviews = get_reviews("data/aclImdb/test/pos")
test_negative_reviews = get_reviews("data/aclImdb/test/neg")

# The train set is divided into train and validation later on, by divide_train.

# Concatenate all train reviews and write them to a file, one review per line
train_reviews = train_positive_reviews + train_negative_reviews
# The context manager guarantees the file is flushed and closed; write() is
# used instead of the Python 2-only ``print >>file`` statement.
with open('data/all_train.txt', 'w') as output_train:
    for rev in train_reviews:
        output_train.write(rev + '\n')

In [23]:
# Saves the Train/Test lists into pickle objects
for pickle_path, review_list in (
    ("data/train_pos.p", train_positive_reviews),
    ("data/train_neg.p", train_negative_reviews),
    ("data/test_pos.p", test_positive_reviews),
    ("data/test_neg.p", test_negative_reviews),
):
    # Close every file explicitly so the data is fully flushed to disk before
    # anything tries to read it back.
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(review_list, pickle_file)

In [24]:
# Loads the Train/Test objects
def _load_pickle(pickle_path):
    """Returns the object stored in ``pickle_path``, closing the file afterwards."""
    # NOTE: pickle can execute arbitrary code while loading; only use it on
    # files written by this notebook, never on untrusted input.
    with open(pickle_path, "rb") as pickle_file:
        return pickle.load(pickle_file)

train_positive_reviews = _load_pickle("data/train_pos.p")
train_negative_reviews = _load_pickle("data/train_neg.p")
test_positive_reviews = _load_pickle("data/test_pos.p")
test_negative_reviews = _load_pickle("data/test_neg.p")

In [29]:
def add_labes(reviews_list, pos=True):
    """
    Pairs a list of reviews with one-hot sentiment labels.

    Positive reviews are labelled ``[0, 1]`` and negative reviews ``[1, 0]``.

    :param reviews_list: list of reviews that all share the same sentiment.
    :param pos: True when the reviews are positive, False when negative.
    :return: ``[reviews, y]`` where ``reviews`` is a copy of ``reviews_list``
        and ``y`` is an integer array of shape ``(len(reviews_list), 2)``.
    :raises TypeError: if ``pos`` is a list or tuple, which means a second list
        of reviews was passed where the sentiment flag is expected.
    """
    if isinstance(pos, (list, tuple)):
        raise TypeError("pos must be a boolean flag, not a list of reviews")

    # Generate labels
    label = [0, 1] if pos else [1, 0]
    # np.tile keeps the (n, 2) shape even when there are no reviews.
    y = np.tile(np.array(label), (len(reviews_list), 1))
    return [list(reviews_list), y]

# Label each class separately, then stack positives before negatives.
train_pos_x, train_pos_y = add_labes(train_positive_reviews, pos=True)
train_neg_x, train_neg_y = add_labes(train_negative_reviews, pos=False)
train_data = [train_pos_x + train_neg_x,
              np.concatenate([train_pos_y, train_neg_y], 0)]

[[0 1]
 [0 1]
 [0 1]
 ..., 
 [1 0]
 [1 0]
 [1 0]]


In [17]:
def divide_train(train_pos, train_neg, amount_val=.25):
    """
    Splits the positive and negative training reviews into train/validation.

    The last ``amount_val`` fraction of each list goes to the validation set
    and the rest stays in the training set. The split point is computed
    separately for each list, so both classes keep the requested ratio even
    when the two lists have different lengths. No shuffling is performed.

    :param train_pos: list of positive reviews.
    :param train_neg: list of negative reviews.
    :param amount_val: fraction (between 0 and 1) of each list used for
        validation.
    :return: tuple ``(train_reviews, validation_reviews)``; in both lists the
        positive reviews come before the negative ones.
    :raises ValueError: if ``amount_val`` is outside ``[0, 1]``.
    """
    if not 0 <= amount_val <= 1:
        raise ValueError("amount_val must be between 0 and 1")

    def _split(reviews):
        # Same rounding as before: the validation part is truncated down.
        training_num = len(reviews) - int(len(reviews) * amount_val)
        return reviews[:training_num], reviews[training_num:]

    train_pos_reviews_t, train_pos_reviews_v = _split(train_pos)
    train_neg_reviews_t, train_neg_reviews_v = _split(train_neg)

    train_reviews_t = train_pos_reviews_t + train_neg_reviews_t
    train_reviews_v = train_pos_reviews_v + train_neg_reviews_v

    return train_reviews_t, train_reviews_v

# Keep both splits for later use instead of discarding them, and display only
# their sizes rather than every review.
train_reviews_t, train_reviews_v = divide_train(train_positive_reviews, train_negative_reviews)
len(train_reviews_t), len(train_reviews_v)

Total Reviews: 12500
9375
Training 18750
Validation 6250


In [9]:
# Loads the vocabulary
def load_vocabulary(file_path, num_words=10000):
    """
    Reads at most the first ``num_words`` lines of a vocabulary file.

    Each line of the file is treated as one vocabulary entry. The trailing
    line break is removed so that entries can be compared directly with
    tokens. If the file holds fewer than ``num_words`` lines, all of them are
    returned.

    :param file_path: path of the text file with one entry per line.
    :param num_words: maximum number of entries to load.
    :return: list with the loaded entries, in file order.
    """
    vocab_list = []
    with open(file_path) as vocab:
        for line in vocab:
            # Stop as soon as enough entries were collected; the rest of the
            # file is never read.
            if len(vocab_list) >= num_words:
                break
            vocab_list.append(line.rstrip("\r\n"))
    return vocab_list

# Keep the loaded vocabulary for later use and display only a short summary
# (number of entries and the first entry) instead of the whole list.
vocabulary = load_vocabulary("data/vocab_unigrams_no_counts/part-00000")
len(vocabulary), vocabulary[:1]

10000
the



In [None]:
# Spark Unigrams
# Expects an existing SparkContext named `sc`.
# NOTE(review): this reads 'all_train.txt' relative to the working directory,
# while the earlier cell writes 'data/all_train.txt' -- confirm where Spark is
# launched from.
text_file = sc.textFile('all_train.txt')
# Count every space-separated word and order the words by descending count.
counts = (
    text_file
    .flatMap(lambda review: review.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda entry: -entry[1])
)
# Drops the counts and keeps only the words; save `counts` instead of
# `just_words` below if the (word, count) tuples are wanted.
just_words = counts.map(lambda entry: entry[0])
just_words.saveAsTextFile("vocab_unigrams_no_counts")

# Spark Bi-grams
# Pair every word with its successor inside the same review.
bigrams = (
    text_file
    .map(lambda review: review.split())
    .flatMap(lambda words: [(pair, 1) for pair in zip(words, words[1:])])
)
count_bigrams = (
    bigrams
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda entry: -entry[1])
)
# Each bigram is written as its two words separated by a single space.
just_bigrams = count_bigrams.map(lambda entry: ' '.join(entry[0]))
just_bigrams.saveAsTextFile("vocab_bigrams_no_counts")