In [126]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import spacy
import string
import random
import pickle as pkl
import time 
from os import listdir 
from collections import Counter
import torch
from torch.utils.data import Dataset
from collections import Counter

# Import and Process Data 

In [111]:
# helper methods to load reviews from directories 

def load_single_review(fdir, fname): 
    """ Takes as input file directory and file name of a single review, returns review as string """
    fpath = fdir + '/' + fname 
    with open(fpath, 'r') as f: 
        review = f.read()
        return review 
    
def load_dir_reviews(fdir): 
    """ Takes as input file directory where reviews are stored, returns them as a list of review strings """
    fnames = [f for f in listdir(fdir)]
    reviews = [load_single_review(fdir, fname) for fname in fnames]
    return reviews

def combine_data(neg_reviews, pos_reviews): 
    """ Combines lists of negative and positive reviews, returns a combined dataset comprising reviews and labels """
    neg_with_labels = [(review, 0) for review in neg_reviews] 
    pos_with_labels = [(review, 1) for review in pos_reviews]
    combined = neg_with_labels + pos_with_labels
    combined = random.sample(combined, len(combined))
    reviews = [comb[0] for comb in combined]
    labels = [comb[1] for comb in combined]
    return reviews, labels 

In [112]:
# load reviews into lists 
train_val_neg = load_dir_reviews('aclImdb/train/neg')
train_val_pos = load_dir_reviews('aclImdb/train/pos')
test_neg = load_dir_reviews('aclImdb/test/neg')
test_pos = load_dir_reviews('aclImdb/test/pos')

In [113]:
# randomly split train into train vs. validation sets 
train_split = int(20000 / 2) 
train_neg = train_val_neg[:train_split]
train_pos = train_val_pos[:train_split]
val_neg = train_val_neg[train_split:]
val_pos = train_val_pos[train_split:]

In [114]:
# combine pos and neg reviews to get unified datasets 
train_data, train_labels = combine_data(train_neg, train_pos)
val_data, val_labels = combine_data(val_neg, val_pos)
test_data, test_labels = combine_data(test_neg, test_pos)
print ("Train dataset size is {}".format(len(train_data)))
print ("Validation dataset size is {}".format(len(val_data)))
print ("Test dataset size is {}".format(len(test_data)))

Train dataset size is 20000
Validation dataset size is 5000
Test dataset size is 25000


# Tokenization

In [115]:
# helper functions to tokenize reviews 

tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation 

def tokenize(review): 
    """ Takes review as input and outputs a list of tokens in lowercase without punctuation """ 
    tokens = tokenizer(review)
    return [token.text.lower() for token in tokens if (token.text not in punctuations)]

def tokenize_dataset(dataset):
    """ Takes as input a dataset comprising a list of reviews, outputs the tokenized dataset along with 
        a list comprising all the tokens from the dataset """
    token_dataset = []
    for sample in dataset:
        tokens = tokenize(sample)
        token_dataset.append(tokens)
    return token_dataset 

def save_tokens_to_disk(dataset, destination_path): 
    """ Tokenize dataset as save as pickle to destination path """
    start_time = time.time() 
    token_dataset = tokenize_dataset(dataset)
    with open(destination_path, "wb") as f: 
        pkl.dump(token_dataset, f)
    time_elapsed = (time.time() - start_time) / 60.0 
    print("Data tokenized and saved as {} in {:.1f} minutes".format(destination_path, time_elapsed))

In [116]:
save_tokens_to_disk(val_data, "data/val_data_tokens.p")
save_tokens_to_disk(train_data, "data/train_data_tokens.p")
save_tokens_to_disk(test_data, "data/test_data_tokens.p")

Data tokenized and saved as data/val_data_tokens.p in 4.9 minutes
Data tokenized and saved as data/train_data_tokens.p in 15.6 minutes
Data tokenized and saved as data/test_data_tokens.p in 16.7 minutes


In [178]:
# load saved tokens 
train_data_tokens = pkl.load(open("data/train_data_tokens.p", "rb"))
val_data_tokens = pkl.load(open("data/val_data_tokens.p", "rb"))
test_data_tokens = pkl.load(open("data/test_data_tokens.p", "rb"))
all_train_tokens = [item for sublist in train_data_tokens for item in sublist] 

# double check 
print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))
print ("Test dataset size is {}".format(len(test_data_tokens)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 4808696


In [179]:
token_counter = Counter(all_train_tokens) 
vocab, count = zip(*token_counter.most_common(10))
count

(261965, 130352, 128951, 116768, 107941, 87781, 74745, 74062, 66302, 58666)

In [180]:
# create vocabulary from 10000 most common tokens in the training set 

max_vocab_size = 10000 
PAD_IDX = 0 
UNK_IDX = 1

def build_vocab(all_tokens): 
    """ Takes list of all tokens and returns:
        - id2token: list of tokens, where id2token[i] returns token that corresponds to i-th token 
        - token2id: dictionary where keys represent tokens and corresponding values represent their indices
    """
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2, 2+len(vocab))))
    id2token = ['<pad>', '<unk>'] + id2token 
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token 
    
token2id, id2token = build_vocab(all_train_tokens)

In [181]:
# check the dictionary by loading random token from it

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 5716 ; token receiving
Token receiving; token id 5716


In [190]:
# convert token to id in the dataset 

def token2index_dataset(tokens_data): 
    indices_data = []
    for datum in tokens_data: 
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in datum]
        indices_data.append(index_list)
    return indices_data 

train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)

In [191]:
# check size of data 
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


In [192]:
# check tokenization of dataset 
print(train_data_tokens[0])
print(train_data_indices[0])

['i', 'was', 'lying', 'on', 'my', 'bed', 'with', 'a', 'really', 'bad', 'cold', 'or', 'flu', 'or', 'whatever', 'i', 'figure', 'maybe', 'i', "'d", 'kill', 'some', 'time', 'watching', 'some', 'horror', 'movies', 'my', 'mom', 'bought', 'for', 'me', 'a', 'little', 'while', 'ago', 'i', 'wish', 'i', 'never', 'picked', 'this', 'movie', 'after', 'i', 'watched', 'it', 'i', 'felt', 'even', 'more', 'sick', 'and', 'i', 'wanted', 'to', 'throw', 'up', 'afterwords(when', 'i', 'got', 'better', 'of', 'course', 'i', 'did', 'some', 'research', 'on', 'dennis', 'l.rader', 'and', 'i', 'noticed', 'that', 'the', 'dennis', 'in', 'the', 'movie', 'was', 'nothing', 'like', 'the', 'real', 'one', 'i', 'hope', 'that', 'no', 'one', 'ever', 'watches', 'this', 'movie', 'but', 'if', 'they', 'ever', 'do', 'do', "n't", 'eat', 'or', 'you', "'ll", 'feel', 'the', 'way', 'i', 'felt', 'after', 'i', 'first', 'watched', 'it', 'i', 'think', 'you', 'would', 'have', 'a', 'better', 'time', 'watching', 'the', 'santa', 'claus', '3', 'a

# Create PyTorch DataLoader

In [194]:
# define custom dataset class and collate function for data loader 

class MovieReviewsDataset(Dataset): 
    """ 
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, label_list): 
        """ 
        Initialize dataset by passing in a list of movie review tokens and a list of labels 
        """
        self.data_list = data_list 
        self.label_list = label_list 
        assert (len(self.data_list) == len(self.label_list))
        
    def __len__(self): 
        return len(self.data_list)
    
    def __getitem__(self, key): 
        """
        Triggered when dataset[i] is called, outputs a list of tokens, length of list, and label of the data point
        """
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.label_list[key]
        return [token_idx, len(token_idx), label]
    
def collate_func(batch): 
    """ 
    Customized function for DataLoader that dynamically pads the batch so that the data have the same length
    """
    data_list = [] 
    label_list = [] 
    length_list = [] 
    
    for datum in batch:         
        label_list.append(datum[2])
        length_list.append(datum[1])
        # pad data before appending 
        padded_vec = np.pad(array = np.array(datum[0]), 
                            pad_width = ((0, MAX_SENTENCE_LENGTH - datum[1])), 
                            mode = 'constant', 
                            constant_values = 0)
        data_list.append(padded_vec)
        
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [204]:
# create data loaders for train/val/test datasets 

MAX_SENTENCE_LENGTH = 200 
BATCH_SIZE = 32 

train_dataset = MovieReviewsDataset(train_data_indices, train_labels) 
train_loader = torch.utils.data.DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, 
                                           collate_fn = collate_func, shuffle = True)

val_dataset = MovieReviewsDataset(val_data_indices, val_labels) 
val_loader = torch.utils.data.DataLoader(dataset = val_dataset, batch_size = BATCH_SIZE, 
                                         collate_fn = collate_func, shuffle = True)

test_dataset = MovieReviewsDataset(test_data_indices, test_labels) 
test_loader = torch.utils.data.DataLoader(dataset = test_dataset, batch_size = BATCH_SIZE, 
                                          collate_fn = collate_func, shuffle = True)

In [205]:
# test data loader on training data 
for i, (data, lengths, labels) in enumerate(train_loader):
    print ("Printing a sample datum (of data length {}):".format(lengths[0]))
    print ("Data is: {}".format((data[0])))
    print ("Label is {}".format((labels[0])))
    break 

Printing a sample datum (of data length 200):
Data is: tensor([   2, 2171,   37,  430,   11,   19,    4, 8224,   46,  687, 1038,    2,
         215,    8,   13,  573,   27,   46,   24,   81,  184,   42, 1315,    9,
           4,   55,  195,   60,   42,   27,  231,   29, 2710,   12,    2, 1670,
           5,    8,   15,   12,    2,   76, 1246,  184,  108,  167,  198,   34,
          80,   97,    4, 1548,   35, 1455,   58, 5509,    3, 2278,    4, 6865,
          17, 2731,   49,   65,    8,   15,   12,  782,   20,    2,    1,  852,
           3,    1,   33,   84,   37,    1,  359,   71,  121,    6,   97,  193,
         132,   16,    2,  926,  214,  455,    3,  455,  174,  637,  249,   76,
          49,   23,  187,   25,  175,    5,    2, 5104,   23,  187,  175,    5,
           2,    1,   14,  106, 1325,  768,    7,   12, 1214,    1,   54, 1214,
          64,    1,   61,   54,    9,   36,  127,    1, 7650,  201, 3139,   83,
         362,   23, 2889,    5,    1, 2580,   35,    2,   87,  48