In [1]:
import numpy as np
import os
import shutil
import sys

In [2]:
import warnings
# Install a filter that ignores every FutureWarning
# (presumably to keep library deprecation noise out of the cell outputs).
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Root directory that holds all datasets, given as a relative path.
datasets = '../datasets/'
# Location of the GloVe text file parsed in the next cell
# (the file name suggests 100-dimensional vectors — TODO confirm).
glove_file_location = os.path.join(datasets, 'glove.6B/glove.6B.100d.txt')

In [4]:
%%time 
# Process the Glove vectors into a dictionary
embeddings_index = {}
max_database_size = 400000
with open(glove_file_location, encoding='utf-8') as f:
    for line in f:
        parsed_line = line.split()
        # The explicit string conversion below seems to fix a weird unicode runtime issue
        word = str(parsed_line[0]) 
        vector = np.array(parsed_line[1:], dtype='float32')
        embeddings_index[word] = vector
        if len(embeddings_index) >= max_database_size:
            break

print('Loaded {0} words from database.'.format(len(embeddings_index)))
    

Loaded 400000 words from database.
CPU times: user 14.3 s, sys: 492 ms, total: 14.8 s
Wall time: 15 s


#### Tokenize the raw IMDB dataset

- Use 10000 words to tokenize
- Use 200 training samples only
- Use 10000 validation samples
- Use 100 words from each sample 

In [5]:
num_words = 10000       # number of words used to tokenize
num_train = 200         # number of training samples to use
num_validation = 10000  # number of validation samples to use
maxlen = 100            # number of words used from each sample

In [6]:
# Directory of the raw IMDB data inside the datasets root.
imdb_location = os.path.join(datasets, 'aclImdb')
# Sub-directory with the training split; the loading cell reads its 'neg' and 'pos' folders.
train_location = os.path.join(imdb_location, 'train')

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [8]:
%%time

text = []
labels = []
folders = ['neg', 'pos']

for folder in folders:
    complete_path = os.path.join(train_location, folder)
    for filename in os.listdir(complete_path):
        if filename[-4:] == '.txt':
            file = os.path.join(complete_path, filename)
            with open(file, encoding='utf-8') as f:
                review = f.read()
                text.append(str(review))
                if folder == 'neg':
                    labels.append(0)
                else:
                    labels.append(1)
                
    print('Completed processing *{0}* folder. Total reviews: {1}'.format(folder, len(labels)))


Completed processing *neg* folder. Total reviews: 12500
Completed processing *pos* folder. Total reviews: 25000
CPU times: user 1.73 s, sys: 2.35 s, total: 4.08 s
Wall time: 5.59 s
