In [1]:
import numpy as np
import pandas as pd
import os
import csv
from datetime import datetime
from sklearn.model_selection import train_test_split
import pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.core import Activation
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, RNN
from keras.layers.embeddings import Embedding

from keras import backend as K

from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Base directory for every on-disk artifact used in this notebook: the raw
# CSV and its pickle cache, fitted tokenizers, processed splits and saved models.
DATA_ROOT = 'data'
# Passed as random_state to train_test_split so the train/test split is reproducible.
SEED = 1000

In [3]:
def load_data(file_name):
    """Load a CSV from DATA_ROOT into a DataFrame, caching it as a pickle.

    The first call parses the CSV and writes the frame to
    ``<DATA_ROOT>/<file_name>.pkl``; later calls read that pickle instead.

    Args:
        file_name: Name of the CSV file, relative to DATA_ROOT.

    Returns:
        The pandas DataFrame read from the pickle cache or from the CSV.
    """
    pickle_name = os.path.join(DATA_ROOT, file_name + '.pkl')
    time_start = datetime.now()
    if os.path.isfile(pickle_name):
        print('loading from pickle...')
        # NOTE: unpickling can execute arbitrary code; only trusted files
        # should ever be placed at this path.
        review_data = pd.read_pickle(pickle_name)
    else:
        print('loading from csv...')
        review_data = pd.read_csv(os.path.join(DATA_ROOT, file_name))
        review_data.to_pickle(pickle_name)
    # Report real seconds; str(timedelta) renders as H:MM:SS.ffffff, which made
    # the old "... seconds" message misleading.
    elapsed_seconds = (datetime.now() - time_start).total_seconds()
    print('Loaded in ' + '{:.2f}'.format(elapsed_seconds) + ' seconds')
    return review_data

In [4]:
def get_tokenizer(vocab_size, train_text=None):
    """Return a Keras Tokenizer limited to ``vocab_size`` words, cached on disk.

    A tokenizer previously pickled under ``<DATA_ROOT>/tokenizers`` for this
    vocabulary size is reused; otherwise a new one is fit on ``train_text`` and
    pickled for later calls.

    Args:
        vocab_size: Passed to ``Tokenizer(num_words=...)`` and used in the
            cache file name.
        train_text: Texts to fit on. Only needed when no cached tokenizer
            exists for ``vocab_size``.

    Returns:
        The loaded or newly fitted Tokenizer.

    Raises:
        ValueError: If there is no cached tokenizer and ``train_text`` is None.
    """
    tokenizer_dir = os.path.join(DATA_ROOT, 'tokenizers')
    tokenizer_file_name = os.path.join(tokenizer_dir, 'tokenizer_' + str(vocab_size) + '.pkl')
    time_start = datetime.now()
    if os.path.isfile(tokenizer_file_name):
        print('Loading tokenizer...')
        # NOTE: unpickling can execute arbitrary code; only trusted files
        # should ever be placed at this path.
        with open(tokenizer_file_name, 'rb') as file:
            tokenizer = pickle.load(file)
    else:
        # Fail with a clear message instead of letting fit_on_texts(None) blow up.
        if train_text is None:
            raise ValueError(
                'No cached tokenizer at ' + tokenizer_file_name +
                ' and no train_text was given to fit a new one')
        print('Training tokenizer...')
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(train_text)

        # The cache directory may not exist on a fresh checkout.
        os.makedirs(tokenizer_dir, exist_ok=True)
        with open(tokenizer_file_name, 'wb') as file:
            pickle.dump(tokenizer, file)

    print('Got tokenizer for vocab size: ' + str(vocab_size) + ' in ' + str(datetime.now() - time_start))
    return tokenizer

In [5]:
def prep_data(vocab_size=100, review_length=250, num_reviews=None):
    """Build and cache the tokenized, padded train/test split for one configuration.

    The result is pickled to
    ``<DATA_ROOT>/processed_data/<vocab_size>_<review_length>_<num_reviews>.pkl``
    as the list ``[x_train, x_test, y_train, y_test]``. If that file already
    exists the function returns without doing any work.

    Args:
        vocab_size: Vocabulary size of the tokenizer used to encode the text.
        review_length: Length every encoded review is padded/truncated to.
        num_reviews: If truthy, keep only the first ``num_reviews`` rows of
            each of the train and test sets; otherwise keep everything.

    Returns:
        None. The data is written to disk (see ``get_data`` to read it back).
    """
    print('Retrieving/Preparing data for: vocab size = ' + str(vocab_size) + ' review_length = ' + str(review_length) + ' num_reviews = ' + str(num_reviews))
    output_dir = os.path.join(DATA_ROOT, 'processed_data')
    file_name = os.path.join(output_dir, str(vocab_size) + '_' + str(review_length) + '_' + str(num_reviews) + '.pkl')
    if os.path.isfile(file_name):
        # Already prepared for this configuration.
        return

    review_data = load_data('yelp_review.csv')

    # Only the review text and the star rating are used; .values replaces
    # .as_matrix(), which was removed in pandas 1.0.
    x = review_data['text'].values
    # One-hot encode the ratings. The explicit uint8 cast keeps the dtype
    # stable across pandas versions (newer releases return booleans).
    y = pd.get_dummies(review_data['stars']).values.astype(np.uint8)

    # We want our tokenizer on all of the data, so the text must be passed in
    # for the case where no tokenizer has been cached yet.
    tokenizer = get_tokenizer(vocab_size, train_text=x)

    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.3, train_size=0.7, random_state=SEED)

    # The data is truncated after the split so we get a stratified sample
    if num_reviews:
        print('Truncating data...')
        x_train = x_train[0:num_reviews]
        x_test = x_test[0:num_reviews]
        y_train = y_train[0:num_reviews]
        y_test = y_test[0:num_reviews]

    # Encode each review as a sequence of word indices of fixed length.
    x_train = tokenizer.texts_to_sequences(x_train)
    x_train = pad_sequences(x_train, maxlen=review_length)

    # Encode the test reviews with the same tokenizer and length.
    x_test = tokenizer.texts_to_sequences(x_test)
    x_test = pad_sequences(x_test, maxlen=review_length)

    # The cache directory may not exist on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    with open(file_name, 'wb') as file:
        pickle.dump([x_train, x_test, y_train, y_test], file)
    return

In [6]:
def get_data(vocab_size=100, review_length=250, num_reviews=None):
    """Return the prepared ``(x_train, x_test, y_train, y_test)`` for one configuration.

    Calls ``prep_data`` with the same arguments first, then reads the pickle
    it maintains under ``<DATA_ROOT>/processed_data``.
    """
    prep_data(vocab_size=vocab_size, review_length=review_length, num_reviews=num_reviews)

    # Same naming scheme prep_data writes to: <vocab>_<length>_<num_reviews>.pkl
    cache_name = '_'.join(str(part) for part in (vocab_size, review_length, num_reviews)) + '.pkl'
    cache_path = os.path.join(DATA_ROOT, 'processed_data', cache_name)

    with open(cache_path, 'rb') as handle:
        splits = pickle.load(handle)

    x_train, x_test, y_train, y_test = splits
    return x_train, x_test, y_train, y_test

In [7]:
from joblib import Parallel, delayed

review_lengths = [100, 250, 500]
vocab_sizes = [50, 100, 150, 200, 250, 500]

# Pre-compute the processed splits for every (vocab_size, review_length) pair
# using 4 worker processes, so later cells find them already cached.
# NOTE(review): the training cell below iterates review lengths [150, 250, 500],
# so length 150 is not covered here and 100 is never trained on - confirm which
# list is intended.
# NOTE(review): jobs sharing a vocab_size (and all jobs, for the CSV pickle)
# may build and write the same cache file concurrently on a cold cache.
prep_jobs = [
    delayed(prep_data)(vocab_size=vocab_size, review_length=review_length, num_reviews=25000)
    for vocab_size in vocab_sizes
    for review_length in review_lengths
]

_ = Parallel(n_jobs=4)(prep_jobs)

Retrieving/Preparing data for: vocab size = 50 review_length = 500 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 50 review_length = 250 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 50 review_length = 100 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 100 review_length = 100 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 100 review_length = 250 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 100 review_length = 500 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 150 review_length = 250 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 150 review_length = 100 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 150 review_length = 500 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 200 review_length = 100 num_reviews = 25000
Retrieving/Preparing data for: vocab size = 200 review_length = 250 num_reviews = 25000
Retrieving/Preparing data for: voca

In [8]:
def mean_star_diff(y_true, y_pred):
    """Keras metric: mean absolute distance between true and predicted class index.

    Rationale: plain accuracy scores every wrong prediction the same, whereas
    this metric measures how far off the prediction is. The index of the
    largest entry is taken along the last axis of ``y_true`` and ``y_pred``,
    and the absolute index difference is averaged over the batch.
    NOTE(review): the index difference equals the difference in stars only if
    the one-hot columns are consecutive star values in ascending order -
    confirm against the label encoding.

    Args:
        y_true: One-hot encoded true labels.
        y_pred: Predicted class scores, same shape as ``y_true``.

    Returns:
        A scalar float tensor.
    """
    index_diff = K.abs(K.argmax(y_true, axis=-1) - K.argmax(y_pred, axis=-1))
    # argmax yields integer tensors; averaging integers uses integer division
    # and truncates the result, so cast to float before taking the mean.
    return K.mean(K.cast(index_diff, K.floatx()))

In [9]:
def basic_lstm_model(embedding_vector_length=32, dropout_rate=0.2, vocab_size=500, review_length=250,
                     lstm_units=100, num_classes=5):
    """Build and compile an Embedding -> Dropout -> LSTM -> Dropout -> softmax classifier.

    Args:
        embedding_vector_length: Output dimension of the embedding layer.
        dropout_rate: Rate used by both Dropout layers.
        vocab_size: Input dimension of the embedding layer.
        review_length: ``input_length`` of the embedding layer, i.e. the length
            of each input sequence.
        lstm_units: Number of units in the LSTM layer (previously fixed at 100).
        num_classes: Size of the softmax output layer (previously fixed at 5).

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss,
        the Adam optimizer, and the ``accuracy`` and ``mean_star_diff`` metrics.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vector_length, input_length=review_length))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', mean_star_diff])
    return model

In [None]:
vocab_sizes = [50, 100, 150, 200, 250, 500]
review_lengths = [150, 250, 500]
num_epochs = 5

# Train one LSTM per (vocab_size, review_length) pair. A saved model file on
# disk marks a configuration as done, so re-running the cell skips it.
for vocab_size in vocab_sizes:
    for review_length in review_lengths:
        file_name = os.path.join(DATA_ROOT, str(vocab_size) + '.' + str(review_length) + '.' + str(num_epochs) + 'epochs.hd5')

        if not os.path.isfile(file_name):
            x_train, x_test, y_train, y_test = get_data(vocab_size=vocab_size, review_length=review_length, num_reviews=25000)

            # Release the global Keras/TensorFlow state left by the previous
            # model; otherwise it accumulates across the configurations built
            # in this loop. Clearing before building (rather than after
            # saving) keeps the most recently trained model usable afterwards.
            K.clear_session()

            model = basic_lstm_model(vocab_size=vocab_size, review_length=review_length)
            model.fit(x_train, y_train, epochs=num_epochs, batch_size=64)
            model.save(file_name)

Retrieving/Preparing data for: vocab size = 50 review_length = 150 num_reviews = 25000
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Retrieving/Preparing data for: vocab size = 50 review_length = 250 num_reviews = 25000
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Retrieving/Preparing data for: vocab size = 50 review_length = 500 num_reviews = 25000
Epoch 1/5
Epoch 2/5