In [3]:
import numpy as np
import pandas as pd
import os
import csv
from datetime import datetime
from sklearn.model_selection import train_test_split
import pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.core import Activation
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, RNN
from keras.layers.embeddings import Embedding

In [4]:
# Notebook-wide settings, kept together so they are easy to tune.

# Training hyper-parameters passed to every model.fit call below.
BATCH_SIZE = 64
NUM_EPOCHS = 5

# random_state used for the train/test split.
SEED = 1000

# Directory holding the raw CSV and every pickle cache written by the helpers.
DATA_ROOT = "data"

In [5]:
def load_data(file_name):
    """Load a CSV from DATA_ROOT as a DataFrame, caching it as a pickle.

    The first call for a given ``file_name`` parses the CSV and writes
    ``<file_name>.pkl`` alongside it; later calls read that pickle instead
    of re-parsing the CSV.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``DATA_ROOT`` (e.g. ``'yelp_review.csv'``).

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV.
    """
    pickle_name = os.path.join(DATA_ROOT, file_name + '.pkl')
    time_start = datetime.now()
    if os.path.isfile(pickle_name):
        print('loading from pickle...')
        # NOTE: unpickling can execute arbitrary code; only keep trusted files in DATA_ROOT.
        review_data = pd.read_pickle(pickle_name)
    else:
        print('loading from csv...')
        review_data = pd.read_csv(os.path.join(DATA_ROOT, file_name))
        review_data.to_pickle(pickle_name)
    # str(timedelta) renders as H:MM:SS.ffffff, so convert explicitly to
    # match the "seconds" label in the message.
    elapsed_seconds = (datetime.now() - time_start).total_seconds()
    print('Loaded in {:.2f} seconds'.format(elapsed_seconds))
    return review_data

In [None]:
# Load the full Yelp review table (read from the pickle cache when one exists).
review_data = load_data('yelp_review.csv')

In [None]:
# (rows, columns) of the loaded review table.
review_data.shape

In [None]:
# review_data.drop(['review_id', 'user_id', 'business_id', 'date', 'useful', 'funny', 'cool'], axis=1, inplace=True)

In [None]:
# x = review_data['text'].as_matrix()
# y = pd.get_dummies(review_data['stars']).as_matrix()

In [None]:
# x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.3, train_size=0.7, random_state=SEED)

In [6]:
def get_tokenizer(vocab_size, train_text=None):
    """Return a Keras ``Tokenizer`` capped at ``vocab_size`` words, cached on disk.

    A tokenizer pickled earlier as ``DATA_ROOT/tokenizer_<vocab_size>.pkl`` is
    reused when present; otherwise a new one is fitted on ``train_text`` and
    pickled for next time.

    Note that the cache is keyed on ``vocab_size`` only: once a tokenizer has
    been saved for a given size, ``train_text`` is ignored on later calls.

    Parameters
    ----------
    vocab_size : int
        Passed to ``Tokenizer(num_words=...)``.
    train_text : iterable of str, optional
        Texts to fit on. Required only when no cached tokenizer exists.

    Returns
    -------
    keras.preprocessing.text.Tokenizer

    Raises
    ------
    ValueError
        If there is no cached tokenizer and ``train_text`` is ``None``.
    """
    tokenizer_file_name = os.path.join(DATA_ROOT, 'tokenizer_' + str(vocab_size) + '.pkl')
    time_start = datetime.now()
    if os.path.isfile(tokenizer_file_name):
        print('Loading tokenizer...')
        # NOTE: unpickling can execute arbitrary code; only keep trusted files in DATA_ROOT.
        with open(tokenizer_file_name, 'rb') as file:
            tokenizer = pickle.load(file)
    else:
        # Fail with a clear message instead of an obscure error from inside
        # fit_on_texts(None).
        if train_text is None:
            raise ValueError(
                'No cached tokenizer at ' + tokenizer_file_name +
                '; train_text is required to fit a new one.')

        print('Training tokenizer...')
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(train_text)

        with open(tokenizer_file_name, 'wb') as file:
            pickle.dump(tokenizer, file)

    print('Got tokenizer for vocab size: ' + str(vocab_size) + ' in ' + str(datetime.now() - time_start))
    return tokenizer

In [12]:
def get_data(vocab_size=100, review_length=250, num_reviews=None):
    """Return tokenized, padded review text and one-hot star labels, split 70/30.

    Results are cached in
    ``DATA_ROOT/<vocab_size>_<review_length>_<num_reviews>.pkl``; when that file
    exists it is loaded and nothing is recomputed.

    Parameters
    ----------
    vocab_size : int
        Vocabulary size handed to ``get_tokenizer``.
    review_length : int
        ``maxlen`` used when padding/truncating every tokenized review.
    num_reviews : int, optional
        If truthy, keep only the first ``num_reviews`` rows of the train split
        and of the test split. ``None`` (or 0) keeps everything.

    Returns
    -------
    x_train, x_test, y_train, y_test
        ``x_*`` are the padded integer sequences, ``y_*`` the one-hot rows
        produced by ``pd.get_dummies`` on the ``stars`` column.
    """
    file_name = os.path.join(DATA_ROOT, str(vocab_size) + '_' + str(review_length) + '_' + str(num_reviews) + '.pkl')
    if os.path.isfile(file_name):
        # NOTE: unpickling can execute arbitrary code; only keep trusted files in DATA_ROOT.
        with open(file_name, 'rb') as file:
            x_train, x_test, y_train, y_test = pickle.load(file)
    else:
        review_data = load_data('yelp_review.csv')
        # Non-in-place drop: leaves the frame returned by load_data untouched.
        review_data = review_data.drop(
            ['review_id', 'user_id', 'business_id', 'date', 'useful', 'funny', 'cool'], axis=1)

        # .values replaces .as_matrix(), which was deprecated and later
        # removed from pandas; it returns the same NumPy array.
        x = review_data['text'].values
        y = pd.get_dummies(review_data['stars']).values

        # We want our tokenizer on all of the data, so the text must be passed
        # in: without it a first (uncached) run has nothing to fit on.
        tokenizer = get_tokenizer(vocab_size, train_text=x)

        x_train, x_test, y_train, y_test = train_test_split(
            x, y, stratify=y, test_size=0.3, train_size=0.7, random_state=SEED)

        # The data is truncated after the split so we get a stratified sample
        if num_reviews:
            print('Truncating data...')
            x_train = x_train[0:num_reviews]
            x_test = x_test[0:num_reviews]
            y_train = y_train[0:num_reviews]
            y_test = y_test[0:num_reviews]

        # Convert the training text to padded integer sequences
        x_train = tokenizer.texts_to_sequences(x_train)
        x_train = pad_sequences(x_train, maxlen=review_length)

        # Same transformation for the testing text
        x_test = tokenizer.texts_to_sequences(x_test)
        x_test = pad_sequences(x_test, maxlen=review_length)

        with open(file_name, 'wb') as file:
            pickle.dump([x_train, x_test, y_train, y_test], file)

    return x_train, x_test, y_train, y_test

In [10]:
# Candidate preprocessing settings, prepared for reuse.
vocab_sizes = list(range(50, 251, 50)) + [500]  # 50, 100, 150, 200, 250, 500
review_length = list(range(50, 301, 50))        # 50, 100, 150, 200, 250, 300



In [13]:
# Small working set: 50-word vocabulary, default review length, and at most
# 1000 rows in each of the train and test splits.
x_train, x_test, y_train, y_test = get_data(vocab_size=50, num_reviews=1000)

In [None]:
# Shape of the padded training sequences returned by get_data.
x_train.shape

In [14]:
from keras import backend as K

In [15]:
def mean_star_diff(y_true, y_pred):
    """Mean absolute gap, in class positions (stars), between truth and prediction.

    Rationale: accuracy counts every miss the same, but for ordered ratings a
    prediction one star away from the truth is a far smaller error than one
    four stars away. This metric reports how far off the predicted class is
    on average.

    Assumes the one-hot columns are ordered by star value, so that the gap
    between argmax positions equals the gap in stars.

    ``K.argmax`` has no gradient, so this is suitable as a metric but not as
    a training loss.

    Parameters
    ----------
    y_true : tensor
        One-hot encoded true classes, one row per sample.
    y_pred : tensor
        Predicted class scores, same shape as ``y_true``.

    Returns
    -------
    tensor
        Scalar float: the mean of ``|argmax(y_true) - argmax(y_pred)|``.
    """
    star_gap = K.abs(K.argmax(y_true) - K.argmax(y_pred))
    # argmax yields integer tensors; cast before averaging so the mean is not
    # truncated by integer arithmetic in the backend.
    return K.mean(K.cast(star_gap, K.floatx()))
    

In [None]:
# Hand-made one-hot examples for checking mean_star_diff.
# Row i of the 5x5 identity matrix is the one-hot vector for class position i,
# so each list below names the class position of every sample.
y_true = np.eye(5, dtype=int)[[1, 0, 4, 0]].tolist()

y_pred = np.eye(5, dtype=int)[[0, 3, 2, 0]].tolist()

In [None]:
# mean_star_diff builds a backend tensor, so wrap the example lists as
# constants and evaluate the result to print the number rather than the
# tensor object. Per-sample class gaps in the example data are 1, 3, 2, 0.
print(K.eval(mean_star_diff(K.constant(y_true), K.constant(y_pred))))

In [16]:
def basic_lstm_model(embedding_vector_length=32, dropout_rate=0.2, vocab_size=500, review_length=250):
    """Build and compile the baseline Embedding -> LSTM -> softmax classifier.

    Parameters
    ----------
    embedding_vector_length : int
        Output dimension of the embedding layer.
    dropout_rate : float
        Rate for both dropout layers (after the embedding and after the LSTM).
    vocab_size : int
        Input dimension of the embedding layer.
    review_length : int
        Length of each input sequence.

    Returns
    -------
    keras.models.Sequential
        Model compiled with categorical cross-entropy, the Adam optimizer, and
        the ``accuracy`` and ``mean_star_diff`` metrics.
    """
    layer_stack = [
        Embedding(vocab_size, embedding_vector_length, input_length=review_length),
        Dropout(dropout_rate),
        LSTM(100),
        Dropout(dropout_rate),
        # Five outputs, one per class.
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy', mean_star_diff],
    )
    return network

In [17]:
# Train the baseline model on the current split, then score it on the test set.
model = basic_lstm_model(vocab_size=50)
model.fit(x=x_train, y=y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

# evaluate returns [loss, accuracy, mean_star_diff], in compile order.
scores = model.evaluate(x=x_test, y=y_test, verbose=0)
print(scores)
print("Accuracy: %.2f%%" % (100 * scores[1]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[1.3351446018218993, 0.442, 0.84]
Accuracy: 44.20%


In [18]:
# Larger working set: same 50-word vocabulary, up to 10000 rows per split.
x_train, x_test, y_train, y_test = get_data(vocab_size=50, num_reviews=10000)

In [None]:
# Re-train the baseline model on the current (larger) split and score it.
model = basic_lstm_model(vocab_size=50)
model.fit(x=x_train, y=y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

# evaluate returns [loss, accuracy, mean_star_diff], in compile order.
scores = model.evaluate(x=x_test, y=y_test, verbose=0)
print(scores)

Epoch 1/5
Epoch 2/5

In [None]:
def basic_lstm_model_star_loss(embedding_vector_length=32, dropout_rate=0.2, vocab_size=500, review_length=250):
    """Build the Embedding -> LSTM -> softmax classifier trained on a star-distance loss.

    The architecture matches ``basic_lstm_model``; only the loss differs.
    ``mean_star_diff`` itself cannot be the loss because it is built from
    ``K.argmax``, which has no gradient, so Keras cannot train on it. The loss
    used here is a differentiable stand-in: the absolute gap between the true
    class position and the *expected* class position under the predicted
    softmax distribution. ``mean_star_diff`` is still reported as a metric.

    Parameters
    ----------
    embedding_vector_length : int
        Output dimension of the embedding layer.
    dropout_rate : float
        Rate for both dropout layers.
    vocab_size : int
        Input dimension of the embedding layer.
    review_length : int
        Length of each input sequence.

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    num_classes = 5

    def expected_star_diff(y_true, y_pred):
        """Mean |true position - expected predicted position|; differentiable in y_pred."""
        # Class positions 0..4 as floats so they can weight the probabilities.
        positions = K.arange(0, num_classes, dtype=K.floatx())
        true_position = K.sum(y_true * positions, axis=-1)
        expected_position = K.sum(y_pred * positions, axis=-1)
        return K.mean(K.abs(true_position - expected_position))

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vector_length, input_length=review_length))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(100))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss=expected_star_diff, optimizer='adam', metrics=['accuracy', mean_star_diff])
    return model

In [None]:
# Train the star-loss variant on the current split and score it on the test set.
# (The keyword is vocab_size; the misspelled voacb_size raised a TypeError.)
model = basic_lstm_model_star_loss(vocab_size=50)
model.fit(x_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
scores = model.evaluate(x_test, y_test, verbose=0)
print(scores)