#### LSTM 

In [2]:
import os
import string

import numpy as np
import pandas as pd
from utils import load_sparse_csr
from __future__ import print_function

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.datasets import imdb

Using Theano backend.


In [3]:
# --- Experiment settings -----------------------------------------------------
SEED_VAL = 200                     # seed passed to myLSTM for the train/test shuffle
n_words = 10000                    # not read by any active code in the visible notebook
data_subset = "_10Percent"         # suffix appended to the data file names
VALIDATION_DATA_PERCENTAGE = 0.1   # fraction of the rows held out for testing

# --- Data locations, all resolved below the current working directory --------
WORK_DIR = os.getcwd()
(YELP_DATA_CSV_DIR,
 YELP_DATA_WORD_2_VEC_MODEL_DIR,
 YELP_DATA_SPARSE_MATRIX_DIR) = (
    os.path.join(WORK_DIR, "data", leaf_dir)
    for leaf_dir in ("csv", "word2vec_model", "sparse_matrix")
)

In [4]:
# Read the business_review_user CSV of the selected data subset into a DataFrame.
read_filename = os.path.join(
    YELP_DATA_CSV_DIR,
    'business_review_user{}.csv'.format(data_subset),
)
df_data = pd.read_csv(read_filename, encoding='utf-8', engine='c')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
def myLSTM(trainDataVecs, y,SEED_VAL=SEED_VAL):
    '''
    Function to train LSTM and print the accuracy for train and test.
    
    Divides the data in train 90% and test 10%. 
    
    Inputs
    trainDataVecs - Numpy darray matrix
    y - Numpy darray for vector
    SEED_VAL = seed for randomly shuffling the data
    
    Output
    prints the accuracy of trained model on training and testing data
    '''
    # Divide the data in test and train
    np.random.seed = SEED_VAL
    n_samples = len(trainDataVecs)
    sidx = np.random.permutation(n_samples)
    data_set_x = trainDataVecs.tolist()
    b = y
    
    # b[(b == 1) | (b == 2) | (b == 3)] = 0
    # b[(b == 4) | (b == 5)] = 1
    data_set_y = b.tolist()

    n_train = int(np.round(n_samples * (1. - VALIDATION_DATA_PERCENTAGE)))
    valid_set_x = [data_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [data_set_y[s] for s in sidx[n_train:]]
    train_set_x = [data_set_x[s] for s in sidx[:n_train]]
    train_set_y = [data_set_y[s] for s in sidx[:n_train]]

    # def remove_unk(x):
    #     return [[1 if w >= n_words else w for w in sen] for sen in x]

    # train_set_x = remove_unk(train_set_x)
    # valid_set_x = remove_unk(valid_set_x)

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)

    max_features = 100
    maxlen = trainDataVecs.shape[1]  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    X_train, y_train=train[0], train[1]
    X_test, y_test= valid[0], valid[1]

    print("Pad sequences (samples x time)")
    # http://keras.io/preprocessing/sequence/
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen, dtype='float32')
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen, dtype='float32')

    y_train = np.array(y_train, dtype='int32')
    y_test = np.array(y_test, dtype='int32')

    print('Build model...')
    # http://keras.io/objectives/
    # http://keras.io/optimizers/

    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    model.add(LSTM(128))  # try using a GRU instead, for fun
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # try using different optimizers and different optimizer configs
    # model.compile(loss='binary_crossentropy',
    #               optimizer='adam',
    #               class_mode="binary")

    # 'mean_squared_error', binary_crossentropy

    model.compile(loss='mean_squared_error', 
                  optimizer='adam', metrics=["accuracy"])

    print("Train...")
    %time model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3, validation_data=(X_test, y_test))


    score1, accuracy1 = model.evaluate(X_train, y_train,
                                batch_size=batch_size,
                                show_accuracy=True)
    
    print('Train score:', score1)
    print ('Train Accuracy: ', accuracy1)
    
    score2, accuracy2 = model.evaluate(X_test, y_test,
                            batch_size=batch_size,
                            show_accuracy=True)
    
    print('Test score:', score2)
    print ('Test Accuracy: ', accuracy2)
    
    
# Label vector: an int32 copy of the review_stars column, one entry per row of df_data.
y = df_data["review_stars"].values.astype("int32")

# Bag of words

In [9]:
# Load the stored bag-of-words matrix (sparse CSR, .npz file) for the selected subset.
spare_matrix_file = os.path.join(
    YELP_DATA_SPARSE_MATRIX_DIR, "bagWords{}".format(data_subset))
bag_of_words_sparse_matrix = load_sparse_csr("{}.npz".format(spare_matrix_file))

In [None]:
# Densify the sparse bag-of-words matrix, then train and evaluate the LSTM on it.
matrix_bag_of_words = bag_of_words_sparse_matrix.toarray()
myLSTM(trainDataVecs=matrix_bag_of_words, y=y, SEED_VAL=SEED_VAL)

Pad sequences (samples x time)
Build model...
Train...
Train on 197356 samples, validate on 21929 samples
Epoch 1/3
   320/197356 [..............................] - ETA: 17156s - loss: 11.1349 - acc: 0.1094

# Bag of words + Hand-crafted features

In [19]:
# Load the stored "bagWords_feat_add" matrix (sparse CSR, .npz file) for the selected subset.
spare_matrix_file = os.path.join(
    YELP_DATA_SPARSE_MATRIX_DIR, "bagWords_feat_add{}".format(data_subset))
feature_matrix_bag_of_words_and_hand_craft_features = load_sparse_csr(
    "{}.npz".format(spare_matrix_file))

IOError: [Errno 2] No such file or directory: 'D:\\_Active_Projects\\yelp\\yelp\\data\\sparse_matrix\\bagWords_feat_add_10Percent.npz'

In [21]:
# Train and evaluate on the densified matrix; this needs the preceding load
# cell to have succeeded so that the matrix variable exists.
myLSTM(
    trainDataVecs=feature_matrix_bag_of_words_and_hand_craft_features.toarray(),
    y=y,
    SEED_VAL=SEED_VAL,
)

Pad sequences (samples x time)
Build model...
Train...
Train on 2002 samples, validate on 223 samples
Epoch 1/3
 128/2002 [>.............................] - ETA: 3715s - loss: 12.4253 - acc: 0.0938

KeyboardInterrupt: 

# Word Embedding

In [11]:
# Read the comma-separated word2vec feature matrix of the selected subset.
word2vec_feature_matrix_file = os.path.join(
    YELP_DATA_WORD_2_VEC_MODEL_DIR,
    "word2vec_feature_matrix{}.csv".format(data_subset),
)
feature_matrix_word2vec = np.genfromtxt(word2vec_feature_matrix_file, delimiter=',')

In [13]:
# Train and evaluate the LSTM on the word2vec feature matrix.
myLSTM(trainDataVecs=feature_matrix_word2vec, y=y, SEED_VAL=SEED_VAL)

Pad sequences (samples x time)
Build model...
Train...
Train on 2002 samples, validate on 223 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train score: 9.51211987533
Train Accuracy:  0.120879120887
Test score: 9.87009063002
Test Accuracy:  0.112107623051


# Word Embedding + Hand-crafted features

In [14]:
# Read the comma-separated "word2vec_add_feature_matrix" file of the selected subset.
word2vec_feature_matrix_file = os.path.join(
    YELP_DATA_WORD_2_VEC_MODEL_DIR,
    "word2vec_add_feature_matrix{}.csv".format(data_subset),
)
feature_matrix_word2vec_and_hand_craft_features = np.genfromtxt(
    word2vec_feature_matrix_file, delimiter=',')

In [15]:
# Train and evaluate the LSTM on the word2vec + extra-feature matrix.
myLSTM(
    trainDataVecs=feature_matrix_word2vec_and_hand_craft_features,
    y=y,
    SEED_VAL=SEED_VAL,
)

Pad sequences (samples x time)
Build model...
Train...
Train on 2002 samples, validate on 223 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train score: 9.49611577407
Train Accuracy:  0.118381618426
Test score: 10.0135671675
Test Accuracy:  0.134529147314


# Hand-crafted features

In [16]:
# Keep only columns 100..103 of the combined matrix.
# NOTE(review): presumably these are the hand-crafted features appended after
# the word2vec columns - confirm against the code that wrote the CSV.
hand_craft_columns = slice(100, 104)
feature_matrix_hand_craft_features = feature_matrix_word2vec_and_hand_craft_features[:, hand_craft_columns]

In [17]:
# Train and evaluate the LSTM on the four selected feature columns alone.
myLSTM(trainDataVecs=feature_matrix_hand_craft_features, y=y, SEED_VAL=SEED_VAL)

Pad sequences (samples x time)
Build model...
Train...
Train on 2002 samples, validate on 223 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train score: 9.44807190947
Train Accuracy:  0.120879120883


ValueError: I/O operation on closed file