In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import gzip
import simplejson
import json
import pickle
import os


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import np_utils


Using TensorFlow backend.


In [177]:
def get_raw_input_data(path_to_data='data/reviews_Amazon_Instant_Video_5.json', save=False):
    print('Aggregating all the review text')
    rawData = []
    with open(path_to_data,'r') as f:
        for i in f:
            line = f.readline()
            lineObj = json.loads(line)
            user = lineObj['reviewerID']
            movie = lineObj['asin']
            rating = lineObj['overall']
            review = lineObj['reviewText']
            rawInputDataObj = {'user':user, 'movieID':movie, 'rating':rating, 'review':review}
            rawData.append(rawInputDataObj)
            
    if save:
        pickle.dump((rawData), open('raw_input_data.pkl','wb'))
    return rawData

# I think this is what Rich wants but not sure...
def get_model_data(raw_data):
    users = {}
    movies = {}
    users_and_reviews = {}
    movies_and_reviews = {}
    for item in raw_data:
        user = item['user']
        movie = item['movieID']
        review = item['review']
        users.setdefault(user, []).append(movie)
        movies.setdefault(movie, []).append(user)
        users_and_reviews.setdefault(user, []).append(review)
        movies_and_reviews.setdefault(movie, []).append(review)
    return (users, movies, users_and_reviews,movies_and_reviews)

def get_vect_review_data(tokenizer, word_len=250, path_to_data='data/reviews_Amazon_Instant_Video_5.json', save=False):
    print('Constructing vectorized model input data')
    vecUserReviews = []
    vecMovieReviews = []
    ratingsData = []
    with open(path_to_data,'r') as f:
        for i in f:
            line = f.readline()
            lineObj = json.loads(line)
            user = lineObj['reviewerID']
            movie = lineObj['asin']
            rating = lineObj['overall']
            review = lineObj['reviewText']
            # Tokenize the review then add to dict1 user|review, dict2 movie|review
            vectReview = tokenize_reviews(review, tokenizer, word_len)
            user_and_vectReview = {'user':user, 'review':vectReview}
            movie_and_vectReview = {'movieID':movie, 'review':vectReview}
            # Aggregate user|review dicts, movie|review dicts, ratings into seperate lists
            vecUserReviews.append(user_and_vectReview)
            vecMovieReviews.append(movie_and_vectReview)
            ratingsData.append(rating)      
    if save:
        pickle.dump((vecUserReviews), open('vec_user_rev_data.pkl','wb'))
        pickle.dump((vecMovieReviews), open('vec_movie_rev_data.pkl','wb'))
        pickle.dump((ratingsData), open('ratings_data.pkl','wb'))
    print('Returning vectorized user|review data, movie|review data, and ratings data')
    return vecUserReviews, vecMovieReviews, ratingsData
    

def aggregate_all_reviews(path_to_data='data/reviews_Amazon_Instant_Video_5.json', save=False):
    print('Aggregating all the review text')
    rawReviewData = []
    with open(path_to_data,'r') as f:
        for i in f:
            line = f.readline()
            lineObj = json.loads(line)
            review = lineObj['reviewText']
            rawReviewData.append(review)
            
    if save:
        pickle.dump((rawReviewData), open('agg_review_data.pkl','wb'))
    return rawReviewData

def build_vocab(agg_text, word_len=250, vocab_size=25000):
    #length of vocab, Tokenizer will only use vocab_len most common words
    print("Length of vocabulary: ", vocab_size)

    #we tokenize the texts and convert all the words to tokens
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(agg_text)
    print("Fitting and returning tokenizer object")
    return tokenizer

def tokenize_reviews(text, token, word_len=250):
    #clip the sentence length to first (word_len) words.
    # print("Max word length: ", word_len)
    # Default text_to_sq removes punc,lowercases, + splits" "
    token_data = token.texts_to_sequences(text) #No filters, lowercase, not sure which is better
    
    #Ensure all reviews have the same length, we pad the smaller reviews with 0, 
    #print("Padding sequences and returning sequence")
    return sequence.pad_sequences(token_data, maxlen=word_len)



In [55]:
path_to_data = 'data/reviews_Amazon_Instant_Video_5.json'
review_data = aggregate_all_reviews(path_to_data)
token = build_vocab(review_data)
tokenized_reviews = tokenize_reviews(review_data, token, word_len=250)
#print("tokenized_reviews:\n", tokenized_reviews)

Aggregating all the review text
Length of vocabulary:  25000
Fitting and returning tokenizer object
18563


In [365]:
sample_review = review_data[0:2]
print("Sample review:\n", sample_review)
tokens = tokenize_reviews(sample_review, tokenizer, word_len=250)
print("\nPadded tokenized review:\n", tokens)
len(tokens[0][:])

Sample review:
 ['I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.', 'Mysteries are interesting.  The tension between Robson and the tall blond is good but not always believable.  She often seemed uncomfortable.']

Padded tokenized review:
 [[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0    

250

In [72]:
"""
Returns aggregated raw data where each entry in list is dictionary of a unique (user, review, rating, movie)
So user can show up multiple times in list but not for same review/movie
"""
raw_data = get_raw_input_data()
raw_data[0:2]

Aggregating all the review text


[{'movieID': 'B000H00VBQ',
  'rating': 5.0,
  'review': 'I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.',
  'user': 'A3BC8O2KCL29V2'},
 {'movieID': 'B000H00VBQ',
  'rating': 4.0,
  'review': 'Mysteries are interesting.  The tension between Robson and the tall blond is good but not always believable.  She often seemed uncomfortable.',
  'user': 'A1RJPIGRSNX4PW'}]

In [73]:
"""
Returns lists of dictionaries 
user_data: users and list of moviesIDs they reviewed
movie_data: movies with userIDs of users who have rated them
user_to_reviews: (*important*) users with their reviews
movie_to_reviews: (*important*) movies with their reviews
"""
user_data, movie_data, user_to_reviews, movie_to_reviews = get_model_data(raw_data)

In [47]:
"""
If you run the entire thing you may/will get IO warning (requires alot of memory to print)
Here's one example of a user's reviews before Encoding!
"""
print(user_to_reviews['A3BC8O2KCL29V2']) #A3BC8O2KCL29V2 #A1RJPIGRSNX4PW
print()
# Example of a movie's reviews
print(movie_to_reviews['B000HKWE3O'])

['I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.', "I watched this film on Saturday and it is now Wednesday and I am stilling thinking about it. This is an Australian film, so the pace of the story is somewhat different...the crescendos are quieter but the sentinel events are clear. The film explores the boundaries of  a friendship and what liberties are accepted and why. Both Naomi Watts and Robin Wright give performances which ring true and are compelling.  The young male leads are also quite strong. It's good and surprising story telling.", "I was not sure what I was getting but this film kept me watching. I was curious about. Each character's history and and motivation. The acting was good and there were familiar faces sprinkled throughout. I continue to be curious web out what comes next. All in all, I think this is a well written and well act

In [61]:
path_to_data = 'data/reviews_Amazon_Instant_Video_5.json'
word_len = 250
# tokenizer - returned above
vecUserReviews, vecMovieReviews, ratingsData = get_vect_review_data(token, word_len, path_to_data, save=False)

Constructing vectorized model input data
Returning vectorized user|review data, movie|review data, and ratings data


In [393]:
def get_raw_reviews(filePath):
    reviews = []
    with open(filePath,'r') as f:
        for i in f:
            line = f.readline()
            lineObj = json.loads(line)
            review = lineObj['reviewText']
            reviews.append(review) 
    return reviews

def get_ratings(filePath):
    ratings = []
    with open(filePath,'r') as f:
        for i in f:
            line = f.readline()
            lineObj = json.loads(line)
            rating = lineObj['overall']
            ratings.append(rating) 
    return pd.DataFrame(ratings)

def get_test_train_split(data, filePath, frac_tect=0.1):
    ratings_data = get_ratings(filePath)
    num_reviews = len(data)
    assert(num_reviews == len(ratings_data))
    
    test_indices = np.random.choice(num_reviews,
                                    size=int(num_reviews * frac_test), 
                                    replace=False)
    # Split raw data into train/test
    x_test = data.iloc[test_indices, :]
    y_test = ratings.iloc[test_indices, :]
    x_train = data.drop(test_indices)
    y_train = ratings.drop(test_indices)
    # Review dimensions
    print("x_train samples:", len(x_train))
    print("x_test samples:", len(x_test))
    print("y_train samples:", len(y_train))
    print("y_test samples:", len(y_test))
    return X_train, y_train, X_test, y_test  

def get_embedding_weights(tokenizer, embedding_dim=100, fileName='data/glove.6B/glove.6B.100d.txt'):
    embedding_map = {}
    BASE_DIR = ''
    GLOVE_DIR = os.path.join(BASE_DIR, 'data/glove.6B')
    glove = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
    
    print('constructing embedding dictionary...pls wait ~2min')
    #dimension of Glove 300 Embeddings
    EMBEDDING_DIM = embedding_dim

    word_index = tokenizer.word_index
    print('Found %s unique tokens' % len(word_index))

    #load glove embeddings
    embedding_map = {}
    for line in glove:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embedding_map[word] = embedding
    glove.close()
    print('GloVe Word embeddings:', len(embedding_map))

    # nb_words contains the total length of vocab
    nb_words = len(word_index) + 1

    #get glove embeddings for each word in tokenizer.
    #word_embedding_matrix holds the embeddings dictionary
    word_embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

    for word, i in word_index.items():
        embedding_vector = embedding_map.get(word)
        if embedding_vector is not None:
            word_embedding_matrix[i] = embedding_vector

    #total words in the tokenizer not in Embedding matrix
    print('Null words in GloVe embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
    return word_embedding_matrix
    

In [408]:
filePath = "data/reviews_Amazon_Instant_Video_5.json"

raw_reviews = get_raw_reviews(filePath)
ratings = get_ratings(filePath)

tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(raw_reviews)
train_seq = tokenizer.texts_to_sequences(raw_reviews)

X_data = tokenizer.texts_to_sequences(raw_reviews)
X_data = sequence.pad_sequences(X_data, maxlen = 250)
X_data = pd.DataFrame(X_data)

# Get test/ train split 
frac_test = 0.1
test_indices = np.random.choice(num_reviews,
                                    size=int(num_reviews * frac_test), 
                                    replace=False)
# Split raw data into train/test
X_test = X_data.iloc[test_indices, :]
y_test = ratings.iloc[test_indices, :]
X_train = X_data.drop(test_indices)
y_train = ratings.drop(test_indices)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


X_train shape: (16707, 250)
y_train shape: (16707, 1)
X_test shape: (1856, 250)
y_test shape: (1856, 1)


In [402]:
""" This may take a few. The greater embedding dim the longer. Preset to GloVe 100d"""
embedding_matrix = get_embedding_map(tokenizer)

constructing embedding dictionary...pls wait ~2min
Found 43445 unique tokens
GloVe Word embeddings: 400000
Null words in GloVe embeddings: 9032


In [419]:
model = Sequential()
model.add(Embedding(input_dim=embedding_matrix.shape[0],
          output_dim=100,
          weights=[embedding_matrix],
          input_length=250,
          trainable=False))
model.add(LSTM(100))
model.add(Dense(5, activation='sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 250, 100)          4344600   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 505       
Total params: 4,425,505
Trainable params: 80,905
Non-trainable params: 4,344,600
_________________________________________________________________
None


In [424]:
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=4)

Train...


UnboundLocalError: local variable 'arrays' referenced before assignment

In [414]:
print('Build model...')

## implement model here
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Reshape, Merge, BatchNormalization, TimeDistributed, Lambda, Activation, LSTM, Flatten, Convolution1D,Conv1D, GRU, MaxPooling1D


model = Sequential()
model.add(Embedding(input_dim=embedding_matrix.shape[0],
          output_dim=100,
          weights=[embedding_matrix],
          input_length=250,
          trainable=False))
     
model.add(Conv1D(64, 3))
model.add(Dropout(0.2))
model.add(Activation('relu'))

model.add(Conv1D(32, 3))
model.add(Activation('relu'))

model.add(Flatten())

model.add(Dense(128))
model.add(Activation('relu'))

model.add(Dense(5))
model.add(Activation('softmax'))


model.compile(loss='mean_squared_error',
              optimizer='Adam',
              metrics=['accuracy'])

model.summary()

print("Model Built")

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 250, 100)          4344600   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 248, 128)          38528     
_________________________________________________________________
dropout_5 (Dropout)          (None, 248, 128)          0         
_________________________________________________________________
activation_6 (Activation)    (None, 248, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 246, 64)           24640     
_________________________________________________________________
dropout_6 (Dropout)          (None, 246, 64)           0         
_________________________________________________________________
activation_7 (Activation)    (None, 246, 64)           0     

In [415]:
print('Train...')
model.fit(X_train, y_train,
          batch_size=64,
          epochs=6)

Train...


UnboundLocalError: local variable 'arrays' referenced before assignment