In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import gzip
import simplejson
import json
import pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import np_utils


Using TensorFlow backend.


In [28]:
def get_raw_input_data(path_to_data='data/reviews_Amazon_Instant_Video_5.json', save=False):
    print('Aggregating all the review text')
    rawData = []
    with open(path_to_data,'r') as f:
        for i in f:
            line = f.readline()
            lineObj = json.loads(line)
            user = lineObj['reviewerID']
            movie = lineObj['asin']
            rating = lineObj['overall']
            review = lineObj['reviewText']
            rawInputDataObj = {'user':user, 'movieID':movie, 'rating':rating, 'review':review}
            rawData.append(rawInputDataObj)
            
    if save:
        pickle.dump((rawData), open('raw_input_data.pkl','wb'))
    return rawData

def get_model_data(inputData):
    users = {}
    movies = {}
    users_and_reviews = {}
    movies_and_reviews = {}
    for item in inputData:
        user = item['user']
        movie = item['movieID']
        review = item['review']
        users.setdefault(user, []).append(movie)
        movies.setdefault(movie, []).append(user)
        users_and_reviews.setdefault(user, []).append(review)
        movies_and_reviews.setdefault(movie, []).append(review)
    return (users, movies, users_and_reviews,movies_and_reviews)

def aggregate_all_reviews(path_to_data='data/reviews_Amazon_Instant_Video_5.json', save=False):
    print('Aggregating all the review text')
    rawReviewData = []
    with open(path_to_data,'r') as f:
        for i in f:
            line = f.readline()
            lineObj = json.loads(line)
            review = lineObj['reviewText']
            rawReviewData.append(review)
            
    if save:
        pickle.dump((rawReviewData), open('agg_review_data.pkl','wb'))
    return rawReviewData

def build_vocab(agg_text, word_len=250, vocab_size=25000):
    #length of vocab, Tokenizer will only use vocab_len most common words
    print("Length of vocabulary: ", vocab_size)

    #we tokenize the texts and convert all the words to tokens
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(agg_text)
    print("Fitting and returning tokenizer object")
    return tokenizer

def tokenize_reviews(data, tokenizer, word_len=250):
    #clip the sentence length to first (word_len) words.
    print("Max word length: ", word_len)
    # Default text_to_sq removes punc,lowercases, + splits" "
    token_data = tokenizer.texts_to_sequences(data) #No filters, lowercase, not sure which is better
    
    #Ensure all reviews have the same length, we pad the smaller reviews with 0, 
    #and cut the larger reviews to a max length 
    #(we clip from the top, as the end of the reviews generally have a conclusion which provides better features)
    print("Padding sequences and returning sequence")
    return sequence.pad_sequences(token_data, maxlen=word_len)

In [48]:
path_to_data = 'data/reviews_Amazon_Instant_Video_5.json'
review_data = aggregate_all_reviews(path_to_data)
tokenizer = build_vocab(review_data)
tokenized_reviews = tokenize_reviews(review_data, tokenizer, word_len=250)
#print("tokenized_reviews:\n", tokenized_reviews)

Aggregating all the review text
Length of vocabulary:  25000
Fitting and returning tokenizer object
Max word length:  250
Padding sequences and returning sequence


In [27]:
sample_review = review_data[0]
print("Sample review:\n", sample_review)
tokens = tokenize_reviews(sample_review, tokenizer, word_len=250)
print("\nPadded tokenized review:\n", tokens)

Sample review:
 I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.
Max word length:  250
Padding sequences and returning sequence

Padded tokenized review:
 [[   0    0    0 ...,    0    0    7]
 [   0    0    0 ...,    0    0    0]
 [   0    0    0 ...,    0    0 1593]
 ..., 
 [   0    0    0 ...,    0    0 1673]
 [   0    0    0 ...,    0    0  852]
 [   0    0    0 ...,    0    0    0]]


In [32]:
"""
Returns aggregated raw data where each entry in list is dictionary of a unique (user, review, rating, movie)
So user can show up multiple times in list but not for same review/movie
"""
raw_data = get_raw_input_data()
raw_data[0:2]

Aggregating all the review text


[{'movieID': 'B000H00VBQ',
  'rating': 5.0,
  'review': 'I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.',
  'user': 'A3BC8O2KCL29V2'},
 {'movieID': 'B000H00VBQ',
  'rating': 4.0,
  'review': 'Mysteries are interesting.  The tension between Robson and the tall blond is good but not always believable.  She often seemed uncomfortable.',
  'user': 'A1RJPIGRSNX4PW'}]

In [37]:
"""
Returns lists of dictionaries 
user_data: users and list of moviesIDs they reviewed
movie_data: movies with userIDs of users who have rated them
user_to_reviews: (*important*) users with their reviews
movie_to_reviews: (*important*) movies with their reviews
"""
user_data, movie_data, user_to_reviews, movie_to_reviews = get_model_data(raw_data)

In [47]:
"""
If you run the entire thing you may/will get IO warning (requires alot of memory to print)
Here's one example of a user's reviews before Encoding!
"""
print(user_to_reviews['A3BC8O2KCL29V2']) #A3BC8O2KCL29V2 #A1RJPIGRSNX4PW
print()
# Example of a movie's reviews
print(movie_to_reviews['B000HKWE3O'])

['I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.', "I watched this film on Saturday and it is now Wednesday and I am stilling thinking about it. This is an Australian film, so the pace of the story is somewhat different...the crescendos are quieter but the sentinel events are clear. The film explores the boundaries of  a friendship and what liberties are accepted and why. Both Naomi Watts and Robin Wright give performances which ring true and are compelling.  The young male leads are also quite strong. It's good and surprising story telling.", "I was not sure what I was getting but this film kept me watching. I was curious about. Each character's history and and motivation. The acting was good and there were familiar faces sprinkled throughout. I continue to be curious web out what comes next. All in all, I think this is a well written and well act