### TODO List
1. Move the data loading functions to another file that we can import
1. The data is sorted, so it needs to be shuffled

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import gzip
import simplejson

In [39]:
# Map each short field name (the part after the first '/') to the full
# "<group>/<field>" name used in the raw review records.
keys = {
    full_name.split('/', 1)[-1]: full_name
    for full_name in (
        "product/productId",
        "review/userId",
        "review/profileName",
        "review/helpfulness",
        "review/score",
        "review/time",
        "review/summary",
        "review/text",
    )
}


def parse(filename, encoding="latin-1"):
    """Lazily yield review records from a gzip-compressed text file.

    Each non-blank line is expected to look like ``"<field>: <value>"``; any
    line without a colon (normally a blank line) ends the current record.

    Args:
        filename: path to the ``.gz`` file.
        encoding: codec used to decode each line. The default, latin-1, maps
            every byte to a character and therefore never fails to decode.
            NOTE(review): switch to "utf-8" if the data is known to be UTF-8.

    Yields:
        dict mapping field name (e.g. ``"review/userId"``) to its string
        value, one dict per record. Empty records (consecutive or trailing
        separator lines) are skipped.
    """
    # The context manager closes the file when the generator is exhausted,
    # closed, or garbage-collected.
    with gzip.open(filename, 'rb') as f:
        entry = {}
        for raw_line in f:
            # Decode the bytes instead of slicing their repr ("b'...'"),
            # which mangled quotes and non-ASCII characters with escapes.
            line = raw_line.strip().decode(encoding)
            colon_pos = line.find(':')
            if colon_pos == -1:
                if entry:
                    yield entry
                entry = {}
                continue
            # Skip the colon and the single separator character after it.
            entry[line[:colon_pos]] = line[colon_pos + 2:]
        if entry:
            yield entry


def get_n_reviews(filename, num_reviews):
    """Return the first ``num_reviews`` records parsed from ``filename``.

    Args:
        filename: path to the gzip-compressed review file, passed to
            ``parse``.
        num_reviews: maximum number of records to read.

    Returns:
        list of records in file order. If the file holds fewer than
        ``num_reviews`` records, every available record is returned instead
        of leaking a ``StopIteration`` to the caller.
    """
    gen = parse(filename)
    try:
        # ``range`` comes first so zip stops as soon as the count is reached,
        # without pulling an extra record from the generator.
        return [review for _, review in zip(range(num_reviews), gen)]
    finally:
        # Finalize the generator now rather than whenever it is collected.
        gen.close()


In [114]:
class WideAndDeep(object):
    """Index review records and build the inputs for a wide-and-deep model.

    Construction assigns a dense integer index to every distinct user id,
    product id and profile name, fills a user x product score matrix, and
    groups the review texts by user and by product.
    """

    def __init__(self, data, test_fraction):
        """Build the lookup tables and the score matrix from ``data``.

        Args:
            data: sequence of review records; each record is a mapping keyed
                by the full field names stored in the module-level ``keys``
                dict (e.g. ``"review/userId"``).
            test_fraction: fraction of the records held out from training;
                ``int(len(data) * (1 - test_fraction))`` row positions are
                sampled for ``self.train_indices``.
        """
        train_fraction = 1 - test_fraction
        self.data_size = len(data)
        # Row positions of the training split, sampled without replacement.
        self.train_indices = np.random.choice(self.data_size,
                                              int(self.data_size * train_fraction),
                                              replace=False)
        df = pd.DataFrame(data)

        user_col = keys["userId"]
        product_col = keys["productId"]
        profile_col = keys["profileName"]
        score_col = keys["score"]
        text_col = keys["text"]

        # convert user ids to integers
        self.user_ids = df[user_col].unique()
        self.numbers_to_user_ids, self.user_ids_to_numbers = \
            self._index_values(self.user_ids)
        self.num_user_ids = len(self.user_ids_to_numbers)

        # convert product ids to integers
        self.product_ids = df[product_col].unique()
        self.numbers_to_product_ids, self.product_ids_to_numbers = \
            self._index_values(self.product_ids)
        self.num_product_ids = len(self.product_ids_to_numbers)

        # convert profileName to integer (not running any embedding on
        # profile name, only on the review)
        self.profile_names = df[profile_col].unique()
        self.numbers_to_profile_names, self.profile_names_to_numbers = \
            self._index_values(self.profile_names)
        self.num_profile_names = len(self.profile_names_to_numbers)

        # user-review matrix: rows are user indices, columns are product
        # indices, 0 means "no review".
        self.user_score = np.zeros((self.num_user_ids, self.num_product_ids))
        # Review texts grouped by raw user id / raw product id; this might be
        # useful later when we implement the text-analysis version.
        self.user_text = {}
        self.product_text = {}
        for _, review in df.iterrows():
            user_id = review[user_col]
            product_id = review[product_col]
            # Index the row by the *user's* number, not by the review's
            # position in the frame. A repeated (user, product) pair keeps
            # the score of the last review seen.
            self.user_score[self.user_ids_to_numbers[user_id],
                            self.product_ids_to_numbers[product_id]] = \
                float(review[score_col])

            text = review[text_col]
            self.user_text.setdefault(user_id, []).append(text)
            self.product_text.setdefault(product_id, []).append(text)

    @staticmethod
    def _index_values(values):
        """Return ``(number -> value, value -> number)`` dicts for ``values``."""
        numbers_to_values = dict(enumerate(values))
        values_to_numbers = {value: number
                             for number, value in numbers_to_values.items()}
        return numbers_to_values, values_to_numbers

    def define_feature_columns(self):
        """Create the TensorFlow feature columns for the review fields.

        Returns:
            list ``[helpfulness, user_id, product_id, profile_name]`` of
            feature columns.
        """
        # NOTE(review): the identity columns presumably expect integer ids in
        # [0, num_buckets), i.e. values already mapped through the
        # ``*_to_numbers`` dicts rather than the raw strings - confirm before
        # feeding data.
        helpfulness = tf.feature_column.numeric_column("review/helpfulness")
        user_id = tf.feature_column.categorical_column_with_identity(
            "review/userId", num_buckets=self.num_user_ids)
        product_id = tf.feature_column.categorical_column_with_identity(
            "product/productId", num_buckets=self.num_product_ids)
        profile_name = tf.feature_column.categorical_column_with_identity(
            "review/profileName", num_buckets=self.num_profile_names)
        return [helpfulness, user_id, product_id, profile_name]

In [115]:

# Index the first 100 reviews of the movies dataset, with a 0.1 test fraction.
wad = WideAndDeep(
    data=get_n_reviews(filename="data/movies.txt.gz", num_reviews=100),
    test_fraction=0.1,
)

[[ 3.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 3.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 5.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 3.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 3.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 2.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  4.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  4.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  5.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  5.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  5.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  