In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import gzip
import simplejson
import json

In [2]:
class DataImporter(object):
    """Load gzipped JSON-lines review data and build user x product rating data.

    Each line of the input file is parsed as one JSON object and is expected to
    carry at least the keys ``reviewerID``, ``asin`` and ``overall`` (those are
    the only columns this class reads).

    Attributes set by ``__init__``:
        data: DataFrame with one row per parsed line.
        num_reviews: number of rows in ``data``.
        reviewers / products: unique ``reviewerID`` / ``asin`` values.
        num_reviewers / num_products: lengths of the two arrays above.
        reviewer_to_num / num_to_reviewer: id <-> row-index lookups.
        product_to_num / num_to_product: id <-> column-index lookups.

    Attributes set by ``create_train_test_split``:
        train_matrix / test_matrix: dense (num_reviewers, num_products) arrays
            holding the ``overall`` value of each review, 0 where there is none.
        train / test: lists of per-rating records, see ``permute_matrix``.
    """

    def __init__(self, path_to_data):
        """Read the gzipped JSON-lines file at ``path_to_data`` and index its ids."""
        # The context manager guarantees the gzip handle is closed, even when
        # a line fails to parse.
        with gzip.open(path_to_data, "rt") as handle:
            raw_data = [json.loads(line) for line in handle]
        self.data = pd.DataFrame(raw_data)
        self.num_reviews = len(self.data)

        # get the counts of users and products
        self.reviewers = self.data["reviewerID"].unique()
        self.num_reviewers = len(self.reviewers)
        self.products = self.data["asin"].unique()
        self.num_products = len(self.products)

        # create int-to-name dictionaries for each
        self.num_to_reviewer = dict(enumerate(self.reviewers))
        self.reviewer_to_num = {name: idx for idx, name in self.num_to_reviewer.items()}
        self.num_to_product = dict(enumerate(self.products))
        self.product_to_num = {name: idx for idx, name in self.num_to_product.items()}

    def create_train_test_split(self, frac_test, seed=None):
        """Randomly hold out a fraction of the reviews and build both data sets.

        Args:
            frac_test: fraction of the reviews to place in the test set;
                ``int(num_reviews * frac_test)`` reviews are sampled without
                replacement.
            seed: optional integer seed for a private ``RandomState``. When
                ``None`` (the default) NumPy's global RNG is used, so
                ``np.random.seed`` still controls the split.

        Sets ``test_matrix``, ``train_matrix``, ``test`` and ``train``.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)

        # get test data indices (positions into self.data)
        test_indices = rng.choice(self.num_reviews,
                                  size=int(self.num_reviews * frac_test),
                                  replace=False)

        # split raw data into train/test; a positional mask keeps the split
        # correct even if self.data does not carry a default RangeIndex
        is_test = np.zeros(self.num_reviews, dtype=bool)
        is_test[test_indices] = True
        raw_test = self.data.iloc[test_indices, :]
        # NOTE(review): only the training rows are filtered with dropna(), and
        # it drops a row when *any* column is missing - confirm this is intended.
        raw_train = self.data.iloc[~is_test, :].dropna()

        # get dimensions of the matrices
        dim = (self.num_reviewers, self.num_products)
        self.test_matrix = self.populate_user_product_review_matrix(dim, raw_test)
        self.train_matrix = self.populate_user_product_review_matrix(dim, raw_train)

        self.test = self.permute_matrix(self.test_matrix)
        self.train = self.permute_matrix(self.train_matrix)

    def permute_matrix(self, mat):
        """Turn a rating matrix into one record per positive rating.

        For every entry ``mat[u, p] > 0`` a dict is produced with:
            "user": copy of row ``u`` with the rated entry set to 0,
            "product": copy of column ``p`` with the rated entry set to 0,
            "score": the rating itself,
            "reviewerID" / "asin": the original ids of ``u`` and ``p``.

        Records are emitted in row-major order. ``mat`` is left untouched:
        the vectors are real copies, because basic NumPy slicing returns views
        and zeroing a view would erase the rating from ``mat`` and from every
        other record sharing that row or column. The price is one row plus one
        column of memory per rating.
        """
        mat = np.asarray(mat)
        perm = []
        for u_idx, prod_idx in np.argwhere(mat > 0.0):
            # plain ints so the lookups below behave exactly like dict keys
            u_idx, prod_idx = int(u_idx), int(prod_idx)
            user_cpy = mat[u_idx].copy()
            score = user_cpy[prod_idx]
            user_cpy[prod_idx] = 0
            prod_cpy = mat[:, prod_idx].copy()
            prod_cpy[u_idx] = 0
            perm.append({"user": user_cpy,
                         "reviewerID": self.num_to_reviewer[u_idx],
                         "product": prod_cpy,
                         "score": score,
                         "asin": self.num_to_product[prod_idx]
                         })
        return perm

    def populate_user_product_review_matrix(self, dimensions, dataset):
        """Return a dense zero matrix of shape ``dimensions`` filled with ratings.

        Each row of ``dataset`` writes its ``overall`` value at
        ``[reviewer index, product index]``; if a (reviewer, product) pair
        occurs more than once the last row wins.

        Raises:
            KeyError: if a ``reviewerID`` or ``asin`` is not known to this importer.
        """
        assert isinstance(dataset, pd.DataFrame)
        ret_matrix = np.zeros(dimensions)
        if dataset.empty:
            return ret_matrix
        # Zipping the three columns avoids building a Series per row (iterrows).
        for reviewer, product, rating in zip(dataset["reviewerID"],
                                             dataset["asin"],
                                             dataset["overall"]):
            ret_matrix[self.reviewer_to_num[reviewer], self.product_to_num[product]] = rating
        return ret_matrix

In [3]:
# Seed NumPy's global RNG so the random train/test split below (and any later
# np.random shuffles) give the same result on every "Restart & Run All".
SEED = 42
np.random.seed(SEED)

DATA_PATH = "data/reviews_Amazon_Instant_Video_5.json.gz"
TEST_FRACTION = 0.1  # share of the reviews held out as the test set

data_importer = DataImporter(DATA_PATH)
data_importer.create_train_test_split(TEST_FRACTION)

In [71]:
class wide_and_deep(object):
    """Feature engineering and TensorFlow graph set-up for review records.

    ``train`` and ``test`` are sequences of dict records. The feature helpers
    read the keys ``'user'`` and ``'product'`` (numeric rating vectors) and
    ``total_ratings`` reads ``'score'``. After construction ``self.train`` and
    ``self.test`` are shuffled DataFrames with one row per record.
    """

    # (star value, output column) pairs used by add_percent_rating.
    _STAR_COLUMNS = (
        (1.0, 'percent_one_star'),
        (2.0, 'percent_two_star'),
        (3.0, 'percent_three_star'),
        (4.0, 'percent_four_star'),
        (5.0, 'percent_five_star'),
    )

    def __init__(self, train, test):
        """Shuffle both record sequences and store them as DataFrames."""
        self.train = self._shuffled_frame(train)
        self.test = self._shuffled_frame(test)

    @staticmethod
    def _shuffled_frame(records):
        """Return a DataFrame of ``records`` in random order (global NumPy RNG)."""
        # np.copy is shallow: the caller's sequence keeps its order, but the
        # individual record dicts are shared, not duplicated.
        shuffled = np.copy(records)
        np.random.shuffle(shuffled)
        return pd.DataFrame(list(shuffled))

    def add_all_features_columns(self):
        """Add every engineered feature column to ``self.train`` and ``self.test``.

        Both sets receive the same columns so they stay usable with one model.
        Calling this more than once simply recomputes the same values.
        """
        self.train = self._with_features(self.train)
        self.test = self._with_features(self.test)

    def _with_features(self, frame):
        """Return a copy of ``frame`` with all feature columns appended."""
        steps = (self.add_num_ratings,
                 self.add_top_ratings,
                 self.add_bottom_ratings,
                 self.add_average_ratings,
                 self.add_percent_rating)
        enriched = []
        # Iterating a DataFrame directly yields column names, so go through
        # per-row dicts. to_dict builds fresh dicts, which keeps the caller's
        # original records free of the added keys.
        for record in frame.to_dict('records'):
            for step in steps:
                record = step(record)
            enriched.append(record)
        if not enriched:
            return frame.copy()
        return pd.DataFrame(enriched, index=frame.index)

    def add_num_ratings(self, data):
        """Store the number of non-zero entries of each vector; returns ``data``."""
        data['num_user_ratings'] = np.count_nonzero(data['user'])
        data['num_movie_ratings'] = np.count_nonzero(data['product'])
        return data

    def add_top_ratings(self, data):
        """Store the maximum of each vector; returns ``data``."""
        data['top_user_rating'] = np.amax(data['user'])
        data['top_movie_rating'] = np.amax(data['product'])
        return data

    def add_bottom_ratings(self, data):
        """Store the minimum of each vector; returns ``data``.

        NOTE(review): the minimum is taken over the whole vector, so any zero
        entry makes the result 0 - confirm that is the intended feature.
        """
        data['bottom_user_rating'] = np.amin(data['user'])
        data['bottom_movie_rating'] = np.amin(data['product'])
        return data

    def add_average_ratings(self, data):
        """Store the mean of each vector (zeros included); returns ``data``.

        Results go to their own ``average_*`` keys so they no longer overwrite
        the ``bottom_*`` values written by ``add_bottom_ratings``.
        """
        data['average_user_rating'] = np.average(data['user'])
        data['average_movie_rating'] = np.average(data['product'])
        return data

    def add_percent_rating(self, data):
        """Store, per star value 1-5, its share of the user vector; returns ``data``.

        The denominator is the full vector length (zero entries included). An
        empty vector yields 0.0 for every share instead of dividing by zero.
        """
        user = np.asarray(data['user'])
        total = len(user)
        for stars, column in self._STAR_COLUMNS:
            matches = int(np.count_nonzero(user == stars))
            data[column] = matches / total if total else 0.0
        return data

    def total_ratings(self, data):
        """Print how many records in ``data`` have a non-zero ``'score'``."""
        count = float(sum(1 for record in data if record['score'] != 0.0))
        print(count)

    def build_model(self):
        """Create the input placeholder and the flattened per-row outer product.

        With ``D`` the number of columns of ``self.train``, the placeholder has
        shape ``[None, D]`` and the returned tensor has shape ``[None, D * D]``.
        Both are also kept on the instance as ``self.X`` and ``self.cross_prod``
        so later cells can feed and extend the graph.

        NOTE: ``tf.placeholder`` is the TensorFlow 1.x graph API.
        """
        _, D = self.train.shape
        X = tf.placeholder(dtype=tf.float32, shape=[None, D])

        def reshape_and_matmul(x):
            # column vector (D, 1) times row vector (1, D) -> (D, D) outer product
            return tf.matmul(tf.reshape(x, [x.shape[0], 1]), tf.reshape(x, [1, x.shape[0]]))

        cross_prod_mat = tf.map_fn(reshape_and_matmul, X)
        cross_prod = tf.map_fn(lambda x: tf.reshape(x, [-1]), cross_prod_mat)

        self.X = X
        self.cross_prod = cross_prod
        return cross_prod
        

In [72]:
# Hand the importer's train/test record lists to the model helper; its
# constructor shuffles each set and stores it as a DataFrame.
wad = wide_and_deep(data_importer.train, data_importer.test)

In [73]:
# Build the TF graph: a float32 placeholder with one column per column of
# wad.train, plus the flattened per-row outer product of that input.
wad.build_model()

Tensor("map_9/TensorArrayStack/TensorArrayGatherV3:0", shape=(?, 25), dtype=float32)
