In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np
import gzip
import simplejson
import json

In [4]:
class DataImporter(object):
    """Load gzipped JSON-lines review data and build user x product matrices.

    Attributes set by ``__init__``:
        data: DataFrame with one row per parsed JSON line.
        num_reviews: number of rows in ``data``.
        reviewers / products: unique ``reviewerID`` / ``asin`` values, in
            order of first appearance.
        num_reviewers / num_products: lengths of the two arrays above.
        reviewer_to_num / num_to_reviewer: reviewerID <-> matrix row index.
        product_to_num / num_to_product: asin <-> matrix column index.

    Attributes set by ``create_train_test_split``:
        train_matrix / test_matrix: (num_reviewers, num_products) float
            arrays holding ``overall`` where a review exists, 0 elsewhere.
        train / test: lists of dicts produced by ``permute_matrix``.
    """

    # The only columns the matrices are built from. Missing-value filtering is
    # restricted to these so that a review is not discarded merely because an
    # unrelated column is absent from its JSON record.
    _REQUIRED_COLUMNS = ["reviewerID", "asin", "overall"]

    def __init__(self, path_to_data):
        """Read the reviews and index reviewers and products.

        Args:
            path_to_data: path to a gzip file holding one JSON object per line.
        """
        # Context manager so the gzip handle is closed even if a line fails
        # to parse; whitespace-only lines are skipped instead of crashing.
        with gzip.open(path_to_data, "rt") as handle:
            raw_data = [json.loads(line) for line in handle if line.strip()]
        self.data = pd.DataFrame(raw_data)
        self.num_reviews = len(self.data)

        # Distinct users and products.
        self.reviewers = self.data["reviewerID"].unique()
        self.num_reviewers = len(self.reviewers)
        self.products = self.data["asin"].unique()
        self.num_products = len(self.products)

        # Two-way lookups between identifiers and matrix indices.
        self.num_to_reviewer = dict(enumerate(self.reviewers))
        self.reviewer_to_num = {name: idx for idx, name in self.num_to_reviewer.items()}
        self.num_to_product = dict(enumerate(self.products))
        self.product_to_num = {name: idx for idx, name in self.num_to_product.items()}

    def create_train_test_split(self, frac_test, seed=None, skip_empty=False):
        """Randomly split the reviews and build the train/test structures.

        Args:
            frac_test: fraction of reviews (0..1) held out for testing; the
                test size is ``int(num_reviews * frac_test)``.
            seed: optional seed for a private RNG, making the split
                reproducible. ``None`` (default) draws from NumPy's global
                RNG exactly as before.
            skip_empty: forwarded to ``permute_matrix``.

        Raises:
            ValueError: if ``frac_test`` is outside [0, 1].
        """
        if not 0 <= frac_test <= 1:
            raise ValueError("frac_test must be between 0 and 1, got %r" % (frac_test,))

        # Pick the held-out row positions without replacement.
        rng = np.random if seed is None else np.random.RandomState(seed)
        test_indices = rng.choice(self.num_reviews,
                                  size=int(self.num_reviews * frac_test),
                                  replace=False)

        # Select both partitions by position so they stay complementary
        # whatever the frame's index labels are.
        is_test = np.zeros(self.num_reviews, dtype=bool)
        is_test[test_indices] = True
        raw_test = self.data.iloc[test_indices].dropna(subset=self._REQUIRED_COLUMNS)
        raw_train = self.data.iloc[~is_test].dropna(subset=self._REQUIRED_COLUMNS)

        # Both matrices share the full (reviewers x products) shape.
        dim = (self.num_reviewers, self.num_products)
        self.test_matrix = self.populate_user_product_review_matrix(dim, raw_test)
        self.train_matrix = self.populate_user_product_review_matrix(dim, raw_train)

        self.test = self.permute_matrix(self.test_matrix, skip_empty=skip_empty)
        self.train = self.permute_matrix(self.train_matrix, skip_empty=skip_empty)

    def permute_matrix(self, mat, skip_empty=False):
        """Expand a rating matrix into one record per (user, product) cell.

        Each record holds the cell's value as ``score``, the user's row with
        that cell zeroed as ``user``, the product's column with that cell
        zeroed as ``product``, plus the ``reviewerID`` and ``asin`` labels.
        ``mat`` itself is left unmodified.

        NOTE(review): every record owns a copy of a full row and a full
        column, so the dense output needs roughly
        rows * cols * (rows + cols) floats. On a large matrix consider
        ``skip_empty=True``.

        Args:
            mat: 2-D array indexed [reviewer index, product index].
            skip_empty: when True, cells whose value equals 0 are not emitted.
                Presumably 0 means "no review", since the matrices start out
                zero-filled; confirm 0 is never a real rating.

        Returns:
            list of dicts, in row-major cell order.
        """
        mat = np.asarray(mat)
        perm = []
        for u_idx, row in enumerate(mat):
            for prod_idx, score in enumerate(row):
                if skip_empty and score == 0:
                    continue
                # .copy() is essential: row[:] and mat[:, j] are views on a
                # NumPy array, so zeroing through them would wipe ``mat`` and
                # make every stored vector alias the same buffer.
                user_cpy = row.copy()
                user_cpy[prod_idx] = 0
                prod_cpy = mat[:, prod_idx].copy()
                prod_cpy[u_idx] = 0
                perm.append({"user": user_cpy,
                             "reviewerID": self.num_to_reviewer[u_idx],
                             "product": prod_cpy,
                             "score": score,
                             "asin": self.num_to_product[prod_idx]
                             })
        return perm

    def populate_user_product_review_matrix(self, dimensions, dataset):
        """Build a zero-filled matrix and write each review's ``overall`` into it.

        Args:
            dimensions: shape passed to ``np.zeros``.
            dataset: DataFrame with ``reviewerID``, ``asin`` and ``overall``
                columns; identifiers must be known to this importer.

        Returns:
            The filled float matrix. If a (reviewer, product) pair occurs
            more than once, the last row in ``dataset`` wins.
        """
        assert isinstance(dataset, pd.DataFrame)
        ret_matrix = np.zeros(dimensions)
        # Iterate the three columns directly; this avoids building a Series
        # per row as iterrows() does, while keeping the same row order.
        for reviewer, product, rating in zip(dataset["reviewerID"],
                                             dataset["asin"],
                                             dataset["overall"]):
            ret_matrix[self.reviewer_to_num[reviewer], self.product_to_num[product]] = rating
        return ret_matrix

In [5]:
# Seed NumPy's global RNG so the random train/test split below is identical
# on every Restart & Run All; the split draws its indices via np.random.
np.random.seed(0)

# Parse the gzipped review file and index its reviewers and products.
data_importer = DataImporter("data/reviews_Amazon_Instant_Video_5.json.gz")
# Hold out 10% of the reviews as the test partition.
data_importer.create_train_test_split(0.1)

In [None]:
# Write the training records to disk in .npy format. np.save opens the path
# in binary write mode itself; the name already ends in ".npy", so nothing is
# appended to it.
np.save("data/train.npy", data_importer.train)

In [None]:
# Write the held-out test records to disk in .npy format. np.save opens the
# path in binary write mode itself; the name already ends in ".npy", so
# nothing is appended to it.
np.save("data/test.npy", data_importer.test)