In [5]:
# built-in imports
import sys
import os
import re
import json
import gzip
import pickle
import array

# third-party imports
import pandas as pd
import numpy as np
import scipy.sparse as ssp
import dgl
import torch
import torchtext

# local imports
sys.path.insert(0, '../src/pinsage')
from builder import PandasGraphBuilder
from data_utils import *

In [6]:
# Directory holding the raw data files; the input paths in the cells below are
# built relative to it with os.path.join.
directory = '../data'
# Path for the processed output (presumably pickled later, given the .pkl
# suffix; it is not used in the cells visible here — TODO confirm).
output_path = './processed-amazon.pkl'

In [7]:
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Each line is expected to contain a single Python expression (presumably a
    dict literal, given how ``getDF`` consumes the records).

    Args:
        path: Path to the gzip-compressed input file.

    Yields:
        The object obtained by evaluating each line of the file.
    """
    # The context manager closes the handle once the generator is exhausted,
    # closed, or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes arbitrary code embedded in
            # the file. Only run this on trusted data; consider
            # ast.literal_eval on the decoded line if every record is a plain
            # literal.
            yield eval(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their zero-based position in the file, and that
    position becomes the row index of the returned frame.
    """
    indexed_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(indexed_records, orient='index')

def readImageFeatures(path):
    """Yield ``(asin, features)`` pairs from a binary image-feature file.

    The file is read as a sequence of fixed-size records: a 10-byte UTF-8
    identifier followed by 4096 values of array type code ``'f'`` (C floats in
    the machine's native byte order).

    Args:
        path: Path to the binary feature file.

    Yields:
        Tuples of (identifier string, list of 4096 floats).

    Raises:
        EOFError: If the file ends partway through a feature vector (raised by
            ``array.fromfile`` when fewer than 4096 items are available).
    """
    # The context manager closes the handle once the generator is exhausted,
    # closed, or garbage-collected, instead of leaking it.
    with open(path, 'rb') as f:
        while True:
            asin = str(f.read(10), 'utf-8')
            # An empty read means end of file: no more records.
            if asin == '':
                break
            a = array.array('f')
            a.fromfile(f, 4096)
            yield asin, a.tolist()

## Review Data

In [8]:
# Load the Electronics review file into a DataFrame (one row per record in the
# file) and preview the first rows.
reviews_path = os.path.join(directory, 'reviews_Electronics_5.json.gz')
reviews = getDF(reviews_path)
reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


In [9]:
# (rows, columns) of the review table as loaded, before the dropna cell below.
reviews.shape

(1689188, 9)

## Clean Reviews Data

In [10]:
# Drop every review row that has a missing value in any column.
# NOTE(review): the events table built later only reads reviewerID, asin and
# unixReviewTime — confirm that discarding rows because of other missing
# fields (e.g. reviewerName) is intended.
reviews = reviews.dropna()
reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


## Users

In [30]:
# One row per distinct reviewerID; drop_duplicates keeps the first occurrence,
# so each row retains the index label of that reviewer's first review.
users = reviews[['reviewerID']].drop_duplicates()
users

Unnamed: 0,reviewerID
0,AO94DHGC771SJ
1,AMO214LNFCEI4
2,A3N7T0DY83Y4IG
3,A1H8PY3QHMQQA0
4,A24EV6RXELQZ63
...,...
1675080,A2WBEUEO4HFDCA
1677359,A2RCXXH309JREN
1681966,AO4O1VN2FB7B6
1683562,A2U6AWBGY1QUR1


## Products

In [24]:
# Load the Electronics product metadata file into a DataFrame (one row per
# record in the file) and preview the first rows.
products_path = os.path.join(directory, 'meta_Electronics.json.gz')
products = getDF(products_path)
products.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,132793040,http://ecx.images-amazon.com/images/I/31JIPhp%...,The Kelby Training DVD Mastering Blend Modes i...,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Mastering Blend Modes in A...,,,,
1,321732944,http://ecx.images-amazon.com/images/I/31uogm6Y...,,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,,
2,439886341,http://ecx.images-amazon.com/images/I/51k0qa8f...,Digital Organizer and Messenger,"[[Electronics, Computers & Accessories, PDAs, ...",Digital Organizer and Messenger,8.15,{'Electronics': 144944},"{'also_viewed': ['0545016266', 'B009ECM8QY', '...",
3,511189877,http://ecx.images-amazon.com/images/I/41HaAhbv...,The CLIKR-5 UR5U-8780L remote control is desig...,"[[Electronics, Accessories & Supplies, Audio &...",CLIKR-5 Time Warner Cable Remote Control UR5U-...,23.36,,"{'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...",
4,528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",


In [25]:
# Report the product table's (rows, columns) and the dtype of each column.
print(products.shape)
print(products.dtypes)

(498196, 9)
asin            object
imUrl           object
description     object
categories      object
title           object
price          float64
salesRank       object
related         object
brand           object
dtype: object


In [31]:
# Keep only the users that appear at least once in the review table.
distinct_users_in_reviews = reviews['reviewerID'].unique()
# Filter first, then copy: taking the copy of the filtered result makes
# `users` an independent frame. Copying before the filter duplicated the whole
# unfiltered frame and still left the result as a slice of that copy.
users = users[users['reviewerID'].isin(distinct_users_in_reviews)].copy()
users

Unnamed: 0,reviewerID
0,AO94DHGC771SJ
1,AMO214LNFCEI4
2,A3N7T0DY83Y4IG
3,A1H8PY3QHMQQA0
4,A24EV6RXELQZ63
...,...
1675080,A2WBEUEO4HFDCA
1677359,A2RCXXH309JREN
1681966,AO4O1VN2FB7B6
1683562,A2U6AWBGY1QUR1


In [32]:
# Keep only the products that appear at least once in the review table.
distinct_products_in_reviews = reviews['asin'].unique()
# Filter first, then copy: taking the copy of the filtered result makes
# `products` an independent frame. Copying before the filter duplicated the
# whole unfiltered frame and still left the result as a slice of that copy.
products = products[products['asin'].isin(distinct_products_in_reviews)].copy()
products

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
4,0528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",
15,0594451647,http://ecx.images-amazon.com/images/I/51RjSETO...,HDTV Adapter Kit for NOOK HD and NOOK HD+\nThi...,"[[Electronics, Computers & Accessories, Touch ...",Barnes &amp; Noble HDTV Adapter Kit for NOOK H...,49.95,,"{'also_bought': ['B009L7EEZA', 'B00AGAYQEU', '...",
20,0594481813,http://ecx.images-amazon.com/images/I/41K7ymN5...,Power up your device with this Barnes &amp; No...,"[[Electronics, eBook Readers & Accessories, Po...",Barnes &amp; Noble OV/HB-ADP Universal Power Kit,19.65,,"{'also_bought': ['B00AAKLIIS', 'B00A668GUO', '...",Barnes &amp; Noble
38,0972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[[Electronics, Accessories & Supplies, Audio &...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B000X3KOD2', 'B0074FGR74', '...",VideoSecu
53,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[[Electronics, eBook Readers & Accessories]]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['B0035CLBT4', 'B004X18N24', '...",Barnes &amp; Noble
...,...,...,...,...,...,...,...,...,...
497900,B00L2442H0,http://ecx.images-amazon.com/images/I/51NFBdDZ...,Description:Add up to 4 peripherals quickly an...,"[[Electronics, Computers & Accessories, Networ...",Sabrent 4 Port Portable USB 2.0 Hub (9.5&quot;...,5.99,{'Electronics': 1383},"{'also_bought': ['B00IRV2DL8', 'B00GU8OIYA', '...",Sabrent
497904,B00L26YDA4,http://ecx.images-amazon.com/images/I/41061q4C...,Description:The Sabrent Wifi Receiver lets you...,"[[Electronics, Car & Vehicle Electronics, Vehi...",Sabrent Wifi Audio Receiver (Supports DLNA and...,35.99,{'Cell Phones & Accessories': 6139},"{'also_bought': ['B00KB5QEYK', 'B00L2JQ8AE', '...",Sabrent
497905,B00L21HC7A,http://ecx.images-amazon.com/images/I/513kT0it...,Description:The Sabrent CR-CCU3 3-Slot Card Re...,"[[Electronics, Computers & Accessories, Cables...",Sabrent USB 3.0 SuperSpeed 3 slot Memory Card ...,14.99,{},"{'also_viewed': ['B00GAKX34E', 'B00L2442H0', '...",Sabrent
497926,B00L3YHF6O,http://ecx.images-amazon.com/images/I/41SBx7QY...,"Mind-Shattering Performance, Precision-Tuned F...","[[Electronics, Home Audio, Stereo Components, ...",NEW! Creative Sound Blaster Roar: Portable NFC...,149.99,{'Cell Phones & Accessories': 131},"{'also_bought': ['B00LBNW2TC', 'B00L8I6SFY', '...",


In [39]:
# Per-column flag: True if the column contains at least one missing value.
products.isna().any()

asin           False
imUrl           True
description     True
categories     False
title           True
price           True
salesRank       True
related         True
brand           True
dtype: bool

## Events

In [23]:
# Interaction table: one row per review, restricted to the reviewer ID, the
# product ID (asin) and the unixReviewTime column.
events = reviews[['reviewerID', 'asin', 'unixReviewTime']]
events

Unnamed: 0,reviewerID,asin,unixReviewTime
0,AO94DHGC771SJ,0528881469,1370131200
1,AMO214LNFCEI4,0528881469,1290643200
2,A3N7T0DY83Y4IG,0528881469,1283990400
3,A1H8PY3QHMQQA0,0528881469,1290556800
4,A24EV6RXELQZ63,0528881469,1317254400
...,...,...,...
1689183,A34BZM6S9L7QI4,B00LGQ6HL8,1405555200
1689184,A1G650TTTHEAL5,B00LGQ6HL8,1405382400
1689185,A25C2M3QF9G7OQ,B00LGQ6HL8,1405555200
1689186,A1E1LEVQ9VQNK,B00LGQ6HL8,1405641600


## Image Features

In [None]:
image_features_path = os.path.join(directory, 'image_features_Electronics.b')

# Build the lookup set once. Evaluating reviews['asin'].unique() inside the
# loop recomputed the unique values and scanned them linearly for every image
# record in the file.
reviewed_asins = set(reviews['asin'].unique())

# Collect [asin, feature vector] rows, keyed by a running counter, for the
# products that appear in the review table.
i = 0
df = {}
for asin, image_vector in readImageFeatures(image_features_path):
    if asin in reviewed_asins:
        df[i] = [asin, image_vector]
        i += 1

In [None]:
# Turn the collected rows into a DataFrame: one row per counter key, with the
# asin in column 0 and the feature vector (a Python list) in column 1.
image_features = pd.DataFrame.from_dict(df, orient='index')
image_features