In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../src')

In [3]:
import numpy as np
import pandas as pd
import glob
import json
import logging

In [4]:
DATASET_PATH = '../datasets/amazon-sneakers'

In [5]:
products_detail_files         = !find {DATASET_PATH} -type f -name '??????????.json'
products_detail_variant_files = !find {DATASET_PATH} -type f -name '*_variant_*.json'

products_detail_files = products_detail_files + products_detail_variant_files
# products_detail_files

In [6]:
def load_json(path):
    with open(path,'r') as f:
        return json.load(f)

def to_product_detail_json(path):
    json = load_json(path)
    json['variant'] = ('variant' in path)
    return json
    
product_details = [to_product_detail_json(file) for file in products_detail_files]

In [7]:
def price(data):    
    if 'price_range' in data and data['price_range'] != None:
        price_range = data['price_range']
        
        if isinstance(price_range, str):
            return price_range.split('Details The List Price')[0].replace('List Price:', '').replace('$', '')
        elif 'from' in price_range and 'to' in price_range:
            return (price_range['to'] - price_range['from']) / 2
        elif 'from' in price_range:
            return price_range['from']
        elif 'to' in price_range:
            return price_range['to']

    return 0

def to_product_detail_row(data):
    try:        
        return {
            'id'         : data['id'],
            'title'      : data['title'],
            'description': data['description'],
            'variant'    : data['variant'],
            'url'        : data['images'][0][0],
            'rating'     : data['rating'],
            'reviews'    : data['reviews_count'],
            'price'      : price(data)
        }
    except Exception as e:
        logging.error(f'{e}. Data: {data}')
        

product_detail_rows = [to_product_detail_row(pd) for pd in product_details if pd['id']]

In [8]:
product_detail_rows[0]

In [9]:
product_detail = pd.DataFrame(product_detail_rows)
product_detail

Unnamed: 0,id,title,description,variant,url,rating,reviews,price
0,B0B8V49Z1V,FLYFUPPY Women's Slip-on Canvas Comfort Fashio...,,False,https://m.media-amazon.com/images/I/81FUVhjkB1...,4.6,16,0.0
1,B08TRDBGQF,Nike Women's Low-Top Sneakers,These shoes provide the user with great comfor...,False,https://m.media-amazon.com/images/I/61DOlkBbfJ...,2.7,7,31.0
2,B06XVY23RT,Reebok Women's Club C Sneaker,,False,https://m.media-amazon.com/images/I/81B-Vy65uk...,4.6,16053,13.075
3,B0785RGRWX,Reebok Men's Classic Harman Run Sneaker,,False,https://m.media-amazon.com/images/I/71FaYc6pSM...,4.6,6501,10.465
4,B0BDZV1LCS,Steve Madden Ganzo-R Sneaker,,False,https://m.media-amazon.com/images/I/71C-5pfm1D...,5.0,1 rating,0.0
...,...,...,...,...,...,...,...,...
1780,B08M41C7LJ,New Balance Women's 237 V1 Classic Sneaker,The running shoes of the 1970s might best be d...,True,https://m.media-amazon.com/images/I/71R7sXYcIH...,4.3,331,0.0
1781,B0BCKJCPCC,Niluber Women's Chunky Platform Dad Lace-Up Ca...,Product Attributes Product Volume：‎ 11.52 x 9....,True,https://m.media-amazon.com/images/I/51OzTAM+i2...,4.5,2,0.0
1782,B08MQ3KH5J,GOOBON Air Shoes for Women Tennis Sports Athle...,,True,https://m.media-amazon.com/images/I/81JUPSpLy4...,4.2,4033,0
1783,B083M256V2,Women's Walking Shoes Sock Sneakers Slip on Me...,,True,https://m.media-amazon.com/images/I/71SsHeFWTy...,4.1,13313,9.0


In [10]:
product_detail.price.value_counts()

In [44]:
product_detail.to_json(f'{DATASET_PATH}/items.json', orient="records")

In [37]:
def to_reviews_json(path):
    reviews = load_json(path)
    for review in reviews:
        review['item_id']  = path.split('/')[-1].split('_')[0]
        review['username'] = review['user']
        del review['user']
    return reviews


product_reviews = []
for file in glob.glob(f'{DATASET_PATH}/*_reviews.json'):
    product_reviews.extend(to_reviews_json(file))

In [42]:
user_interactions = pd.DataFrame(product_reviews)
user_interactions = user_interactions[['username', 'item_id', 'rating', 'message']]

In [45]:
user_interactions.to_json(f'{DATASET_PATH}/interactions.json', orient="records")