In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../src')

In [3]:
import numpy as np
import pandas as pd
import glob
import json
import logging

In [4]:
DATASET_PATH = '../../datasets/amazon-sneakers'

In [38]:
products_detail_files         = !find {DATASET_PATH} -type f -name '??????????.json'
products_detail_variant_files = !find {DATASET_PATH} -type f -name '*_variant_*.json'

products_detail_files = products_detail_files + products_detail_variant_files

In [39]:
def load_json(path):
    with open(path,'r') as f:
        return json.load(f)

def to_product_detail_json(path):
    json = load_json(path)
    json['path'] = path
    json['variant'] = ('variant' in path)
    return json
    
product_details = [to_product_detail_json(file) for file in products_detail_files]

In [40]:
def price(data):    
    if 'price_range' in data and data['price_range'] != None:
        price_range = data['price_range']
        
        if isinstance(price_range, str):
            return price_range.split('Details The List Price')[0].replace('List Price:', '').replace('$', '')
        elif 'from' in price_range and 'to' in price_range:
            return (price_range['to'] - price_range['from']) / 2
        elif 'from' in price_range:
            return price_range['from']
        elif 'to' in price_range:
            return price_range['to']

    return 0

def id(data):
    filename = data['path'].split('/')[-1]
    
    if '_variant_' in filename:
        return filename.split('_variant_')[-1].replace('.json', '')
    else:
        return filename.replace('.json', '')

def description(data):
    desc = data['description']
    return desc if desc else ''
    
def to_product_detail_row(data):
    try:        
        return {
            'id'         : id(data),
            'title'      : data['title'],
            'description': description(data),
            'variant'    : data['variant'],
            'url'        : data['images'][0][0],
            'rating'     : data['rating'],
            'reviews'    : data['reviews_count'],
            'price'      : price(data)
        }
    except Exception as e:
        logging.error(f'{e}. Data: {data}')
        

product_detail_rows = [to_product_detail_row(pd) for pd in product_details if pd['id']]

In [42]:
product_detail_rows[100]

In [43]:
product_detail = pd.DataFrame(product_detail_rows)
product_detail.head(5)

Unnamed: 0,id,title,description,variant,url,rating,reviews,price
0,B089PV3D1G,Superga Women's 2750 Cotu Classic Sneaker,,False,https://m.media-amazon.com/images/I/612WcQ1+bI...,4.2,7396,17.525
1,B07MTVLHYS,New Suede Shoe Cleaner Kit. Suede Brush & Sued...,,False,https://m.media-amazon.com/images/I/71tbpuN6ZI...,4.4,872,0.0
2,B093QLZJ4F,New Balance Women's Dynasoft Nergize V3 Cross ...,,False,https://m.media-amazon.com/images/I/71pR+J-gp4...,4.4,117,0.0
3,B086187DQY,NCNDB Women's Camouflage Wedges Sneakers Inner...,welcome.our Sneaker shoes size tabs are chines...,False,https://m.media-amazon.com/images/I/61reE-na08...,4.2,191,1.82
4,B092XC424L,Reebok Men's Classic Harman Run Sneaker,,False,https://m.media-amazon.com/images/I/71FaYc6pSM...,4.6,6501,10.465


In [44]:
product_detail.id.value_counts()

In [45]:
product_detail.to_json(f'{DATASET_PATH}/items.json', orient="records")

In [46]:
def to_reviews_json(path):
    reviews = load_json(path)
    for review in reviews:
        review['item_id']  = path.split('/')[-1].split('_')[0]
        review['username'] = review['user']
        del review['user']
    return reviews


product_reviews = []
for file in glob.glob(f'{DATASET_PATH}/*_reviews.json'):
    product_reviews.extend(to_reviews_json(file))

In [47]:
user_interactions = pd.DataFrame(product_reviews)
user_interactions = user_interactions[['username', 'item_id', 'rating', 'message']]

In [48]:
user_interactions.to_json(f'{DATASET_PATH}/interactions.json', orient="records")

# Database prepare

In [64]:
items        = pd.read_json(f'{DATASET_PATH}/items.json')
interactions = pd.read_json(f'{DATASET_PATH}/interactions.json')

In [65]:
def desc_price(value):
    return ' USD ' + str(value) if value else ''

items['description'] = items['description'] + [desc_price(i) for i in items['price'].values]

In [66]:
def sequence_from(df, source, target):
    df[target] = pd.factorize(df[source])[0] + 1
    return df

select = lambda df, columns: df[columns]
rename = lambda df, columns: df.rename(columns=columns)


items = items \
    .pipe(sequence_from, source='id', target='item_id') \
    .pipe(select, ['item_id', 'id', 'title', 'description', 'url']) \
    .pipe(rename, { 'id': 'original_id', 'title': 'name', 'url': 'image'})

items.head(5)

Unnamed: 0,item_id,original_id,name,description,image
0,1,B089PV3D1G,Superga Women's 2750 Cotu Classic Sneaker,USD 17.525,https://m.media-amazon.com/images/I/612WcQ1+bI...
1,2,B07MTVLHYS,New Suede Shoe Cleaner Kit. Suede Brush & Sued...,,https://m.media-amazon.com/images/I/71tbpuN6ZI...
2,3,B093QLZJ4F,New Balance Women's Dynasoft Nergize V3 Cross ...,,https://m.media-amazon.com/images/I/71pR+J-gp4...
3,4,B086187DQY,NCNDB Women's Camouflage Wedges Sneakers Inner...,welcome.our Sneaker shoes size tabs are chines...,https://m.media-amazon.com/images/I/61reE-na08...
4,5,B092XC424L,Reebok Men's Classic Harman Run Sneaker,USD 10.465,https://m.media-amazon.com/images/I/71FaYc6pSM...


In [67]:
interactions = interactions \
    .pipe(sequence_from, source='username', target='user_id') \
    .pipe(select, ['item_id', 'rating', 'user_id']) \
    .pipe(rename, {'item_id': 'original_item_id'})

interactions.head(5)

Unnamed: 0,original_item_id,rating,user_id
0,B0BG8YCNZM,4.0,1
1,B0BG8YCNZM,4.0,2
2,B0BG8YCNZM,4.0,3
3,B0BG8YCNZM,5.0,4
4,B0BG8YCNZM,5.0,5


In [68]:
interactions = interactions.merge(items, left_on  = 'original_item_id', right_on = 'original_id') \
    .pipe(select, ['user_id', 'item_id', 'rating'])

interactions.head(5)

Unnamed: 0,user_id,item_id,rating
0,98,1082,4.0
1,99,1082,5.0
2,10,1082,4.0
3,100,1082,4.0
4,101,1082,5.0


In [69]:
interactions.to_json(f'{DATASET_PATH}/db_interactions.json', orient="records")

In [70]:
items = items.pipe(select, ['item_id', 'name', 'description', 'image']) \
              .pipe(rename, {'item_id': 'id'})
items.head(5)

Unnamed: 0,id,name,description,image
0,1,Superga Women's 2750 Cotu Classic Sneaker,USD 17.525,https://m.media-amazon.com/images/I/612WcQ1+bI...
1,2,New Suede Shoe Cleaner Kit. Suede Brush & Sued...,,https://m.media-amazon.com/images/I/71tbpuN6ZI...
2,3,New Balance Women's Dynasoft Nergize V3 Cross ...,,https://m.media-amazon.com/images/I/71pR+J-gp4...
3,4,NCNDB Women's Camouflage Wedges Sneakers Inner...,welcome.our Sneaker shoes size tabs are chines...,https://m.media-amazon.com/images/I/61reE-na08...
4,5,Reebok Men's Classic Harman Run Sneaker,USD 10.465,https://m.media-amazon.com/images/I/71FaYc6pSM...


In [71]:
items.to_json(f'{DATASET_PATH}/db_items.json', orient="records")