In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../src')

In [3]:
import numpy as np
import pandas as pd
import glob
import json
import logging

In [4]:
def columns_to_dict(df, key_col, value_col):
    """Build a dictionary that maps ``df[key_col]`` entries to ``df[value_col]`` entries.

    Args:
        df: Source DataFrame.
        key_col: Name of the column whose values become the dictionary keys.
        value_col: Name of the column whose values become the dictionary values.

    Returns:
        A plain ``dict`` pairing the key and value found on each row. If a key
        occurs on several rows, the value from the last of those rows is kept.
    """
    keys = df[key_col]
    values = df[value_col].values
    lookup = pd.Series(values, index=keys)
    return lookup.to_dict()

In [5]:
# Directory holding the Amazon Books JSON files that this notebook reads and
# writes (relative path).
DATASET_PATH = '../../datasets/amazon-books'

In [6]:
# Load the raw book metadata.
# `item_id` is pinned to `str`: by default the pandas JSON reader infers dtypes
# and may coerce a column of numeric-looking ids to integers, which would strip
# the leading zeros from ids such as "0001050230".
books = pd.read_json(
    f'{DATASET_PATH}/user_500_1500_books.json',
    dtype={'item_id': str},
)

In [7]:
# Show which metadata columns the raw books file provides.
books.columns

In [8]:
# Append the main category to each description so the category becomes part of
# the item text.
# `pd.notna` is checked before truthiness because a missing category that
# arrives as NaN is truthy and would otherwise be rendered as the literal text
# " Category: nan". Empty strings and None still produce no suffix.
# NOTE(review): assumes `main_category` holds scalar values (strings) - confirm.
category_suffix = books['main_category'].apply(
    lambda category: f' Category: {category}' if pd.notna(category) and category else ''
)
books['description'] = books['description'] + category_suffix

In [9]:
# Narrow the frame to the identifier and the two text columns.
books_columns = ['item_id', 'title', 'description']
books = books.loc[:, books_columns]

In [10]:
# Preview the first five books (five rows is the default for `head`).
books.head()

Unnamed: 0,item_id,title,description
0,000105001X,Lady of Hay,"Jo Clifford, a successful journalist, is all s..."
1,0001050230,Love's Labour's Lost: Performed by Derek Jacob...,William Shakespeare is widely regarded as the ...
2,0001052292,Microserfs,
3,0000230022,The Simple Truths of Service: Inspired by John...,Simple Truths of Service: Inspired by Jonny th...
4,0001047868,Kidnapped (HarperCollinsAudioBooks),Grade 6 Up-Kidnapped by Robert Louis Stevenson...


In [11]:
# Write the trimmed book table as a JSON array with one object per row.
books.to_json(
    path_or_buf=f'{DATASET_PATH}/items.json',
    orient='records',
)

In [12]:
# Load the raw user/item interactions.
# `item_id` is pinned to `str`: by default the pandas JSON reader infers dtypes
# and may coerce a column of numeric-looking ids to integers, which would strip
# leading zeros and break matching against the string ids in the books table.
interations = pd.read_json(
    f'{DATASET_PATH}/user_500_1500_interactions.json',
    dtype={'item_id': str},
)

In [13]:
# Keep only the item, user and rating columns, then preview the first rows.
interations = interations.loc[:, ['item_id', 'user_id', 'rating']]
interations.head()

Unnamed: 0,item_id,user_id,rating
0,2006448,A2B1GZIRD6W646,4
1,2006448,A1DK5AZMXS1QA3,4
2,1712799,A1BNWEJ7RVPLQ1,5
3,2005263,ARVOX5K5ZJOJR,3
4,2005263,A1XQ1JB4A3UWIK,5


In [14]:
# Write the trimmed interactions as a JSON array with one object per row.
interations.to_json(
    path_or_buf=f'{DATASET_PATH}/interactions.json',
    orient='records',
)

# Database preparation

In [15]:
# Reload the intermediate files written above.
# `item_id` is pinned to `str` in both frames: by default the pandas JSON reader
# infers dtypes and may coerce numeric-looking ids to integers, stripping
# leading zeros. The string-keyed lookup applied to `items['item_id']` further
# down relies on both frames carrying the same string ids.
items = pd.read_json(
    f'{DATASET_PATH}/items.json',
    dtype={'item_id': str},
)
interactions = pd.read_json(
    f'{DATASET_PATH}/interactions.json',
    dtype={'item_id': str},
)

In [16]:
def sequence_from(df, source, target):
    """Return a copy of ``df`` with a 1-based integer code per distinct ``source`` value.

    Codes follow order of first appearance: the first distinct value gets 1,
    the next previously unseen value gets 2, and so on. ``pd.factorize`` labels
    missing values with -1, so rows whose ``source`` value is missing get 0.

    The input frame is not modified; the codes are written to a copy. This
    keeps the function free of side effects when used in ``.pipe`` chains or
    when a notebook cell is re-run.

    Args:
        df: DataFrame containing the ``source`` column.
        source: Name of the column to encode.
        target: Name of the column that receives the integer codes. An existing
            column with this name is overwritten in the returned copy.

    Returns:
        A new DataFrame equal to ``df`` plus the ``target`` column.
    """
    codes, _ = pd.factorize(df[source])
    result = df.copy()
    # factorize codes start at 0; shift so that real values start at 1.
    result[target] = codes + 1
    return result

def select(df, columns):
    """Return ``df[columns]``; pipe-friendly wrapper around column selection."""
    return df[columns]


def rename(df, columns):
    """Return ``df`` with columns relabelled according to the ``columns`` mapping."""
    return df.rename(columns=columns)

In [17]:
# Add 1-based integer codes for users and items next to the original ids,
# then preview the result.
interactions = (
    interactions
    .pipe(sequence_from, source='user_id', target='user_seq')
    .pipe(sequence_from, source='item_id', target='item_seq')
)

interactions.head()

Unnamed: 0,item_id,user_id,rating,user_seq,item_seq
0,2006448,A2B1GZIRD6W646,4,1,1
1,2006448,A1DK5AZMXS1QA3,4,2,1
2,1712799,A1BNWEJ7RVPLQ1,5,3,2
3,2005263,ARVOX5K5ZJOJR,3,4,3
4,2005263,A1XQ1JB4A3UWIK,5,5,3


In [18]:
# Pair each original item_id with its sequence number, dropping rows where
# either of the two is missing.
asin_seq = interactions.loc[:, ['item_id', 'item_seq']].dropna()

asin_seq.head()

Unnamed: 0,item_id,item_seq
0,2006448,1
1,2006448,1
2,1712799,2
3,2005263,3
4,2005263,3


In [19]:
# Keep the catalogue rows whose item_id occurs in the interactions and compare
# the size of that subset with the size of the full catalogue.
has_interactions = items['item_id'].isin(asin_seq['item_id'].unique())
filtered_items = items[has_interactions]
(filtered_items.shape, items.shape)

In [20]:
# Lookup table: original item_id -> item sequence number.
asin_seq_dic = columns_to_dict(
    asin_seq,
    key_col='item_id',
    value_col='item_seq',
)

In [21]:
# Derive a cover-image URL from each original item_id, then preview the result.
items['image'] = items['item_id'].map(
    lambda asin: f'http://images.amazon.com/images/P/{asin}.01._SCLZZZZZZZ_.jpg'
)
items.head()

Unnamed: 0,item_id,title,description,image
0,000105001X,Lady of Hay,"Jo Clifford, a successful journalist, is all s...",http://images.amazon.com/images/P/000105001X.0...
1,0001050230,Love's Labour's Lost: Performed by Derek Jacob...,William Shakespeare is widely regarded as the ...,http://images.amazon.com/images/P/0001050230.0...
2,0001052292,Microserfs,,http://images.amazon.com/images/P/0001052292.0...
3,0000230022,The Simple Truths of Service: Inspired by John...,Simple Truths of Service: Inspired by Jonny th...,http://images.amazon.com/images/P/0000230022.0...
4,0001047868,Kidnapped (HarperCollinsAudioBooks),Grade 6 Up-Kidnapped by Robert Louis Stevenson...,http://images.amazon.com/images/P/0001047868.0...


In [22]:
# Replace each original item_id with its integer sequence number.
# The lookup is keyed by the string form of the id, as before, but is done with
# a vectorised `map` so that every unmapped id can be reported at once.
item_seq = items['item_id'].astype(str).map(asin_seq_dic)

# `map` leaves NaN where the id has no entry in the lookup table. Fail with the
# same exception type a direct dict lookup would raise, but say how many ids are
# affected and show a few of them.
unmapped = items.loc[item_seq.isna(), 'item_id']
if not unmapped.empty:
    raise KeyError(
        f'{len(unmapped)} item_id value(s) have no entry in asin_seq_dic, '
        f'e.g. {unmapped.head(5).tolist()}'
    )

items['item_id'] = item_seq.astype(int)

# Rename to the column names used in the exported item table.
items = items.rename(columns={'item_id': 'id', 'title': 'name'})

In [33]:
# Remove double quotes and backslashes from both text columns. Both patterns
# are matched literally, not as regular expressions.
for text_column in ('name', 'description'):
    without_quotes = items[text_column].str.replace('"', '', regex=False)
    items[text_column] = without_quotes.str.replace("\\", '', regex=False)

In [34]:
# Preview the cleaned item table (five rows is the default for `head`).
items.head()

Unnamed: 0,id,name,description,image
0,25,Lady of Hay,"Jo Clifford, a successful journalist, is all s...",http://images.amazon.com/images/P/000105001X.0...
1,92,Love's Labour's Lost: Performed by Derek Jacob...,William Shakespeare is widely regarded as the ...,http://images.amazon.com/images/P/0001050230.0...
2,22,Microserfs,,http://images.amazon.com/images/P/0001052292.0...
3,7,The Simple Truths of Service: Inspired by John...,Simple Truths of Service: Inspired by Jonny th...,http://images.amazon.com/images/P/0000230022.0...
4,188,Kidnapped (HarperCollinsAudioBooks),Grade 6 Up-Kidnapped by Robert Louis Stevenson...,http://images.amazon.com/images/P/0001047868.0...


In [35]:
# Switch the interactions to the integer ids: keep the sequence columns plus
# the rating and give the sequence columns the `user_id` / `item_id` names.
interactions = (
    interactions
    .loc[:, ['user_seq', 'item_seq', 'rating']]
    .rename(columns={'user_seq': 'user_id', 'item_seq': 'item_id'})
)
interactions.head()

Unnamed: 0,user_id,item_id,rating
0,1,1,4
1,2,1,4
2,3,2,5
3,4,3,3
4,5,3,5


In [36]:
# Write the integer-keyed interactions as a JSON array with one object per row.
interactions.to_json(
    path_or_buf=f'{DATASET_PATH}/db_interactions.json',
    orient='records',
)

In [37]:
# Write the integer-keyed item table as a JSON array with one object per row.
items.to_json(
    path_or_buf=f'{DATASET_PATH}/db_items.json',
    orient='records',
)