In [11]:
import pandas as pd
import re
import json
import random
import numpy as np

In [12]:
# Load the raw Amazon product listing into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/amazon/amazon.csv')

In [13]:
# Give the columns that are kept shorter names.
df.rename(
    columns={
        'uniq_id': 'id',
        'product_name': 'name',
        'manufacturer': 'brand',
        'number_available_in_stock': 'stock',
        'average_review_rating': 'rating',
        'amazon_category_and_sub_category': 'category',
        'customer_reviews': 'reviews',
    },
    inplace=True,
)
# Remove the columns this notebook does not use.
df.drop(
    columns=[
        'number_of_reviews',
        'number_of_answered_questions',
        'customers_who_bought_this_item_also_bought',
        'product_description',
        'items_customers_buy_after_viewing_this_item',
        'customer_questions_and_answers',
        'sellers',
    ],
    inplace=True,
)

In [14]:
# Keep only the first row seen for each product name.
df.drop_duplicates(subset='name', keep='first', inplace=True)

In [15]:
# PRICE
# Strip the pound sign (literal match, not a regex) and parse the rest as a
# number; text that still does not parse becomes NaN.
df['price'] = pd.to_numeric(
    df['price'].str.replace('£', '', regex=False),
    errors='coerce',
)
# Fill unparsed/missing prices with synthetic values drawn from [0.01, 300).
# Each row gets its own draw so unpriced products do not all share one price,
# and the values are assigned back explicitly: a chained
# `df['price'].fillna(..., inplace=True)` acts on a temporary object under
# pandas Copy-on-Write and would leave the NaNs in the frame.
missing_price = df['price'].isna()
df.loc[missing_price, 'price'] = np.random.uniform(
    0.01, 300, size=int(missing_price.sum())
)
# Prices are stored with two decimal places.
df['price'] = df['price'].round(2)

In [16]:
# STOCK
# Fill the stock column with one random integer per row, drawn from [0, 300).
df['stock'] = np.random.randint(low=0, high=300, size=len(df))

In [17]:
# RATING
# Drop the " out of 5 stars" suffix and parse what is left as a number;
# values that do not parse become NaN.
df['rating'] = pd.to_numeric(
    df['rating'].str.replace(' out of 5 stars', ''),
    errors='coerce',
)

In [18]:
# CATEGORY
# Turn the " > "-separated category path into a list of its levels.
df['category'] = df['category'].str.split(pat=' > ')

In [19]:
# REVIEWS
# Break the "|"-separated review text into a list with one entry per piece.
df['reviews'] = df['reviews'].str.split(pat='|')

In [20]:
# Month number for every accepted spelling: full English names and the usual
# abbreviations, lower-cased and without a trailing dot.
_MONTH_NUMBERS = {
    'jan': 1, 'january': 1,
    'feb': 2, 'february': 2,
    'mar': 3, 'march': 3,
    'apr': 4, 'april': 4,
    'may': 5,
    'jun': 6, 'june': 6,
    'jul': 7, 'july': 7,
    'aug': 8, 'august': 8,
    'sep': 9, 'sept': 9, 'september': 9,
    'oct': 10, 'october': 10,
    'nov': 11, 'november': 11,
    'dec': 12, 'december': 12,
}


def parseDate(string):
    """Convert a "<day> <month> <year>" date into "<year>-<month>-<day>".

    The month may be a full English name or a common abbreviation, with or
    without a trailing dot and in any letter case (e.g. "Aug.", "Sept.",
    "June"). Day and year are copied through as written and the month is
    emitted as an unpadded number, so "6 April 2014" becomes "2014-4-6".
    Whitespace-separated tokens after the third are ignored.

    Args:
        string: Date text whose first three whitespace-separated tokens are
            day, month name and year.

    Returns:
        The date as a "<year>-<month number>-<day>" string.

    Raises:
        IndexError: If the text has fewer than three tokens.
        KeyError: If the month token is not a recognised month name.
    """
    parts = string.split()
    # Normalise the token so "Aug.", "aug" and "AUG." all hit the same key.
    month_key = parts[1].rstrip('.').lower()
    month = _MONTH_NUMBERS[month_key]
    return parts[2] + "-" + str(month) + "-" + parts[0]

In [21]:
def parseAuthor(string):
    """Extract the reviewer name from text of the form "By <name> on ...".

    Args:
        string: Text expected to contain "By <name> on".

    Returns:
        The text between the first "By " and the nearest following " on",
        or "" when the pattern is absent or `string` is not a str.
    """
    # Non-string input (e.g. None or NaN) carries no author; report "none"
    # explicitly instead of relying on a caught exception.
    if not isinstance(string, str):
        return ""
    matches = re.findall(r'By (.+?) on', string)
    return matches[0] if matches else ""
    

In [22]:
def _parse_review(raw):
    """Build one review dict from a raw "title // rating // date // author // body" string."""
    # Collapse every run of whitespace (including newlines) to a single space.
    normalized = " ".join(raw.split())
    # Split into at most five fields so a " // " inside the body stays in the body.
    fields = normalized.split(' // ', 4)
    return {
        'author': parseAuthor(fields[3]),
        'rating': fields[1],
        'date': parseDate(fields[2]),
        'title': fields[0],
        'body': fields[4],
    }


def splitreviews(row):
    """Parse the raw review strings of one row into a list of review dicts.

    Each review is expected to look like
    "title // rating // date // author // body". Reviews that do not fit that
    shape are skipped individually, so one malformed entry no longer discards
    the other reviews of the same product.

    Args:
        row: Object exposing a `reviews` attribute holding an iterable of raw
            review strings (or a non-iterable such as NaN when there are none).

    Returns:
        A list of dicts with the keys 'author', 'rating', 'date', 'title' and
        'body'; empty when there are no reviews or none could be parsed.
    """
    try:
        raw_reviews = list(row.reviews)
    except TypeError:
        # Missing reviews (e.g. NaN) are not iterable.
        return []

    parsed_reviews = []
    for raw in raw_reviews:
        try:
            parsed_reviews.append(_parse_review(raw))
        except (AttributeError, IndexError, KeyError, TypeError):
            # AttributeError/TypeError: the entry is not a string.
            # IndexError: too few fields, or a date with fewer than three tokens.
            # KeyError: parseDate does not recognise the month.
            continue
    return parsed_reviews

# Run splitreviews on every row and store its result as that row's reviews.
df['reviews'] = df.apply(func=splitreviews, axis='columns')

In [23]:
# Every row in this frame comes from the same store.
df['store'] = "amazon"

# Sequential, 1-based keys of the form "products/amazon/<n>", in row order.
row_numbers = range(1, len(df) + 1)
df['Id'] = ["products/amazon/" + str(number) for number in row_numbers]

# The original 'id' column is dropped now that 'Id' is in place.
df.drop(columns=['id'], inplace=True)

In [24]:
# Write the cleaned products to CSV as UTF-8, without the index column.
df.to_csv(path_or_buf='data/amazon/products.csv', index=False, encoding='utf-8')

In [25]:
# Serialise the frame to JSON records with pandas (values it cannot encode are
# passed through str), then load the text back into plain Python objects.
parsed = json.loads(
    df.to_json(orient="records", default_handler=str)
)

# Re-encode with the json module to get 4-space indented output.
json_string = json.dumps(parsed, indent=4)

with open('data/output/amazon.json', mode='w') as outfile:
    outfile.write(json_string)