In [130]:
import requests
import json
import pandas as pd

# Items

In [120]:
# Download the raw item records. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() reports an HTTP failure directly
# instead of handing an error page to the JSON parser.
response = requests.get(
    "https://grepp-programmers-challenges.s3.ap-northeast-2.amazonaws.com/2020-birdview/item-data.json",
    timeout=30,
)
response.raise_for_status()
item_list = json.loads(response.content.decode("utf-8"))
# item_list is a list of records, so it goes straight to the DataFrame
# constructor; "price" is cast to int.
item_df = pd.DataFrame(item_list).astype({"price": int})

In [121]:
# Lower-case the ingredient strings so they can be compared
# case-insensitively against the ingredient names.
item_df = item_df.assign(ingredients=item_df["ingredients"].str.lower())

In [122]:
# Display the dtype pandas assigned to each item column.
item_df.dtypes

category        object
gender          object
id               int64
imageId         object
ingredients     object
monthlySales     int64
name            object
price            int64
dtype: object

In [123]:
# Print the longest value of every object-dtype column; these maxima are
# used to pick suitable column lengths.
text_columns = [col for col in item_df.columns if item_df[col].dtype == object]
for col in text_columns:
    longest = item_df[col].map(len).max()
    print (col, longest)

category 10
gender 6
imageId 36
ingredients 96
name 70


In [124]:
# Show the imageId of one raw record to see what the values look like.
item_list[4]["imageId"]

'1647f43c-2919-4cbf-9de4-b56a4817779d'

In [125]:
# Keep only the item id and its ingredient string; this pair is the input
# for building the item <-> ingredient relation.
item_ingr_df = item_df.loc[:, ["id", "ingredients"]]

# Ingredients

In [9]:
# Download the raw ingredient records. The timeout keeps a stalled connection
# from hanging the kernel, and raise_for_status() reports an HTTP failure
# directly instead of handing an error page to the JSON parser.
response = requests.get(
    "https://grepp-programmers-challenges.s3.ap-northeast-2.amazonaws.com/2020-birdview/ingredient-data.json",
    timeout=30,
)
response.raise_for_status()
ingr_list = json.loads(response.content.decode("utf-8"))
# ingr_list is a list of records, so it goes straight to the DataFrame
# constructor.
ingr_df = pd.DataFrame(ingr_list)


In [10]:
# Lower-case the ingredient names so lookups by name are case-insensitive.
ingr_df = ingr_df.assign(name=ingr_df["name"].str.lower())

In [11]:
# Order the ingredients by name and renumber the rows from 0; the resulting
# index is what gets used as the ingredient primary key.
ingr_df = (
    ingr_df
    .sort_values("name")
    .reset_index(drop=True)
)

In [12]:
def convert_score(mark):
    """Translate an O/X mark into a numeric score.

    "O" becomes 1, "X" becomes -1 and the empty string becomes 0.

    Raises:
        ValueError: if ``mark`` is none of the three known marks.
    """
    known_marks = (("O", 1), ("X", -1), ("", 0))
    for symbol, score in known_marks:
        if mark == symbol:
            return score
    raise ValueError("Unexpected mark: {}".format(mark))
        
# Replace the marks in every column except "name" with their numeric score.
score_columns = [col for col in ingr_df.columns if col != "name"]
for col in score_columns:
    ingr_df[col] = ingr_df[col].map(convert_score)

In [78]:
# Longest ingredient name, used to pick a suitable column length.
longest_ingr_name = ingr_df["name"].map(len).max()
print (longest_ingr_name)

18


In [79]:
# Expose the row index as an explicit "id" column.
ingr_df = ingr_df.assign(id=ingr_df.index)

# Item-Ingredients (many to many)

In [14]:
# Lookup table: ingredient name -> primary key (the row index in ingr_df).
ingr_pkey_dict = {name: pkey for pkey, name in ingr_df["name"].items()}

In [29]:
# Flatten the item -> ingredient relation into (item id, ingredient pkey)
# pairs. Ingredient names missing from ingr_pkey_dict are collected separately
# and only their count is reported.
itemkey_ingrkey_list = list()
unknown_ingr_list = list()

for item_id, raw_ingredients in zip(item_ingr_df["id"], item_ingr_df["ingredients"]):
    # Remove duplicates while keeping the original order. list(set(...)) made
    # the pair order depend on per-process string hash randomization, so any
    # row numbering derived from this list changed from one run to the next.
    ingredients = list(dict.fromkeys(raw_ingredients.split(",")))

    for ingr in ingredients:
        pkey = ingr_pkey_dict.get(ingr)
        if pkey is None:
            unknown_ingr_list.append(ingr)
        else:
            itemkey_ingrkey_list.append((item_id, pkey))

if len(unknown_ingr_list):
    print ("Unknown ingredient list: {}".format(len(unknown_ingr_list)))

In [98]:
# Lookup table: ingredient name -> primary key (the row index in ingr_df).
ingr_pkey_dict = dict(zip(ingr_df["name"], ingr_df.index))
# item id -> list of ingredient pkeys. Ingredient names missing from the
# lookup table are collected separately and only their count is reported.
itemkey_ingrkey_dict = dict()
unknown_ingr_list = list()

for item_id, raw_ingredients in zip(item_ingr_df["id"], item_ingr_df["ingredients"]):
    # Remove duplicates while keeping the original order. list(set(...)) made
    # the order of each pkey list depend on per-process string hash
    # randomization, so the result changed from one run to the next.
    ingredients = list(dict.fromkeys(raw_ingredients.split(",")))

    for ingr in ingredients:
        pkey = ingr_pkey_dict.get(ingr)
        if pkey is None:
            unknown_ingr_list.append(ingr)
        else:
            # setdefault creates the per-item list on first use.
            itemkey_ingrkey_dict.setdefault(item_id, []).append(pkey)
if len(unknown_ingr_list):
    print ("Unknown ingredient list: {}".format(len(unknown_ingr_list)))

# Dump fixtures

In [97]:
# Turn every (item id, ingredient pkey) pair into an "api.item_ingredients"
# fixture row and write the rows to ./item_ingredients.json.
dump_item_ingredients = list()

for i, (item_id, ingredient_id) in enumerate(itemkey_ingrkey_list):
    # Rows must go into dump_item_ingredients, the list that is written below.
    # Appending to the undefined name `dump_list` raised NameError on a fresh
    # kernel, or left the written file empty if a stale `dump_list` existed.
    dump_item_ingredients.append({
        "model": "api.item_ingredients",
        "fields": {
            "id": i,
            # int() turns numpy integers into plain ints for json.dumps.
            "item_id": int(item_id),
            "ingredient_id": int(ingredient_id)
        }
    })

with open("./item_ingredients.json", 'w') as f:
    f.write(json.dumps(dump_item_ingredients, indent=4))

In [103]:
# Transposing first makes to_dict() return one {column: value} dict per
# ingredient row, keyed by the row index.
ingr_dict = ingr_df.T.to_dict()

# Wrap every row dict in an "api.ingredient" fixture record.
dump_ingr_list = [
    {"model": "api.ingredient", "fields": fields}
    for fields in ingr_dict.values()
]
ingr_payload = json.dumps(dump_ingr_list, indent=4)
with open("./ingredient.json", 'w') as out_file:
    out_file.write(ingr_payload)

In [126]:
# Build the "api.item" fixture: every item column except the raw ingredient
# string, plus the list of ingredient pkeys collected per item id.
item_dict = item_df[['category', 'gender', 'id', 'imageId', 'monthlySales',
       'name', 'price']].T.to_dict()
dump_item_list = list()
for key in item_dict:
    row_dict = item_dict[key]
    # An item with no recognised ingredient has no entry in the mapping; give
    # it an empty relation instead of aborting the dump with KeyError.
    row_dict["ingredients"] = itemkey_ingrkey_dict.get(row_dict["id"], [])
    dump_item_list.append({
        "model": "api.item",
        "fields": row_dict
    })
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 rather than left to the platform default.
with open("./item.json", 'w', encoding="utf-8") as f:
    f.write(json.dumps(dump_item_list, indent=4, ensure_ascii=False))