In [1]:
import requests
import json
import pandas as pd

# ingredient

In [3]:
# Download the ingredient table. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP error
# into an explicit exception instead of a confusing JSON/decoding failure.
response = requests.get(
    "https://grepp-programmers-challenges.s3.ap-northeast-2.amazonaws.com/2020-birdview/ingredient-data.json",
    timeout=30,
)
response.raise_for_status()
ingr_list = json.loads(response.content.decode("utf-8"))
ingr_df = pd.DataFrame.from_dict(ingr_list)

# Lower-case the ingredient names so they compare equal to the lower-cased
# ingredient strings of the items, which are normalised the same way.
ingr_df["name"] = ingr_df["name"].str.lower()

In [4]:
def convert_score(mark):
    """Translate an ingredient mark into its numeric score.

    "O" maps to 1, "X" maps to -1 and the empty string maps to 0.

    Raises:
        ValueError: if ``mark`` is none of those three values.
    """
    known_marks = (("O", 1), ("X", -1), ("", 0))
    for symbol, score in known_marks:
        if mark == symbol:
            return score
    raise ValueError("Unexpected mark: {}".format(mark))
        
# Every column except "name" is run through convert_score, replacing each
# mark with its numeric score.
score_columns = [label for label in ingr_df.columns if label != "name"]
for score_column in score_columns:
    ingr_df[score_column] = ingr_df[score_column].map(convert_score)

# Prepare the validation object (valid_df): calculate each item's score per skin type

In [5]:
# Download the item table. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() turns an HTTP error into an
# explicit exception instead of a confusing JSON/decoding failure.
response = requests.get(
    "https://grepp-programmers-challenges.s3.ap-northeast-2.amazonaws.com/2020-birdview/item-data.json",
    timeout=30,
)
response.raise_for_status()
item_list = json.loads(response.content.decode("utf-8"))
# price is cast to int up front; it is later used as a sort key and compared
# against the price returned by the server.
item_df = pd.DataFrame.from_dict(item_list).astype({"price": int})

In [6]:
# Lower-case the ingredient strings so they can be compared against the
# lower-cased ingredient names in ingr_df.
lowered_ingredients = item_df["ingredients"].str.lower()
item_df["ingredients"] = lowered_ingredients

In [7]:
# Placeholder score columns on item_df. The computed scores are collected in
# `results` below rather than written back into these columns.
item_df["oily"] = 0
item_df["sensitive"] = 0
item_df["dry"] = 0

# Build a name -> (oily, sensitive, dry) lookup once instead of scanning the
# whole ingr_df for every ingredient of every item. Names that occur more than
# once are remembered so that an ambiguous match is still rejected.
ingr_scores = {}
ambiguous_names = set()
for ingr_name, ingr_oily, ingr_sensitive, ingr_dry in ingr_df[
    ["name", "oily", "sensitive", "dry"]
].itertuples(index=False, name=None):
    if ingr_name in ingr_scores:
        ambiguous_names.add(ingr_name)
    ingr_scores[ingr_name] = (ingr_oily, ingr_sensitive, ingr_dry)

# One row per item: [id, category, price, oily score, sensitive score, dry score].
results = list()
for _, item_srs in item_df.iterrows():
    # De-duplicate so an ingredient listed twice is only counted once.
    item_ingredients = set(item_srs["ingredients"].split(","))

    item_oily_score = 0
    item_sensitive_score = 0
    item_dry_score = 0
    for ingr in item_ingredients:
        # Each ingredient must match exactly one row of ingr_df.
        if ingr not in ingr_scores or ingr in ambiguous_names:
            raise ValueError(
                "ingredient {!r} of item {} does not match exactly one row of ingr_df".format(
                    ingr, item_srs["id"]
                )
            )
        ingr_oily, ingr_sensitive, ingr_dry = ingr_scores[ingr]

        item_oily_score += ingr_oily
        item_sensitive_score += ingr_sensitive
        item_dry_score += ingr_dry

    results.append([item_srs["id"], item_srs["category"], item_srs["price"], item_oily_score, item_sensitive_score, item_dry_score])

In [8]:
# Assemble the per-item rows gathered in `results` into the validation frame.
valid_columns = ["id", "category", "price", "oily", "sensitive", "dry"]
valid_df = pd.DataFrame(results, columns=valid_columns)

# Validation: compare the responses from the Django server against valid_df

In [9]:
# For every item id and skin type, query the local server and check that
#   * the first element of the response is the requested item, and
#   * the remaining three elements are the top-3 items of the same category,
#     ranked by the skin-type score (descending) and then price (ascending).
# The first mismatch or request failure aborts the run with a ValueError.
skin_types = ["oily", "sensitive", "dry"]
item_ids = range(1, 1001)
# Derived from the loop bounds so the progress percentage stays correct if
# either of them is changed.
total_checks = len(item_ids) * len(skin_types)

loop = 0

for _id in item_ids:
    for skin_type in skin_types:
        if loop % 100 == 0:
            print("proceed: {:.2f} %".format(loop / float(total_checks) * 100.0))
        # Reset per request: otherwise the error report below would either hit
        # an unbound name or show the payload of the previous request.
        target_data = None
        try:
            response = requests.get(
                "http://127.0.0.1:8000/test/data/{}?skin_type={}".format(_id, skin_type),
                timeout=10,
            )
            response.raise_for_status()
            target_data = json.loads(response.content.decode("utf-8"))
            main_item = target_data[0]
            sub_items = target_data[1:]
            assert len(sub_items) == 3

            # validate main
            # The mask is built from valid_df itself rather than from another
            # frame whose index merely happens to line up with it.
            main_srs = valid_df[valid_df["id"] == _id].iloc[-1]
            assert main_srs["id"] == main_item["id"], "{}, {}".format(main_srs["id"], main_item["id"])
            assert main_srs["category"] == main_item["category"], "{}, {}".format(main_srs["category"], main_item["category"])
            assert main_srs["price"] == main_item["price"], "{}, {}".format(main_srs["price"], main_item["price"])

            # validate sub (recommended)
            category = main_item["category"]

            sub_df = valid_df[valid_df["category"] == category].sort_values(by=[skin_type, "price"], ascending=[False, True]).head(3)
            assert len(sub_df) == 3

            for rank, sub_dict in enumerate(sub_items):
                sub_srs = sub_df.iloc[rank]
                assert sub_srs["id"] == sub_dict["id"], "{}, {}".format(sub_srs["id"], sub_dict["id"])
                assert sub_srs[skin_type] == sub_dict["score"], "{}, {}".format(sub_srs[skin_type], sub_dict["score"])
        except Exception as e:
            print(e)
            print(_id, skin_type)
            print(target_data)
            # Chain the original exception so its traceback is not lost.
            raise ValueError(
                "validation failed for id={}, skin_type={}: {}".format(_id, skin_type, e)
            ) from e
        loop += 1
            

proceed: 0.00 %
proceed: 3.33 %
proceed: 6.67 %
proceed: 10.00 %
proceed: 13.33 %
proceed: 16.67 %
proceed: 20.00 %
proceed: 23.33 %
proceed: 26.67 %
proceed: 30.00 %
proceed: 33.33 %
proceed: 36.67 %
proceed: 40.00 %
proceed: 43.33 %
proceed: 46.67 %
proceed: 50.00 %
proceed: 53.33 %
proceed: 56.67 %
proceed: 60.00 %
proceed: 63.33 %
proceed: 66.67 %
proceed: 70.00 %
proceed: 73.33 %
proceed: 76.67 %
proceed: 80.00 %
proceed: 83.33 %
proceed: 86.67 %
proceed: 90.00 %
proceed: 93.33 %
proceed: 96.67 %


# dump valid_df into cache for unit testing

In [28]:
# Write valid_df to disk as a gzip-compressed pickle.
valid_pickle_path = "./test/valid.pickle"
valid_df.to_pickle(valid_pickle_path, compression="gzip")

In [None]:
# Bare last expression of the cell: shows valid_df as the cell's output.
valid_df