## Comparison between products.json and categories.json

In [15]:
import json
import pandas as pd

In [13]:
# Load products json into dic
with open("../data/products.json") as f:
    products_dicts = json.load(f)

# # Load categories json into dic
with open("data/categories.json") as f:
    categories_dicts = json.load(f)

### Base dataframes and auxiliary series

DataFrame with all categories as labels

In [14]:
new_dataset = []
for dic in products_dicts:
    new_dict = {}
    # Add name and description
    new_dict["name"] = dic["name"]
    new_dict["description"] = dic["description"]
    
    # Add category names respecting hierarchy
    for i in range(len(dic["category"])):
        new_dict["category_" + str(i)] = dic["category"][i]["id"]
    
    new_dataset.append(new_dict)

all_categories_df = pd.DataFrame(new_dataset)
display(all_categories_df.head())
display(all_categories_df.describe())

Unnamed: 0,name,description,category_0,category_1,category_2,category_3,category_4,category_5,category_6
0,Duracell - AAA Batteries (4-Pack),Compatible with select electronic devices; AAA...,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
1,Duracell - AA 1.5V CopperTop Batteries (4-Pack),Long-lasting energy; DURALOCK Power Preserve t...,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
2,Duracell - AA Batteries (8-Pack),Compatible with select electronic devices; AA ...,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
3,Energizer - MAX Batteries AA (4-Pack),4-pack AA alkaline batteries; battery tester i...,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
4,Duracell - C Batteries (4-Pack),Compatible with select electronic devices; C s...,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,


Unnamed: 0,name,description,category_0,category_1,category_2,category_3,category_4,category_5,category_6
count,51645,51646,51646,50891,45003,26158,5646,346,1
unique,48557,38537,62,172,627,777,202,23,1
top,Lenmar - Lithium-Ion Battery - Black,Perfect gift card? Piece of cake. All Best Buy...,abcat0900000,abcat0811002,pcmcat191200050015,pcmcat214700050000,pcmcat165900050033,abcat0511004,pcmcat223500050009
freq,31,370,9317,6176,2079,1892,739,51,1


Series with all the categories on products.json and categories.json

In [None]:
# Categories in products.json
prod_json_categories = pd.Series(dtype="object")
for i in range(5):
    prod_json_categories = pd.concat([prod_json_categories, all_categories_df["category_" + str(i)]], ignore_index=True)
prod_json_categories = prod_json_categories.dropna()

# Categories in categories.json
cat_json_categories = pd.Series(dtype="object")
for dic in categories_dicts:
    cat_json_categories = pd.concat([cat_json_categories, pd.Series(dic["id"])], ignore_index=True)

Dictionary to map ids with category names

In [None]:
mapping_dict_prods = {}
for dic in products_dicts:
    for cat in dic["category"]:
        mapping_dict_prods[cat["id"]] = cat["name"]

### Relation between products.json and categories.json

Missing categories

In [None]:
# Get the categories from products.json that are not as a category in categories.json
# There could not be at all on categories.json, or be only as a subcategory
missing_categories = prod_json_categories[~prod_json_categories.isin(cat_json_categories)]

Products not tracked

In [None]:
# Products with a category not tracked on categories.json
products_not_tracked = pd.Series(dtype="object")
for i in range(6):
    products_not_tracked = pd.concat([products_not_tracked, missing_categories[~missing_categories.isin(all_categories_df["category_" + str(i)])]])

Results

In [None]:
print("Total categories on products.json:", prod_json_categories.nunique())
print("Total categories on categories.json:", cat_json_categories.nunique())
print("Categories in products.json not in categories.json:", missing_categories.nunique())
print("Products not tracked on categories.json:", len(products_not_tracked), f"({round(len(products_not_tracked) / len(all_categories_df) * 100,2)}%)")

Total categories on products.json: 1833
Total categories on categories.json: 4584
Categories in products.json not in categories.json: 239
Products not tracked on categories.json: 6547 (12.68%)


### Create categories database based on products categories