In [1]:
import pandas as pd
import numpy as np
import json
from anytree import Node, RenderTree

In [2]:
# Source files live in the public anyoneai e-commerce data set on GitHub.
products_url = 'https://raw.githubusercontent.com/anyoneai/e-commerce-open-data-set/master/products.json'
categories_url = 'https://raw.githubusercontent.com/anyoneai/e-commerce-open-data-set/master/categories.json'

# Read each JSON document into its own DataFrame.
products = pd.read_json(products_url)
categories = pd.read_json(categories_url)

In [3]:
# Peek at the first four products, restricted to the columns of interest here.
preview_columns = ['name', 'description', 'category']
products[preview_columns].head(4)

Unnamed: 0,name,description,category
0,Duracell - AAA Batteries (4-Pack),Compatible with select electronic devices; AAA...,"[{'id': 'pcmcat312300050015', 'name': 'Connect..."
1,Duracell - AA 1.5V CopperTop Batteries (4-Pack),Long-lasting energy; DURALOCK Power Preserve t...,"[{'id': 'pcmcat312300050015', 'name': 'Connect..."
2,Duracell - AA Batteries (8-Pack),Compatible with select electronic devices; AA ...,"[{'id': 'pcmcat312300050015', 'name': 'Connect..."
3,Energizer - MAX Batteries AA (4-Pack),4-pack AA alkaline batteries; battery tester i...,"[{'id': 'pcmcat312300050015', 'name': 'Connect..."


In [4]:
# Flatten the 'category' column into a single id -> name lookup.
# Every row holds a list of dictionaries, each carrying a category 'id' and
# its 'name'; when an id shows up more than once, the name seen last wins.
prod_cat = {
    entry['id']: entry['name']
    for category_list in products.category
    for entry in category_list
}

In [5]:
# The products table only tells us which categories occur. How they relate to
# each other (root, parent and sibling relationships) has to be read from the
# categories table: 'path' shows the ancestors of every node and
# 'subCategories' shows its children.
categories.head(n=4)

Unnamed: 0,id,name,path,subCategories
0,abcat0010000,Gift Ideas,"[{'id': 'abcat0010000', 'name': 'Gift Ideas'}]","[{'id': 'pcmcat140000050035', 'name': 'Capturi..."
1,abcat0020001,Learning Toys,"[{'id': 'abcat0010000', 'name': 'Gift Ideas'},...",[]
2,abcat0020002,DVD Games,"[{'id': 'abcat0010000', 'name': 'Gift Ideas'},...",[]
3,abcat0020004,Unique Gifts,"[{'id': 'abcat0010000', 'name': 'Gift Ideas'},...",[]


In [6]:
def _category_level(idx, path):
    """Return the depth and root id of a single category.

    Parameters
    ----------
    idx
        Id of the category being described.
    path
        Non-empty sequence of dicts, each with an 'id' key. The first entry
        is treated as the root of the category's tree.

    Returns
    -------
    tuple
        ``(level, root)`` where ``root`` is ``path[0]['id']`` and ``level``
        is 0 when ``idx`` equals ``root``, otherwise ``len(path) - 1``.

    Raises
    ------
    IndexError
        If ``path`` is empty.
    """
    root = path[0]['id']
    # A category that is its own root sits at depth 0 regardless of how long
    # its path is; any other category's depth is counted from the path length.
    level = 0 if idx == root else len(path) - 1
    return level, root


# Element-wise version used on whole columns; returns (levels, roots) arrays.
# Output types are declared explicitly so that numpy neither probes the
# function with an extra call on the first element nor fails on empty input.
# Root ids are expected to be strings (they are converted to a unicode array).
levels = np.vectorize(_category_level, otypes=[int, str])

In [7]:
# Compute depth and root id for every category row, then store each result
# as its own column on the categories table.
category_levels, category_roots = levels(categories['id'], categories['path'])
categories['level'] = category_levels
categories['root'] = category_roots

In [8]:
# Show the categories table, which now carries the 'level' and 'root' columns.
categories

Unnamed: 0,id,name,path,subCategories,level,root
0,abcat0010000,Gift Ideas,"[{'id': 'abcat0010000', 'name': 'Gift Ideas'}]","[{'id': 'pcmcat140000050035', 'name': 'Capturi...",0,abcat0010000
1,abcat0020001,Learning Toys,"[{'id': 'abcat0010000', 'name': 'Gift Ideas'},...",[],3,abcat0010000
2,abcat0020002,DVD Games,"[{'id': 'abcat0010000', 'name': 'Gift Ideas'},...",[],3,abcat0010000
3,abcat0020004,Unique Gifts,"[{'id': 'abcat0010000', 'name': 'Gift Ideas'},...",[],1,abcat0010000
4,abcat0100000,TV & Home Theater,"[{'id': 'abcat0100000', 'name': 'TV & Home The...","[{'id': 'abcat0101000', 'name': 'TVs'}, {'id':...",0,abcat0100000
...,...,...,...,...,...,...
4579,pcmcat86300050019,New Sony Blu-ray Disc Player,"[{'id': 'pcmcat128500050004', 'name': 'Name Br...","[{'id': 'pcmcat86300050020', 'name': 'Movies C...",3,pcmcat128500050004
4580,pcmcat86300050020,Movies Coming to Blu-ray Disc,"[{'id': 'pcmcat128500050004', 'name': 'Name Br...",[],4,pcmcat128500050004
4581,pcmcat86500050000,Blu-ray Disc and DVD-Video Comparison,"[{'id': 'pcmcat128500050004', 'name': 'Name Br...",[],4,pcmcat128500050004
4582,pcmcat87800050001,Customer Service,"[{'id': 'pcmcat87800050001', 'name': 'Customer...","[{'id': 'pcmcat203400050001', 'name': 'Help'}]",0,pcmcat87800050001
