## Integrating `build_df()`, `make_tree()` and `dist_nodes()` 

In [148]:
import json
import pandas as pd
import numpy as np

from anytree import Node
from anytree import RenderTree, find, find_by_attr, findall

### Modifying our `build_df()` function

In the first stage, we made our buil_df() function that allows us to create a dataset according to a given threshold for the minimum amount products per category.

The second step was to create a function to take into account different aspects of the tree structure of the category products. To this end, we made make_tree() and  dist_nodes() functions, which capture the tree structure and the distance between nodes of that structure respectively. 

As the third step, we need to integrate our different functions. To do so, we add the "category" entry of product.json as a column of the data frame to be generated by our build_df() function and dismiss those entries (categories and subcategories)  below the threshold parameter. 

In [208]:
# Conditions for creating the "leaf" column
def conditions_leaf(row, number_of_categories_columns):
    if row["category_0"] == "other":
        return "other"
    for i in reversed(range(number_of_categories_columns)):
        if type(row["category_" + str(i)]) == str and row["category_" + str(i)] != 'other':
            return row["category_" + str(i)]

# Conditions for creating the "max_depth" column
def conditions_max_depth(row, number_of_categories_columns):
    if row["path"] == str:
        return 1
    return len(row["path"])


# Conditions for creating the "path" column
def conditions_path(row, number_of_categories_columns):
    path = []
    if row["category_0"] == "other":
        return "other"
    for i in range(number_of_categories_columns):
        if type(row["category_" + str(i)]) == str and row["category_" + str(i)] != 'other':
            path.append(row["category_" + str(i)])
    return path


# Get "category_" columns list
def get_category_columns(df):    
    category_columns = []
    for column in df.columns:
            if column.startswith("category_"):
                category_columns.append(column)
    return category_columns




def build_df(json_path: str, threshold: int, preprocessed_csv: str = None): 
    # Save json into dict
    with open(json_path) as f:
        products_dic = json.load(f)
    
    # Create base DataFrame
    all_cats_dict = []
    for dic in products_dic:
        new_dict = {}
        new_dict

        if preprocessed_csv == None:
            # Add name and description
            new_dict["name"] = dic["name"]
            new_dict["description"] = dic["description"]
        
        # Add category (list of dict) to be used by make_tree()
        new_dict['category'] = dic['category']

        # Add category names respecting hierarchy
        for i in range(len(dic["category"])):
            new_dict["category_" + str(i)] = dic["category"][i]["id"]
        all_cats_dict.append(new_dict)
    
    df = pd.DataFrame(all_cats_dict)
    
    if preprocessed_csv != None:
        df.insert(0, "name", pd.read_csv(preprocessed_csv)["name"])
        df.insert(1, "description", pd.read_csv(preprocessed_csv)["description"])
    
    # Get number of category columns
    number_of_categories_columns = len(get_category_columns(df))
    
    # Replace leaf category with "other" according to threshold
    for i in range(number_of_categories_columns):
        level = df["category_" + str(i)].copy()
        level_count_series = level.value_counts()
        categories_within_threshold = level_count_series[level_count_series > threshold].reset_index()["index"]
        is_category_in_threshold_and_not_nan = ~(~level.isin(categories_within_threshold) & level.notna())
        df["category_" + str(i)] = level.where(is_category_in_threshold_and_not_nan, 'other')

    # "path" column, with a list of all the categories from root to leaf
    df.insert(2,"path",df.apply(lambda row: conditions_path(row, number_of_categories_columns), axis=1))

    # "leaf" column, excluding "other" unless it's a root category
    df.insert(2,"leaf",df.apply(lambda row: conditions_leaf(row, number_of_categories_columns), axis=1))
    
    # "max_depth" column
    df.insert(3,"max_depth",df.apply(lambda row: conditions_max_depth(row, number_of_categories_columns), axis=1))
    
    # Modify category column to be used by make_tree() according to threshold. For those entries below the threshold we assign the following one-entry dictionary
    other_dict = [{'id': 'other', 'name': 'other'}]
    df['category'] = df.apply(lambda row: row.category[0: len(row.path)] if row.path != 'other' else other_dict, axis = 1)

    # Change "other" for np.nan
    df_nan = df.copy()
    for column in get_category_columns(df_nan):
        df_nan[column] = df_nan[column].apply(lambda x: np.nan if x == "other" else x)
    
    return df_nan


In [215]:
df_int = build_df("../data/products.json", 100)

In [216]:
df_int[5:20]

Unnamed: 0,name,description,leaf,max_depth,path,category,category_0,category_1,category_2,category_3,category_4,category_5,category_6
5,Duracell - D Batteries (4-Pack),Compatible with select electronic devices; D s...,pcmcat303600050001,3,"[pcmcat312300050015, pcmcat248700050021, pcmca...","[{'id': 'pcmcat312300050015', 'name': 'Connect...",pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,,,,
6,Duracell - 9V Batteries (2-Pack),Compatible with select electronic devices; alk...,pcmcat303600050001,3,"[pcmcat312300050015, pcmcat248700050021, pcmca...","[{'id': 'pcmcat312300050015', 'name': 'Connect...",pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,,,,
7,Directed Electronics - Viper Audio Glass Break...,From our expanded online assortment; compatibl...,other,5,other,"[{'id': 'other', 'name': 'other'}]",,,,,,,
8,Energizer - N Cell E90 Batteries (2-Pack),Alkaline batteries; 1.5V,pcmcat303600050001,3,"[pcmcat312300050015, pcmcat248700050021, pcmca...","[{'id': 'pcmcat312300050015', 'name': 'Connect...",pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,,,,
9,Metra - Radio Installation Dash Kit for Most 1...,From our expanded online assortment; compatibl...,pcmcat165900050033,5,"[abcat0300000, pcmcat165900050023, pcmcat33160...","[{'id': 'abcat0300000', 'name': 'Car Electroni...",abcat0300000,pcmcat165900050023,pcmcat331600050007,pcmcat165900050031,pcmcat165900050033,,
10,Metra - Radio Dash Multikit for Select GM Vehi...,From our expanded online assortment; compatibl...,pcmcat165900050033,5,"[abcat0300000, pcmcat165900050023, pcmcat33160...","[{'id': 'abcat0300000', 'name': 'Car Electroni...",abcat0300000,pcmcat165900050023,pcmcat331600050007,pcmcat165900050031,pcmcat165900050033,,
11,Metra - Wiring Harness for Select 1998-2008 Fo...,Compatible with select 1998-2008 Ford vehicles...,pcmcat165900050034,5,"[abcat0300000, pcmcat165900050023, pcmcat33160...","[{'id': 'abcat0300000', 'name': 'Car Electroni...",abcat0300000,pcmcat165900050023,pcmcat331600050007,pcmcat165900050031,pcmcat165900050034,,
12,Metra - Turbo Wire Aftermarket Radio Wire Harn...,Compatible with Honda and Acura vehicles; conn...,pcmcat165900050034,5,"[abcat0300000, pcmcat165900050023, pcmcat33160...","[{'id': 'abcat0300000', 'name': 'Car Electroni...",abcat0300000,pcmcat165900050023,pcmcat331600050007,pcmcat165900050031,pcmcat165900050034,,
13,Metra - Wiring Harness for Most 1986-1998 Hond...,Compatible with most 1986-1998 Honda Acura veh...,pcmcat165900050034,5,"[abcat0300000, pcmcat165900050023, pcmcat33160...","[{'id': 'abcat0300000', 'name': 'Car Electroni...",abcat0300000,pcmcat165900050023,pcmcat331600050007,pcmcat165900050031,pcmcat165900050034,,
14,METRA - Antenna Cable Adapter - Black,Compatible with select 1988-2005 vehicles; ada...,pcmcat165900050032,5,"[abcat0300000, pcmcat165900050023, pcmcat33160...","[{'id': 'abcat0300000', 'name': 'Car Electroni...",abcat0300000,pcmcat165900050023,pcmcat331600050007,pcmcat165900050031,pcmcat165900050032,,


### Applying `make_tree()`function

Here we apply our `make_tree()` function to the data frame created by our modified `build_df()` function:

In [217]:
def make_tree(df, df_column_cat, root_name):
  
  """Takes a data frame, a column of such dataframe and a string name
  Using anytree library generate the tree and print it, and return a dictionary 
  
  Parameters:
  df = a column data frame
  df_column_cat: a column of a data frame which values are a simple dictinary or a list of them
  root_name: str
  
  print the tree and
  Returns a dictionary with category name:str as keys and nodes as values 
  """
  #set the root node
  root = Node(root_name)
  # list of nodes to be generated by the function
  nodes = {}
  nodes[root.name]: root
  # Iteration over rows
  for index, row in df.iterrows():
  # Iteration over index of values at each row
    c = df_column_cat[index]
    for i_cat in range(len(c)):
      #cat_name = c[i_cat]['id'] 
      cat_name = c[i_cat]['id'] + ' '+ c[i_cat]['name'] # If you want to display the id along with category names uncomment this line and comment the previous one

      # Verify if the parent category exist if it does not exists appends it to the nodes list and creates the node. Else continues
      if i_cat == 0 and cat_name not in nodes:
        nodes[cat_name] = Node(cat_name, parent=find_by_attr(root, root_name))

      # Verify if the subcategory exists if it does not exists appends it to the nodes list and creates the node. Else continues  
      elif i_cat > 0 and cat_name not in nodes:
        #predecessor = c[i_cat -1]['id'] 
        predecessor = c[i_cat - 1]['id'] + ' '+ c[i_cat-1]['name'] # If you want to display the id along with category names uncomment this line and comment the previous one
        nodes[cat_name] = Node(cat_name, find_by_attr(root,predecessor))
      
      else: continue
   
  for pre, _, node in RenderTree(root):
    print("%s%s" % (pre, node.name))
    
  return nodes 

In [220]:
tree_dict = make_tree(df_int, df_int['category'], "Categories")

Categories
├── pcmcat312300050015 Connected Home & Housewares
│   ├── pcmcat248700050021 Housewares
│   │   ├── pcmcat303600050001 Household Batteries
│   │   └── pcmcat179100050006 Outdoor Living
│   │       ├── pcmcat179200050003 Grills
│   │       ├── pcmcat179200050008 Patio Furniture & Decor
│   │       │   └── pcmcat748300322875 Outdoor Seating
│   │       └── pcmcat179200050013 Outdoor Heating
│   ├── abcat0802000 Telephones & Communication
│   │   ├── abcat0811011 Telephone Accessories
│   │   └── abcat0802001 Cordless Telephones
│   │       └── pcmcat159300050002 Systems
│   ├── abcat0805000 Office Electronics
│   │   └── abcat0511001 Printers, Ink & Toner
│   │       └── pcmcat266500050030 All Printers
│   ├── pcmcat275600050000 Office & School Supplies
│   │   └── abcat0807000 Printer Ink & Toner
│   │       ├── abcat0807001 Printer Ink
│   │       ├── pcmcat335400050008 3D Printer Filament
│   │       └── abcat0807009 Toner
│   ├── abcat0809000 Office Furniture & Storage
│ 

In [221]:
def dist_nodes(node_nm1, node_nm2, cat_tree_dict):
    """Takes two nodes names categories and the dictionary generated by maketree() function and returns the distance between them using anytree libray
    input: str names of nodes
    return: int distance between nodes """

    cat_node1 = cat_tree_dict[node_nm1]
    cat_node2 = cat_tree_dict[node_nm2]

    path_list1 = list(cat_node1.path)[1:]
    path_list2 = list(cat_node2.path)[1:]
    
    len_list_path = [len(path_list1), len(path_list2)]
    
    for lp in [path_list1, path_list2]:
      if len(lp) == max(len_list_path):
        max_length_list = lp
      else:
        min_length_list = lp
    
    dist = 1
    common_path = []
        

    for nd in max_length_list:
        if nd in min_length_list:
            common_path.append(nd)
            dist +=0
        else:
            dist += 1
    return dist # (dist, common_path)

### Applying `make_tree()`function

Finally, we made use of our `dist_nodes()` using categories present in our tree generated by the `make_tree()` function. This step complete the integration of our different functions 

In [219]:
node1 = 'pcmcat179200050008 Patio Furniture & Decor'
node2 = 'pcmcat311200050005 All Unlocked Cell Phones'


print(dist_nodes(node1, node2, tree_dict))
dist_nodes(node1, node2, tree_dict) == dist_nodes(node2, node1, tree_dict)

5


True