In [1]:
import requests
import json
import pandas as pd
import numpy as np

In [2]:
# Base URL of the 'craving' Solr core on the local machine.
# NOTE(review): not referenced in the cells visible here; presumably used by later query cells.
URL = 'http://localhost:8983/solr/craving/'

## Category Information Analysis

In [3]:
# Load the product-category table: an item_id column plus one column per
# category level (category_level_1 ... category_level_7, see the preview below).
df_category = pd.read_csv('product_categories_v2.csv')
# Preview the first rows to check the columns and how missing levels look.
df_category.head()

Unnamed: 0,item_id,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6,category_level_7
0,61b0b8fc4ce3c40008bf94b5,Food / Beverages,Frozen Foods,Meals (Frozen),,,,
1,61acc46e244643000aabe8f1,Food / Beverages,Dairy & Egg Products,Cheeses,,,,
2,619e4484abc5600008f0d530,categoryNotFound,,,,,,
3,61a238616296b40008d37ee2,Food / Beverages,Produce,Prepared Fruits / Vegetables,,,,
4,619e43edabc5600008f0d2b4,Food / Beverages,Grocery,Condiments,Sauces / Dressings / Dips,Sauces (Shelf-Stable),,


In [4]:
# Build the frame of category paths. The item id plays no role in the
# hierarchy, so it is dropped; `df_category` itself is left untouched.
df = df_category.drop(columns='item_id')
# Strip surrounding whitespace from the category names. Only non-numeric
# columns are processed: the .str accessor raises AttributeError on numeric
# columns, and a level column that is entirely empty is read as float64.
for col in df.select_dtypes(exclude='number').columns:
    df[col] = df[col].str.strip()
# De-duplicate after stripping so that paths which differed only by
# leading/trailing whitespace collapse into a single row.
df = df.drop_duplicates()

In [5]:
# Build the category hierarchy as
#   {level_name: {parent_category: [child_categories]}}
# where level_name is 'category_level_N' and the children are the distinct
# values found in column N+1 on the rows whose column N equals the parent.
# Levels 1-6 get an entry; names that only occur at level 7 therefore show up
# as children but never as keys.
cat_levels = df.columns
cat_tree = {}
for level in range(1, 7):
    level_str = 'category_level_' + str(level)
    child_str = 'category_level_' + str(level + 1)
    dict_cats = {}
    # dropna() removes missing parents by value; a membership test against
    # [np.nan] only matches when the very same NaN object is encountered.
    for parent in df[level_str].dropna().unique():
        if parent == 'categoryNotFound':
            # placeholder for items without a category, not a real category
            continue
        children = df.loc[df[level_str] == parent, child_str]
        # Rows that stop at this level have NaN in the child column; leaving it
        # in would list nan as a sub-category, so leaves get an empty list.
        dict_cats[parent] = children.dropna().unique().tolist()
    cat_tree[level_str] = dict_cats

# Display the top-level categories as a sanity check.
list(cat_tree['category_level_1'].keys())

['Food / Beverages', 'Beer / Wine / Spirits']

In [6]:
def get_subcats(parent):
    '''
    Look up the direct sub-categories of a category of any level.

    The levels of the global `cat_tree` are searched in order and the first
    level that has `parent` as a key wins.

    Returns a tuple `(sub_categories, level_name)` where `sub_categories` is
    a new list and `level_name` is the `cat_tree` key of the level in which
    `parent` was found. If no level contains `parent`, the message
    'Category not found' is printed and None is returned.
    '''
    found_level = next(
        (name for name, children_by_parent in cat_tree.items()
         if parent in children_by_parent),
        None,
    )
    if found_level is None:
        print('Category not found')
        return None
    # copy so callers cannot mutate the list stored in cat_tree
    return list(cat_tree[found_level][parent]), found_level
    
# Example lookup: the direct sub-categories of 'Grocery' together with the
# level at which 'Grocery' itself sits.
get_subcats('Grocery')

(['Condiments',
  'Snack Foods',
  'Nuts & Seeds',
  'Seasonings / Preservatives / Extracts',
  'Breads / Cereals / Grains',
  'Prepared & Preserved Foods',
  'Beans, Peas, & Lentils',
  'Vegetables (Shelf Stable)',
  'Fats & Oils',
  'Dietary Supplement Foods',
  'Confectionery / Desserts / Sweeteners',
  'Meat / Poultry / Seafood',
  'Fruits',
  'Baking & Cooking',
  'Pasta / Noodles',
  'Non-Supplement Nutritional Foods'],
 'category_level_2')

In [7]:
def trace_cats(child):
    '''
    Given a category of any level, trace all its parent categories.

    The levels of the global `cat_tree` are searched in order; the first
    level that has `child` as a key is taken as its level. From there the
    function climbs one level at a time, picking at each level the first
    parent whose child list contains the current category.

    Parameters
    ----------
    child : str
        Category name. It must be a key of some level of `cat_tree`.

    Returns
    -------
    tuple (cat_str, lst_cats) or None
        `lst_cats` is the path from the top-most ancestor found down to
        `child`; `cat_str` is the same path joined with ' > '.
        None is returned when `child` is not a key of any level.
    '''
    for level_str, dict_cat in cat_tree.items():
        if child not in dict_cat:
            continue

        # Parse the whole numeric suffix, not just the last character.
        level = int(level_str.rsplit('_', 1)[-1])
        cat_str = child
        lst_cats = [child]
        current = child
        while level >= 2:
            # find the parent of `current` one level up
            level -= 1
            dict_cat_prev = cat_tree['category_level_' + str(level)]
            # Take only the FIRST matching parent. Scanning on after a match
            # would compare the freshly found parent against the remaining
            # child lists of the same level and could prepend a second,
            # bogus ancestor when a name is reused across levels.
            parent = next(
                (name for name, cats in dict_cat_prev.items() if current in cats),
                None,
            )
            if parent is None:
                # Broken chain: stop instead of matching the same name
                # against unrelated, higher levels.
                break
            cat_str = parent + ' > ' + cat_str
            lst_cats = [parent] + lst_cats
            current = parent

        return cat_str, lst_cats

    return None

# Example: trace a deep category back to its top-level ancestor and print the
# ' > '-joined path.
cat_str, lst_cats = trace_cats('Sauces (Shelf-Stable)')
print(cat_str)

Food / Beverages > Grocery > Condiments > Sauces / Dressings / Dips > Sauces (Shelf-Stable)


In [8]:
# List the direct sub-categories of 'Beverages' and the level it sits at.
get_subcats('Beverages')

(['Coffee',
  'Water',
  'Soda / Flavored Drinks',
  'Fruit & Vegetable Drinks',
  'Drink Mixes & Flavorings',
  'Dairy-Based Drinks (Shelf-Stable)',
  'Tea',
  'Dairy Substitute Based Drinks (Shelf Stable)',
  'Energy Drinks',
  'Coffee / Tea Variety Packs',
  'Sports Drinks',
  'Cocktail Mixers',
  'Fermented Beverages',
  'Infant & Toddler Beverages'],
 'category_level_2')

### Good categories: 
* Grocery: Snack Foods, Nuts & Seeds, Dietary Supplement Foods, Non-Supplement Nutritional Foods
* Beverages: Coffee, Water, Soda / Flavored Drinks, Fruit & Vegetable Drinks, Drink Mixes & Flavorings, Dairy-Based Drinks (Shelf-Stable), Tea, Dairy Substitute Based Drinks (Shelf Stable), Energy Drinks, Coffee / Tea Variety Packs, Sports Drinks