# Labels reduction analysis

In [1]:
import sys
sys.path.append("/home/app/src/ecommerce-predictor/")

In [2]:
from scripts import build_df, decode_id
import pandas as pd
import numpy as np

### General analysis

Base function to create DataFrame for analyzing the impact of different thresholds on the total of categories on the dataset

In [3]:
def create_label_reduction_df(json_path: str, thresholds: list):
    # Initialize dataframe
    label_reduction_df = pd.DataFrame(columns=["threshold", "categories_in_df", "percentage_of_categories_used", "leafs_value_counts", "total_other_as_leaf", "percentage_other_as_leaf", "avg_category_level", "max_category_level"])
    
    for threshold in thresholds:
        df = build_df.build_df(json_path, threshold)
            
        # Save data for dataframe
        TOT_CATEGORIES = 1864
        TOT_PRODUCTS = 51646 
        categories_in_df = df[build_df.get_category_columns(df)].nunique().sum()
        percentage_of_categories_used = round(categories_in_df / TOT_CATEGORIES  * 100, 2)
        avg_category_level = df["max_depth"].mean()
        max_category_level = np.amax(df["max_depth"].unique())
        leafs_value_counts = df["leaf"].nunique()
        total_other_as_leaf = (df["leaf"] == "other").sum()
        percentage_other_as_leaf = round(total_other_as_leaf / TOT_PRODUCTS * 100, 2)

        # Add row to label_reduction_df
        label_reduction_df.loc[len(label_reduction_df.index)] = [threshold, categories_in_df, percentage_of_categories_used, leafs_value_counts, total_other_as_leaf, percentage_other_as_leaf, avg_category_level, max_category_level]
    return label_reduction_df

Analysis

In [4]:
label_reduction_df = create_label_reduction_df("../data/products.json", [1,2,5,10,20,30,50,75,100,150,200,300,500])
label_reduction_df

Unnamed: 0,threshold,categories_in_df,percentage_of_categories_used,leafs_value_counts,total_other_as_leaf,percentage_other_as_leaf,avg_category_level,max_category_level
0,1.0,1864.0,100.0,1667.0,0.0,0.0,3.479282,7.0
1,2.0,1588.0,85.19,1409.0,84.0,0.16,3.47237,6.0
2,5.0,1266.0,67.92,1103.0,306.0,0.59,3.449173,6.0
3,10.0,1012.0,54.29,857.0,716.0,1.39,3.404794,6.0
4,20.0,779.0,41.79,637.0,1411.0,2.73,3.320819,6.0
5,30.0,632.0,33.91,511.0,1820.0,3.52,3.239612,6.0
6,50.0,446.0,23.93,353.0,2141.0,4.15,3.094005,6.0
7,75.0,336.0,18.03,268.0,2161.0,4.18,2.971285,5.0
8,100.0,268.0,14.38,213.0,2780.0,5.38,2.837625,5.0
9,150.0,190.0,10.19,152.0,2743.0,5.31,2.667912,5.0


### Root classes

In [7]:
for i in [1, 10, 100]:
    df = build_df.build_df("../data/products.json", i)
    tot_root_cats = df["category_0"].nunique()
    df2 = df["category_0"].value_counts().reset_index()
    df2["index"] = df2["index"].apply(lambda x: decode_id.decode_id(x))
    print("With threshold", i, "there are", tot_root_cats, "root categories")
print("Top 15 root categories and its value count")    
display(df2.rename(columns={"index":"class", "category_0":"value_count"}))

With threshold 1 there are 62 root categories
With threshold 10 there are 22 root categories
With threshold 100 there are 16 root categories
Top 15 root categories and its value count


Unnamed: 0,class,value_count
0,Appliances,8888
1,Cell Phones,6778
2,Computers & Tablets,5871
3,Video Games,5688
4,Connected Home & Housewares,4542
5,Audio,3540
6,Cameras & Camcorders,3206
7,Musical Instruments,2973
8,Car Electronics & GPS,2469
9,TV & Home Theater,2056


### Classes exploration in leafs

In [6]:
for i in [1,2,5,10,20,30,50,75,100,150,200,300,500]:
    df  = build_df.build_df(json_path='../data/products.json', threshold=i)
    df = df["leaf"].value_counts().head(10).reset_index()
    df["index"] = df["index"].apply(lambda x: decode_id.decode_id(x))
    print("Top 10 leafs with threshold =", i)
    display(df.rename(columns={"index":"class", "leaf":"value_count"}))

Top 10 leafs with threshold = 1


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,Cell Phone Cases & Clips,1845
2,iPhone Cases & Clips,1471
3,Dash Installation Kits,739
4,All Refrigerators,689
5,Bluetooth & Wireless Speakers,637
6,Sheet Music,621
7,"Cases, Covers & Keyboard Folios",508
8,Cookware,486
9,Printer Ink,476


Top 10 leafs with threshold = 2


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,Cell Phone Cases & Clips,1845
2,iPhone Cases & Clips,1471
3,Dash Installation Kits,739
4,All Refrigerators,689
5,Bluetooth & Wireless Speakers,638
6,Sheet Music,621
7,"Cases, Covers & Keyboard Folios",508
8,Cookware,486
9,Printer Ink,476


Top 10 leafs with threshold = 5


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,Cell Phone Cases & Clips,1847
2,iPhone Cases & Clips,1471
3,Dash Installation Kits,739
4,All Refrigerators,689
5,Bluetooth & Wireless Speakers,642
6,Sheet Music,621
7,"Cases, Covers & Keyboard Folios",508
8,Cookware,486
9,Printer Ink,485


Top 10 leafs with threshold = 10


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,Cell Phone Cases & Clips,1847
2,iPhone Cases & Clips,1476
3,Dash Installation Kits,739
4,other,716
5,All Refrigerators,689
6,Bluetooth & Wireless Speakers,642
7,Sheet Music,621
8,"Cases, Covers & Keyboard Folios",508
9,Cookware,486


Top 10 leafs with threshold = 20


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,Cell Phone Cases & Clips,1847
2,iPhone Cases & Clips,1476
3,other,1411
4,Dash Installation Kits,739
5,All Refrigerators,689
6,Bluetooth & Wireless Speakers,661
7,Sheet Music,621
8,"Cases, Covers & Keyboard Folios",525
9,Printer Ink,497


Top 10 leafs with threshold = 30


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,Cell Phone Cases & Clips,1847
2,other,1820
3,iPhone Cases & Clips,1476
4,Dash Installation Kits,739
5,All Refrigerators,689
6,Bluetooth & Wireless Speakers,661
7,Sheet Music,621
8,"Cases, Covers & Keyboard Folios",525
9,Printer Ink,497


Top 10 leafs with threshold = 50


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,other,2141
2,Cell Phone Cases & Clips,1847
3,iPhone Cases & Clips,1552
4,Dash Installation Kits,739
5,Bluetooth & Wireless Speakers,708
6,All Refrigerators,689
7,Sheet Music,621
8,"Cases, Covers & Keyboard Folios",557
9,Printer Ink,497


Top 10 leafs with threshold = 75


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,other,2161
2,Cell Phone Cases & Clips,1847
3,iPhone Cases & Clips,1552
4,Dash Installation Kits,739
5,Bluetooth & Wireless Speakers,708
6,All Refrigerators,689
7,Sheet Music,621
8,"Cases, Covers & Keyboard Folios",557
9,Printer Ink,497


Top 10 leafs with threshold = 100


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,other,2780
2,Cell Phone Cases & Clips,1847
3,iPhone Cases & Clips,1552
4,Dash Installation Kits,739
5,Bluetooth & Wireless Speakers,708
6,All Refrigerators,689
7,Sheet Music,621
8,"Cases, Covers & Keyboard Folios",557
9,Printer Ink,497


Top 10 leafs with threshold = 150


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,other,2743
2,Cell Phone Cases & Clips,1847
3,iPhone Cases & Clips,1678
4,Video Games,753
5,Small Kitchen Appliances,740
6,Dash Installation Kits,739
7,Bluetooth & Wireless Speakers,708
8,All Refrigerators,689
9,Sheet Music,621


Top 10 leafs with threshold = 200


Unnamed: 0,class,value_count
0,Pre-Owned Games,3556
1,other,3515
2,Cell Phone Cases & Clips,1847
3,iPhone Cases & Clips,1678
4,Small Kitchen Appliances,1085
5,Video Games,930
6,Musical Instruments,927
7,"Heating, Cooling & Air Quality",820
8,"Health, Fitness & Beauty",743
9,Dash Installation Kits,739


Top 10 leafs with threshold = 300


Unnamed: 0,class,value_count
0,other,4224
1,Pre-Owned Games,3556
2,iPhone Cases & Clips,1892
3,Cell Phone Cases & Clips,1847
4,Video Games,1625
5,Musical Instruments,1204
6,Small Kitchen Appliances,1085
7,"Heating, Cooling & Air Quality",1064
8,"Cases, Covers & Keyboard Folios",821
9,Speakers,788


Top 10 leafs with threshold = 500


Unnamed: 0,class,value_count
0,other,3895
1,Pre-Owned Games,3556
2,Video Games,2281
3,Digital Camera Accessories,2204
4,Computers & Tablets,2049
5,iPhone Cases & Clips,1892
6,Cell Phone Cases & Clips,1847
7,Connected Home & Housewares,1779
8,Small Kitchen Appliances,1712
9,Musical Instruments,1656
