In [10]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime

## Loading Data

In [11]:
# Load the pre-processed, gzip-compressed product catalogue into a DataFrame.
df = pd.read_csv('../shared/data/amz_products_small_pre_processed.csv.gz', compression='gzip')

In [12]:
# Display the frame; Jupyter truncates it to the first and last rows.
df

Unnamed: 0,also_buy,also_view,asin,brand,description,feature,image,price,title,main_cat
0,,,B00ADZ3WUM,NSI,('Bumpersticker: A day without sunshine is lik...,('Official Licensed Die-Cut Sticker Designed b...,,4.68,"NSI - A Day Without Sunshine is Like, Well, Ni...",Automotive
1,,,B005VII5IU,General Motors,('This is the official Genuine General Motors ...,('This is the official Genuine General Motors ...,,213.16,Genuine GM Parts 10341533 Rear Bumper Valance ...,Automotive
2,,,B001QTEKVO,JLM,('HID Xenon lights are designed to be at least...,"('Will run for approx 2500 hours', 'Produces 2...",('https://images-na.ssl-images-amazon.com/imag...,,JLM HID Conversion Kit H13 (9008) Dual Tube B...,Automotive
3,,"('B007KLMLRM', 'B007KLMNNE', 'B0085FOJ90', 'B0...",B00HWI43Q0,Spec-D Tuning,('Brand new in original packaging. Exactly the...,('Features 1 pair of Red & Smoked lens Tail Li...,('https://images-na.ssl-images-amazon.com/imag...,8.63,Spec-D Tuning LT-E362RG-F2-APC New 3D Light Ba...,Automotive
4,,,B0050VHRFK,Oracle Lighting,('Oracle Flush LED switches combines styling a...,"('Heavy stainless steel construction', 'LED il...",('https://images-na.ssl-images-amazon.com/imag...,19.95,Oracle Lighting LSGQ16FW White LED On/Off Flus...,Automotive
...,...,...,...,...,...,...,...,...,...,...
1229093,,,B00CLBJS68,Filemaker,"('Filemaker Pro 12 Np Edu Train Series Dvd',)",,,,FileMaker Pro - ( v. 12 ) - complete packag,All Electronics
1229094,,"('B07FPGPVH5',)",B00CNLGPMQ,Sage,"('<div class=""aplus""> <div class=""three-fourt...","('Organize your finances', 'Pay bills and get ...",,299.95,Sage 50 Pro Accounting 2014 US Edition,All Electronics
1229095,,,B00EVOU7FO,McAfee,"('Mcafee, Inc. Mis14edv1raa Mcafee Internet Se...","('A quality product by MCAFEE, INC.', 'A quali...",,31.80,MFE INTERNET SECURITY 1PC 2014,All Electronics
1229096,,,B00MV94V76,YBS,('This Package includes ONLY the License Upgra...,('Nuance Dragon Naturally Speaking Premium 13....,,,YBS Nuance Dragon Naturally Speaking Premium 1...,All Electronics


In [13]:
# Number of distinct target categories in the `main_cat` column.
df['main_cat'].nunique()

22

### Brand Statistics

Knowing a product's brand can be informative: we can build a map from each brand to the probability that its products belong to each main category.

In [14]:
import sys
# Add the parent directory to the import path so the `shared` package
# (imported in the next cell) can be resolved from this notebook's folder.
sys.path.append('../')

In [15]:
from shared.model.data.features.engineering.brand import BrandMapCategoryProbabilities

In [16]:
# Instantiate the brand -> category probability helper on the full product frame.
# NOTE(review): constructor behaviour is defined in shared.model.data.features.engineering.brand
# and is not visible in this notebook.
brand_engineering = BrandMapCategoryProbabilities(df)

In [17]:
# Slow step: the recorded run took about 8m41s for 8524 iterations (see progress bar).
brand_df = brand_engineering.get_brand_df()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8524/8524 [08:41<00:00, 16.35it/s]


In [18]:
# Peek at the first three rows: one row per brand, one column per main category.
brand_df.head(3)

Unnamed: 0,Books,Grocery,Amazon Home,Digital Music,Movies & TV,Sports & Outdoors,Tools & Home Improvement,Amazon Fashion,Toys & Games,Pet Supplies,...,Office Products,Cell Phones & Accessories,Industrial & Scientific,Computers,All Electronics,Musical Instruments,Home Audio & Theater,Camera & Photo,Health & Personal Care,Video Games
Other,0.12,0.07,0.07,0.07,0.06,0.06,0.05,0.05,0.05,0.04,...,0.04,0.04,0.04,0.03,0.02,0.02,0.02,0.02,0.02,0.01
Generic,0.0,0.0,0.05,0.0,0.0,0.02,0.03,0.01,0.03,0.02,...,0.07,0.19,0.02,0.23,0.11,0.02,0.09,0.04,0.01,0.0
uxcell,0.0,0.0,0.03,0.0,0.0,0.01,0.22,0.0,0.0,0.08,...,0.05,0.01,0.35,0.04,0.09,0.01,0.03,0.01,0.01,0.0


We can check whether each category has at least one brand whose probability for that category is 100%.

In [19]:
# Per category: is there at least one brand whose probability equals exactly 1?
brand_df.eq(1).any()

Books                        True
Grocery                      True
Amazon Home                  True
Digital Music                True
Movies & TV                  True
Sports & Outdoors            True
Tools & Home Improvement     True
Amazon Fashion               True
Toys & Games                 True
Pet Supplies                 True
Arts, Crafts & Sewing        True
Automotive                   True
Office Products              True
Cell Phones & Accessories    True
Industrial & Scientific      True
Computers                    True
All Electronics              True
Musical Instruments          True
Home Audio & Theater         True
Camera & Photo               True
Health & Personal Care       True
Video Games                  True
dtype: bool

In [20]:
# Brands that have at least one category probability above 0.95.
# `axis` is passed by keyword: pandas 2.x accepts it only as a keyword argument,
# so the positional form `.any(1)` raises a TypeError there.
brand_df[(brand_df > .95).any(axis=1)]

Unnamed: 0,Books,Grocery,Amazon Home,Digital Music,Movies & TV,Sports & Outdoors,Tools & Home Improvement,Amazon Fashion,Toys & Games,Pet Supplies,...,Office Products,Cell Phones & Accessories,Industrial & Scientific,Computers,All Electronics,Musical Instruments,Home Audio & Theater,Camera & Photo,Health & Personal Care,Video Games
Various Artists,0.0,0.0,0.0,0.98,0.02,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACDelco,0.0,0.0,0.0,0.00,0.00,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Small Parts,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yu-Gi-Oh!,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dorman,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Buster Crabbe,0.0,0.0,0.0,0.00,1.00,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gamblin,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Reunion Blues,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Muttluks,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


So we can see that a substantial share of the brands **always** (or almost always, i.e. with probability above 0.95) correspond to a single category.

We have used a threshold of at least 20 observations per brand. Lowering it would make it easier to identify such brands, but we would be less confident in their estimated probabilities.

We could consider the business impact of this: for these brands the category could be assigned directly from the brand, which could maintain the overall model metrics (accuracy, F1 score, ...) while improving speed. Using a probability threshold of **95%**:

In [21]:
# Fraction of products whose brand has some category probability above 0.95.
# `axis=1` is passed by keyword because pandas 2.x rejects the positional form `.any(1)`.
df[df.brand.isin(brand_df[(brand_df > .95).any(axis=1)].index)].shape[0] / len(df)

0.18851466685325335

So roughly 19% (0.1885) of the products belong to such a brand, meaning about 19% of the model's classifications could be affected by this.

In [22]:
# Presumably persists the computed brand table for later reuse; the destination and
# format are defined inside BrandMapCategoryProbabilities - TODO confirm.
brand_engineering.save_brand_df()

#### Brand Feature

The feature is essentially a lookup: given a brand name, we obtain its vector of category probabilities.

In [24]:
from shared.model.data.features.engineering.brand import Brand

In [26]:
# Look up the category-probability feature for the brand named 'Other'.
brand_feature_probs = Brand(brand_df=brand_df).get_feature(brand_name='Other')

In [27]:
# Show the feature: an array of 22 values, one per main category.
brand_feature_probs

array([0.12, 0.07, 0.07, 0.07, 0.06, 0.06, 0.05, 0.05, 0.05, 0.04, 0.04,
       0.04, 0.04, 0.04, 0.04, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01])