In [52]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the install log out of the notebook output.
%pip install -q eli5



In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

In [3]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [4]:
ls data

men_shoes.csv


In [5]:
# Load the men's shoes dataset. low_memory=False makes pandas parse the file in
# one pass instead of in chunks, which avoids mixed-dtype column inference.
df=pd.read_csv('data/men_shoes.csv', low_memory=False)
# Only the last expression of a cell is rendered, so the shape below is
# evaluated but not shown; the column index is what gets displayed.
df.shape
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension', 'ean', 'features',
       'flavors', 'imageurls', 'isbn', 'keys', 'manufacturer',
       'manufacturernumber', 'merchants', 'name', 'prices_amountmin',
       'prices_amountmax', 'prices_availability', 'prices_color',
       'prices_condition', 'prices_count', 'prices_currency',
       'prices_dateadded', 'prices_dateseen', 'prices_flavor', 'prices_issale',
       'prices_merchant', 'prices_offer', 'prices_returnpolicy',
       'prices_shipping', 'prices_size', 'prices_source', 'prices_sourceurls',
       'prices_warranty', 'quantities', 'reviews', 'sizes', 'skus',
       'sourceurls', 'upc', 'vin', 'websiteids', 'weight'],
      dtype='object')

In [0]:
def run_model(feats, model=DecisionTreeRegressor(max_depth=5)):
  """Cross-validate `model` on a subset of columns of the global `df`.

  Args:
    feats: list of column names in `df` to use as the feature matrix.
    model: estimator handed to `cross_val_score`; defaults to a
      depth-5 decision tree regressor.

  Returns:
    (mean, std) of the cross-validation scores. The scoring is
    'neg_mean_absolute_error', so values are negative and closer to
    zero is better. The target is always the 'prices_amountmin' column.
  """
  feature_matrix = df[feats].values
  target = df['prices_amountmin'].values

  cv_scores = cross_val_score(
    model,
    feature_matrix,
    target,
    scoring='neg_mean_absolute_error',
  )
  return cv_scores.mean(), cv_scores.std()

In [44]:
# Baseline: encode brand as integer codes and score the default decision tree.
# NOTE(review): factorize() labels missing values with -1 by default — confirm
# that is acceptable as a feature value.
# NOTE(review): the recorded output equals the output of the lower-cased variant
# further down, and this cell's execution count is out of order; it appears to
# have been re-run after `brand` was lower-cased — confirm on a fresh kernel.
df['brand_cat']=df['brand'].factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [18]:
# Same single feature, scored with a seeded 100-tree random forest (depth capped
# at 5) instead of the default decision tree.
model=RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.47223572384038, 4.328288468270897)

In [19]:
# Re-encode brand case-insensitively so differently-cased spellings of one brand
# share a single code.
# NOTE(review): str(x) turns a missing brand into the string 'nan', so missing
# values receive a regular category code here rather than factorize()'s -1.
df['brand_cat']=df['brand'].map(lambda x: str(x).lower()).factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [20]:
# Re-run the seeded random forest, now on the case-insensitive brand codes.
model=RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [22]:
df.features.head(7).values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [0]:
def parse_features(x):
  """Evaluate one raw `features` cell into the Python literal it encodes.

  In this dataset the cell text is a list of {"key": ..., "value": [...]}
  records, so the result is a list of dicts. A cell whose string form is
  'nan' (a missing value) yields an empty list.
  """
  is_missing = str(x) == 'nan'
  if is_missing:
    return []
  # Strip the escaped-quote sequence before handing the text to literal_eval.
  cleaned = x.replace('\\\\"', '')
  return literal_eval(cleaned)

# Apply the parser row by row and keep the result in a new 'features_parsed' column.
df['features_parsed']=df['features'].map(parse_features)

In [8]:
df['features_parsed'].head()

0    [{'key': 'Gender', 'value': ['Men']}, {'key': ...
1    [{'key': 'Gender', 'value': ['Men']}, {'key': ...
2    [{'key': 'Gender', 'value': ['Men']}, {'key': ...
3    [{'key': 'Gender', 'value': ['Men']}, {'key': ...
4    [{'key': 'Gender', 'value': ['Men']}, {'key': ...
Name: features_parsed, dtype: object

In [0]:
def parse_features(x):
  """Flatten one raw `features` cell into a ``{key: first value}`` dict.

  The cell is expected to hold the text of a list of
  ``{"key": ..., "value": [...]}`` records. Keys and values are lower-cased
  and stripped so that spelling variants collapse together. Only the first
  entry of each record's value list is kept; when a key occurs more than
  once, the last occurrence wins.

  Args:
    x: raw cell content. Anything whose string form is 'nan' is treated
      as a missing cell.

  Returns:
    dict mapping normalised key -> normalised first value. Empty for a
    missing cell. Records whose value list is empty are skipped.

  Raises:
    ValueError, SyntaxError: propagated from ``literal_eval`` when the
      cell text is not a valid Python literal.
  """
  output_dict={}
  if str(x)=='nan' : return output_dict
  features=literal_eval(x.replace('\\\\"',''))
  for item in features:
    values = item['value']
    # An empty value list has no first element; skip the record instead of
    # failing with IndexError.
    if not values:
      continue
    key = item['key'].lower().strip()
    # str() lets non-string values (e.g. numbers) be normalised as well.
    output_dict[key] = str(values[0]).lower().strip()
  return output_dict

# Recompute 'features_parsed' with the dict-returning parser defined in this
# cell; this replaces the list-valued column produced by the earlier parser.
df['features_parsed']=df['features'].map(parse_features)

In [34]:
df['features_parsed'].head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [19]:
# Collect the distinct feature keys seen across all parsed rows.
keys = set()
for parsed_row in df['features_parsed']:
  # Iterating a dict yields its keys.
  keys.update(parsed_row)
len(keys)

476

In [23]:
def get_name_feat(key):
  """Return the DataFrame column name used for the parsed feature `key`.

  The name is the feature key prefixed with 'feat_'.
  """
  prefix = 'feat_'
  return prefix + key


# Create one `feat_<key>` column per distinct key; rows whose parsed dict lacks
# the key get NaN.
for key in tqdm_notebook(keys):
  column_name = get_name_feat(key)
  df[column_name] = df.features_parsed.map(lambda feats: feats.get(key, np.nan))

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))

key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key
key


In [24]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_product in inches (l x w x h)',
       'feat_country of origin components:', 'feat_diameter', 'feat_shape',
       'feat_antiscratch lens coating', 'feat_frame type', 'feat_watch shape',
       'feat_dial color', 'feat_fit:', 'feat_jewelry setting'],
      dtype='object', length=525)

In [31]:
# Percentage of rows that carry a value for the 'athlete' feature.
athlete_present = df['feat_athlete'].notnull()
len(df[athlete_present]) / len(df) * 100

0.0437636761487965

In [25]:
df.shape

(18280, 525)

In [0]:
# Share (in percent) of rows that carry each parsed feature key.
# Series.count() tallies the non-null entries of one column directly, which
# avoids building a filtered copy of the whole wide frame for every key.
# int() keeps the values plain Python floats, as before.
n_rows = df.shape[0]
keys_stat = {
  key: int(df[get_name_feat(key)].count()) / n_rows * 100
  for key in keys
}

In [33]:
keys_stat

{'100% adidas authentic': 0.005470459518599562,
 'accessory type': 0.1422319474835886,
 'adidas': 0.005470459518599562,
 'adjustable': 0.34463894967177244,
 'age': 0.6400437636761488,
 'age end': 0.4431072210065646,
 'age gender group': 0.12582056892778995,
 'age group': 27.64770240700219,
 'age range': 0.5470459518599562,
 'age segment': 0.1422319474835886,
 'age start': 0.4431072210065646,
 'airport friendly': 0.005470459518599562,
 'alarm': 0.23522975929978115,
 'amazonbestsellersrank': 0.06017505470459519,
 'animal type': 0.005470459518599562,
 'antiscratch lens coating': 0.005470459518599562,
 'applicable': 0.02188183807439825,
 'arm': 0.6236323851203501,
 'article': 0.005470459518599562,
 'assembled in country of origin': 2.199124726477024,
 'assembled product dimensions (l x w x h)': 10.300875273522976,
 'assembled product weight': 1.5536105032822756,
 'athlete': 0.0437636761487965,
 'atpv arc rating': 0.005470459518599562,
 'attachment': 0.03282275711159737,
 'audience': 0.2352

In [36]:
{k:v for k,v in keys_stat.items() if v>30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
# Integer-encode the selected parsed features: the five keys present in more
# than 30% of rows, plus sport and style. Each code column is named after its
# source column with a '_cat' suffix.
for feat_col in (
  'feat_brand',
  'feat_color',
  'feat_gender',
  'feat_manufacturer part number',
  'feat_material',
  'feat_sport',
  'feat_style',
):
  df[feat_col + '_cat'] = df[feat_col].factorize()[0]

In [39]:
# Lower-case the raw brand column so it can be compared with the brand parsed
# out of `features`, which parse_features already lower-cases.
# NOTE(review): this overwrites df['brand'] in place, and str(x) turns missing
# brands into the string 'nan' — confirm both are intended.
df['brand']=df['brand'].map(lambda x: str(x).lower())
# Shape of the subset of rows where the two brand sources agree.
df[df.brand==df.feat_brand].shape

(8846, 526)

In [0]:
# Seed the forest like the other RandomForestRegressor instances in this
# notebook so the cross-validated score is reproducible between executions.
model=RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
# Feature set under evaluation. Other subsets that were tried: one that added
# the color and manufacturer-part-number codes, and a smaller one without the
# style and sport codes.
feats=['brand_cat','feat_brand_cat','feat_gender_cat','feat_material_cat','feat_style_cat','feat_sport_cat']
result=run_model(feats,model)

In [70]:
# Fit a seeded forest (same depth and size as the cross-validated one) on the
# full data set so there is a fitted estimator to inspect.
X=df[feats].values
y=df['prices_amountmin'].values
m=RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X,y)

# `result` is the (mean, std) cross-validation score from the preceding cell.
print(result)
# NOTE(review): the permutation importances are computed on the same X, y the
# model was trained on — consider evaluating them on a held-out split.
perm=PermutationImportance(m, random_state=1).fit(X,y);
eli5.show_weights(perm, feature_names=feats)



(-57.14819246935597, 4.291553959533405)


Weight,Feature
0.2605  ± 0.0107,brand_cat
0.1077  ± 0.0082,feat_material_cat
0.0465  ± 0.0044,feat_gender_cat
0.0210  ± 0.0011,feat_brand_cat
0.0067  ± 0.0016,feat_style_cat
0.0001  ± 0.0000,feat_sport_cat


In [62]:
df['brand'].value_counts(normalize=True)

nike                  0.097210
puma                  0.033315
ralph lauren          0.028775
vans                  0.021116
new balance           0.020295
                        ...   
calcutta              0.000055
cudas                 0.000055
glasses by me         0.000055
kirkland signature    0.000055
primitive             0.000055
Name: brand, Length: 1732, dtype: float64

In [63]:
df[df['brand']=='nike'].features_parsed.head().values

array([{'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'style': 'athletic sneakers', 'condition': 'new with box'}, {}],
      dtype=object)