In [0]:
#!pip install eli5

In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [0]:
cd '/content/drive/My Drive/Colab Notebooks/dw_matrix'

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [0]:
ls data

men_shoes.csv  shoe_prices.csv  w_shoe_prices.csv


In [0]:
df = pd.read_csv('data/men_shoes.csv', low_memory=False)

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  X = df[ feats ].values
  y = df['prices_amountmin'].values

  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [0]:
df['brand_cat'] = df.brand.map(lambda x: str(x).lower()).factorize()[0]

In [0]:
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [0]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [0]:
df.features.values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       ...,
       '[{"key":"Gender","value":["Unisex"]},{"key":"Frame Style","value":["Wrap"]},{"key":"Polarized Lenses","value":["Polarized lenses"]},{"key":"Color","value":["Multi-color"]}]',


In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan' : return output_dict

  features =  literal_eval(x.replace('\\"','"'))
  for item in features:
    key = item['key'].lower().strip()
    value = item['value'][0].lower().strip()

    output_dict[key] = value

  return output_dict

df['features_parsed'] = df['features'].map(parse_features)

In [0]:
keys = set()

# df['features_parsed'].map( lambda x: keys.update(x.keys()) )
df['features_parsed'].map( lambda x: keys.update(x.keys()) )

len(keys)

476

In [0]:
def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook( keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan)

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [0]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_vendor description', 'feat_sock style', 'feat_display technology',
       'feat_sub style', 'feat_fabric care', 'feat_audience',
       'feat_location - country', 'feat_walmart no.', 'feat_szie',
       'feat_watch power source'],
      dtype='object', length=526)

In [0]:
keys_stat = {}

for key in keys:
  keys_stat[key] = df[ False == df[get_name_feat(key)].isnull() ].shape[0]/df.shape[0] * 100

In [0]:
{k:v for k,v in keys_stat.items() if v > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
keys_stat

{'100% adidas authentic': 0.005470459518599562,
 'accessory type': 0.1422319474835886,
 'adidas': 0.005470459518599562,
 'adjustable': 0.34463894967177244,
 'age': 0.6400437636761488,
 'age end': 0.4431072210065646,
 'age gender group': 0.12582056892778995,
 'age group': 27.64770240700219,
 'age range': 0.5470459518599562,
 'age segment': 0.1422319474835886,
 'age start': 0.4431072210065646,
 'airport friendly': 0.005470459518599562,
 'alarm': 0.23522975929978115,
 'amazonbestsellersrank': 0.06017505470459519,
 'animal type': 0.005470459518599562,
 'antiscratch lens coating': 0.005470459518599562,
 'applicable': 0.02188183807439825,
 'arm': 0.6236323851203501,
 'article': 0.005470459518599562,
 'assembled in country of origin': 2.199124726477024,
 'assembled product dimensions (l x w x h)': 10.300875273522976,
 'assembled product weight': 1.5536105032822756,
 'athlete': 0.0437636761487965,
 'atpv arc rating': 0.005470459518599562,
 'attachment': 0.03282275711159737,
 'audience': 0.2352

In [0]:
# df['feat_brand_cat'] = df.feat_brand.factorize()[0]
# df['feat_color_cat'] = df.feat_color.factorize()[0]
# df['feat_gender_cat'] = df.feat_gender.factorize()[0]
# df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
# df['feat_material_cat'] = df.feat_material.factorize()[0]

# df['feat_sport_cat'] = df.feat_sport.factorize()[0]
# df['feat_style_cat'] = df.feat_style.factorize()[0]

for key in keys_stat:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]
  print(get_name_feat(key))
  # get_name_feat(key)

feat_ring style
feat_fabrication
feat_is water-resistant
feat_sports league
feat_isbn
feat_hairstyle
feat_crown
feat_is energy star-certified
feat_special features
feat_case tone
feat_cleaning, care & maintenance
feat_sku#
feat_is orthopedic
feat_number of heat settings
feat_size
feat_condition
feat_casual & dress shoe style
feat_primary shelf id
feat_ean
feat_athlete
feat_color/finish family
feat_date first available at amazon.co.uk
feat_front style
feat_age end
feat_country of origin - assembly
feat_expandable
feat_case material
feat_case type
feat_dimensions
feat_construction
feat_motion control
feat_transactionid
feat_smart watch
feat_eyewear frame style
feat_closure style
feat_watch style
feat_year
feat_lens technology
feat_chain included
feat_shipping weight (in pounds)
feat_global composite sports type
feat_length
feat_height
feat_clothing size
feat_designer
feat_leather grade
feat_dial color
feat_pant style
feat_ground
feat_compatible devices
feat_insulation
feat_measurements:


In [0]:
feats = ['brand_cat']

In [0]:


model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(['brand_cat'], model=model)

(-57.33555071004041, 4.216133128674874)

In [0]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_color_cat', 'feat_gender_cat', 'feat_manufacturer part number_cat', 'feat_material_cat']
run_model(feats, model)

(-57.0497918697572, 4.233914867425545)

In [0]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_style_cat']
run_model(feats, model)

(-57.01001371883092, 4.245936180979017)

In [0]:
feats_cat = [x for x in df.columns if '_cat' in x]

In [0]:
feats_cat.remove('feat_catalog')

In [0]:
feats += feats_cat
feats = list(set(feats))
run_model(feats_cat, model)

(-57.58858268224101, 4.211335209554334)

In [0]:
X = df [ feats ].values
y = df['prices_amountmin'].values
m = model.fit(X, y)

perm = PermutationImportance(m, random_state=0).fit(X, y);
eli5.show_weights(perm, feature_names=feats)

Weight,Feature
0.2507  ± 0.0146,brand_cat
0.1056  ± 0.0065,feat_material_cat
0.0120  ± 0.0008,feat_weight_cat
0.0118  ± 0.0019,feat_brand_cat
0.0092  ± 0.0013,feat_fabric content_cat
0.0071  ± 0.0022,feat_adjustable_cat
0.0069  ± 0.0005,feat_shoe category_cat
0.0047  ± 0.0019,feat_resizable_cat
0.0045  ± 0.0004,feat_fabric material_cat
0.0030  ± 0.0005,feat_gender_cat
