In [150]:
!pip install eli5



In [0]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score


import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

In [152]:
cd "drive/My Drive/Colab Notebooks/dw_mtx/"

[Errno 2] No such file or directory: 'drive/My Drive/Colab Notebooks/dw_mtx/'
/content/drive/My Drive/Colab Notebooks/dw_mtx


In [153]:
ls

[0m[01;34mdata[0m/           dw_mtx_4.ipynb  HelloGithub.ipynb
dw_mtx_3.ipynb  dw-mtx_5.ipynb  README.md


In [154]:
df = pd.read_csv('data/men_shoes.csv', low_memory=False)
df.shape

(18280, 48)

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  x = df[feats].values 
  y = df['prices_amountmin'].values

  #model = DecisionTreeRegressor(max_depth=5)

  scores = cross_val_score(model,x,y,scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [0]:
df['brand_cat']=df['brand'].map(lambda x: str(x).lower()).factorize()[0]

In [157]:
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [158]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [159]:
df.features.head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [160]:
test = {'key':'value'}
test['key']

str(test)

"{'key': 'value'}"

In [161]:
str_dict = '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]'
literal_eval(str_dict)[0]['value'][0]

'Men'

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan': return output_dict

  features = literal_eval(x.replace('\\"','"'))
  for item in features:
    key = item['key'].lower().strip()
    value = item['value'][0].lower().strip()

    output_dict[key] = value

  return output_dict

df['features_parsed'] = df['features'].map(parse_features)

# {'key': 'Gender', 'value': ['Men']}

In [163]:
keys = set()

df['features_parsed'].map(lambda x: keys.update( x.keys() ) )
len(keys)

# df['features_parsed'].head().values

476

In [164]:
df.features_parsed.head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [165]:

def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan)

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [166]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_lens material', 'feat_item package quantity', 'feat_season',
       'feat_polarized?', 'feat_temple length', 'feat_package',
       'feat_boxed-product dimensions', 'feat_sku number', 'feat_foot arch',
       'feat_domestic shipping'],
      dtype='object', length=526)

In [0]:
keys_stat = {}
for key in keys:
  keys_stat[key] = df[ False == df[get_name_feat(key)].isnull() ].shape[0] / df.shape[0] * 100

In [168]:
{k:v for k, v in keys_stat.items() if v > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]
df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_style_cat'] = df['feat_style'].factorize()[0]

for key in keys:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]

In [172]:
df['brand'] = df['brand'].map(lambda x: str(x).lower())
df [df.brand == df.feat_brand ][['brand', 'feat_brand'] ].head()
df [df.brand == df.feat_brand ].shape

(8846, 1002)

In [170]:
df['brand'] = df['brand'].map( lambda x: str(x).lower() )
df[ df.brand != df.feat_brand][ ['brand', 'feat_brand']].head()
df[ df.brand == df.feat_brand].shape

(8846, 1002)

In [0]:
feats = ['']

In [175]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(['brand_cat'], model)

(-57.29180075110887, 4.167429761898533)

In [176]:
feats_cat = [x for x in df.columns if 'cat' in x]
feats_cat

['categories',
 'brand_cat',
 'feat_shoe category',
 'feat_multi pack indicator',
 'feat_fabrication',
 'feat_recommended location',
 'feat_location - city/state',
 'feat_certifications and listings',
 'feat_catalog',
 'feat_clothing category',
 'feat_location - country',
 'feat_brand_cat',
 'feat_color_cat',
 'feat_gender_cat',
 'feat_manufacturer_part_number_cat',
 'feat_material_cat',
 'feat_sport_cat',
 'feat_style_cat',
 'feat_watch case shape_cat',
 'feat_country//region of manufacture_cat',
 'feat_shoe closure_cat',
 'feat_safety features_cat',
 'feat_part type_cat',
 'feat_black_cat',
 'feat_very popular bag now a days_cat',
 'feat_removable liner_cat',
 'feat_fits styles_cat',
 'feat_character_cat',
 'feat_age group_cat',
 'feat_leg_cat',
 'feat_guaranteed authentic_cat',
 'feat_weather resistant_cat',
 'feat_adjustable_cat',
 'feat_crystal_cat',
 'feat_count_cat',
 'feat_product id_cat',
 'feat_is waterproof_cat',
 'feat_name_cat',
 'feat_resizable_cat',
 'feat_is dark sky-co

In [177]:
df['weight'].unique()

array([nan, '3.0 lbs', '9 g', '1.45 lbs', '0.45 lbs', '1.0 lbs',
       '0.23 lbs', '5.0 lbs', '5.5 lbs', '7.45 lbs', '4.0 lbs',
       '2.7969 lbs', '3.9 lbs', '4.6 pounds', '2.1 lbs', '1.1057 lbs',
       '15.0 lbs', '2.4 ounces', '454 g', '0.105 lbs', '9.1 ounces',
       '4.8 lbs', '6.1 lbs', '6.5 lbs', '1.1041 lbs', '1.3 Kg', '91 g',
       '20.0 lbs', '6.0 lbs', '386 g', '0.81 lbs', '4.5 lbs',
       '0.5 ounces', '2.0 lbs', '3.13 lbs', '5.9 lbs', '6.15 lbs',
       '1 pounds', '1.95 lbs', '2.15 lbs', '2 pounds', '2.1 pounds',
       '14 Kg', '0.4788 lbs', '10.0 lbs', '0.38 lbs', '2.5 lbs',
       '68.912 lbs', '45 g', '13.09 lbs', '2.5 pounds', '0.21 lbs',
       '16.75 lbs', '6.3 lbs', '272 g', '1.8 Kg', '2.8 pounds', '0.1 lbs',
       '5.05 lbs', '0.28 lbs', '76.08 lbs', '0.15 lbs', '200 g',
       '7.8 pounds', '399 g', '4.95 lbs', '64.144 lbs', '24 pounds',
       '73.696 lbs', '1.6 lbs', '6.6 ounces', '5 g', '1.2 Kg', '862 g',
       '3.05 lb', '8.6 ounces', '3.6 lbs', '71.

In [0]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_style_cat', 'feat_metal type_cat', 'feat_shape_cat']
#feats += feats_cat
#feats = list(set(feats))

model = RandomForestRegressor(max_depth = 5, n_estimators = 100)
result = run_model(feats, model)

In [181]:
X = df[ feats].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(max_depth = 5, n_estimators = 100, random_state = 0)
m.fit(X, y)

print(result)

perm = PermutationImportance(m, random_state = 1).fit(X, y);
eli5.show_weights(perm, feature_names = feats)

(-57.2399723745154, 4.2612571085644)


Weight,Feature
0.2595  ± 0.0110,brand_cat
0.1016  ± 0.0074,feat_material_cat
0.0233  ± 0.0022,feat_gender_cat
0.0173  ± 0.0009,feat_brand_cat
0.0125  ± 0.0008,feat_shape_cat
0.0097  ± 0.0012,feat_metal type_cat
0.0026  ± 0.0008,feat_style_cat


In [182]:
df['brand'].value_counts(normalize=True)

nike                                 0.097210
puma                                 0.033315
ralph lauren                         0.028775
vans                                 0.021116
new balance                          0.020295
                                       ...   
it's always sunny in philadelphia    0.000055
mr.chaos                             0.000055
nissun                               0.000055
cabela's                             0.000055
munsingwear                          0.000055
Name: brand, Length: 1732, dtype: float64

In [183]:
df[ df['brand'] == 'nike'].features_parsed.head().values

array([{'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'style': 'athletic sneakers', 'condition': 'new with box'}, {}],
      dtype=object)

In [0]:
!git add dw_mtx_5.ipynb