In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
import eli5
from eli5.sklearn import PermutationImportance
from ast import literal_eval
from tqdm import tqdm_notebook


Using TensorFlow backend.


In [5]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 13.2MB/s eta 0:00:01[K     |██████▏                         | 20kB 1.9MB/s eta 0:00:01[K     |█████████▎                      | 30kB 2.5MB/s eta 0:00:01[K     |████████████▍                   | 40kB 1.8MB/s eta 0:00:01[K     |███████████████▌                | 51kB 2.0MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 2.4MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 2.6MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 2.7MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 3.1MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 3.0MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 3.0MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [7]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [2]:
cd matrix_1/

/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_1


In [18]:
ls data

shoes_prices.csv  shoes_prices_in_usd.csv


In [0]:
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_1/data/shoes_prices_in_usd.csv", low_memory=False)
df.shape
df['prices_amountmin'] = df.prices_amountmin.astype(np.float)
np.percentile(df['prices_amountmin'], 99)
filter_max = np.percentile(df['prices_amountmin'], 99)
df = df[df['prices_amountmin'] < filter_max].copy()

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  X = df[ feats].values
  y = df['prices_amountmin'].values

  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [0]:
df['brand_cat'] = df['brand'].map(lambda x: str(x).lower()).factorize()[0]

In [16]:
run_model(['brand_cat'])

(-58.38655694633361, 4.223555478221712)

In [20]:
model = RandomForestRegressor(max_depth= 5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [34]:
df['features'].head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan': return output_dict
  features = literal_eval(x.replace('\\"', '"'))
  for item in features:
    key = item['key'].lower().strip()
    value = item['value'][0].lower().strip()
    output_dict[key] = value
  return output_dict

df['features_parsed'] = df['features'].map(parse_features)

In [53]:
df['features_parsed'].head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [12]:
keys = set()

df['features_parsed'].map( lambda x: keys.update(x.keys()))
len(keys)

476

In [13]:
def get_name_feat(key):
  return 'feat_'+key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats:  feats[key] if key in feats else np.nan)

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [14]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_number of items', 'feat_wash', 'feat_country of manufacture',
       'feat_color/finish family', 'feat_sock size', 'feat_bed size',
       'feat_location - country', 'feat_season', 'feat_msrp',
       'feat_lens color family'],
      dtype='object', length=526)

In [0]:
keys_stat = {}
for key in keys:
  keys_stat[key]= df[False == df[get_name_feat(key)].isnull() ].shape[0] /df.shape[0]*100

In [0]:
#keys_stat

In [23]:
{k:v for k,v in keys_stat.items() if v>30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]

df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_style_cat'] = df['feat_style'].factorize()[0]


for key in keys:
  df[get_name_feat(key)+ '_cat'] = df[get_name_feat(key)].factorize()[0]

In [16]:
df[ df.brand != df.feat_brand ].shape

(18228, 531)

In [18]:
df[ df.brand == df.feat_brand ][['brand', 'feat_brand']].head()

Unnamed: 0,brand,feat_brand
2193,adidas,adidas
2447,adidas,adidas
3100,adidas,adidas
3133,adidas,adidas
3712,totes,totes


In [0]:
df['brand'] = df['brand'].map(lambda x: str(x).lower()).copy

In [25]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
#feats= ['']
run_model(['brand_cat'], model)

(-57.31569631812492, 4.190574158911621)

In [0]:
feats =['brand_cat', 'feat_brand_cat', 'feat_gender_cat','feat_material_cat', 'feat_style_cat', 'feat_sport_cat']
feats += [
 'feat_model_cat',
 'feat_ground_cat',
 'feat_power reserve_cat',
 'feat_waterproof_cat',
 'feat_lens material_cat',
 'feat_fabrication_cat',
 'feat_full product manual_cat',
 'feat_hardsided or softsided_cat',
 'feat_front_cat',
 'feat_hood_cat',
 'feat_location - city/state_cat',
 'feat_temple length_cat',
 'feat_label_cat',
 'feat_watch band material_cat',
 'feat_combined shipping_cat',
 'feat_upper material_cat',
 'feat_walmart no._cat',
 'feat_case finish_cat',
 'feat_sports league_cat',
 'feat_sizearm_cat',
 'feat_comfort technology_cat',
 'feat_case thickness_cat',
 'feat_quantity in set_cat',
 'feat_item color_cat',
 'feat_removable liner_cat',
 'feat_lens color_cat',
 'feat_case type_cat',
 'feat_fabric care_cat']
results = run_model(feats, model)

In [56]:
X = df[ feats].values
y = df['prices_amountmin'].values
m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X,y)
print(results)
perm = PermutationImportance(m, random_state=1).fit(X,y)
eli5.show_weights(perm,  feature_names = feats)

(-57.337164696519686, 4.369555392218715)


Weight,Feature
0.2562  ± 0.0039,brand_cat
0.1015  ± 0.0110,feat_material_cat
0.0177  ± 0.0029,feat_gender_cat
0.0170  ± 0.0009,feat_brand_cat
0.0160  ± 0.0013,feat_case thickness_cat
0.0053  ± 0.0020,feat_watch band material_cat
0.0028  ± 0.0011,feat_style_cat
0.0016  ± 0.0001,feat_lens color_cat
0.0006  ± 0.0001,feat_model_cat
0.0004  ± 0.0000,feat_lens material_cat


In [42]:
df[ df['brand'] == 'nike'].features_parsed.head().values

array([], dtype=object)

In [45]:
df[ 'feat_age group'].value_counts()

adult               4563
men                  350
child                 77
men's                 33
unisex                 6
toddler                4
infant                 4
mens                   4
boys'                  3
women ,�� unisex       2
youth                  2
men||women             2
women                  2
12 up                  1
adult ,�� teen         1
Name: feat_age group, dtype: int64

In [48]:
feats_cat = [x for x in df.columns if 'cat' in x]
feats_cat

['categories',
 'brand_cat',
 'feat_certifications and listings',
 'feat_clothing category',
 'feat_catalog',
 'feat_multi pack indicator',
 'feat_recommended location',
 'feat_fabrication',
 'feat_location - city/state',
 'feat_shoe category',
 'feat_location - country',
 'feat_brand_cat',
 'feat_color_cat',
 'feat_gender_cat',
 'feat_manufacturer part number_cat',
 'feat_material_cat',
 'feat_sport_cat',
 'feat_style_cat',
 'feat_outer material_cat',
 'feat_weather resistant_cat',
 'feat_chronograph_cat',
 'feat_part number_cat',
 'feat_suitable for_cat',
 'feat_color family_cat',
 'feat_sleeve length_cat',
 'feat_protects against_cat',
 'feat_lens technology_cat',
 'feat_bracelet style_cat',
 'feat_feature_cat',
 'feat_compass_cat',
 'feat_country of origin - components_cat',
 'feat_main colour_cat',
 'feat_machine washable_cat',
 'feat_casing_cat',
 'feat_has paper wood_cat',
 'feat_year made_cat',
 'feat_boxed-product dimensions_cat',
 'feat_charger included_cat',
 'feat_atpv arc 

In [0]:
#feats += feats_cat

#feats = list(set(feats))

In [0]:
!git add matrix_1/day5.ipynb

fatal: pathspec 'matrix_1/day5.ipynb' did not match any files
