In [0]:
!pip install eli5



In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score 

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [0]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [0]:
ls data

men_shoes.csv  shoes_prices.csv


In [0]:
df = pd.read_csv('data/men_shoes.csv', low_memory=False)

In [0]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension', 'ean', 'features',
       'flavors', 'imageurls', 'isbn', 'keys', 'manufacturer',
       'manufacturernumber', 'merchants', 'name', 'prices_amountmin',
       'prices_amountmax', 'prices_availability', 'prices_color',
       'prices_condition', 'prices_count', 'prices_currency',
       'prices_dateadded', 'prices_dateseen', 'prices_flavor', 'prices_issale',
       'prices_merchant', 'prices_offer', 'prices_returnpolicy',
       'prices_shipping', 'prices_size', 'prices_source', 'prices_sourceurls',
       'prices_warranty', 'quantities', 'reviews', 'sizes', 'skus',
       'sourceurls', 'upc', 'vin', 'websiteids', 'weight'],
      dtype='object')

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
 X =  df[ feats ].values
 y =  df['prices_amountmin'].values

 scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error' )
 return np.mean(scores), np.std(scores)

In [0]:
df['brand_cat'] = df['brand'].map(lambda x: str(x).lower()).factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [0]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0 )
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [0]:
 df.features.head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [0]:
 test = {'key': 'values'}
 test['key']

 str(test)

"{'key': 'values'}"

In [0]:
str_dict = '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]'

literal_eval(str_dict) [0]['value']

['Men']

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan': return output_dict

  features = literal_eval(x.replace('\\"', '"'))
  for item in features:
    key = item['key'].lower().strip() 
    value = item['value'][0].lower().strip()

    output_dict[key] = value 
 
  return output_dict

df['features_parsed'] = df['features'].map(parse_features) 

In [0]:
keys = set()

df['features_parsed'].map( lambda x:  keys.update(x.keys()) )

len(keys)

476

In [0]:
def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan) 

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [0]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_applicable', 'feat_product #',
       'feat_global composite sports type', 'feat_season', 'feat_sku',
       'feat_auto', 'feat_hammer loop', 'feat_mirrored',
       'feat_combined shipping', 'feat_temple length'],
      dtype='object', length=526)

In [0]:
keys_stat = {}
for key in keys:
  keys_stat[key] = df[ False == df[get_name_feat(key)].isnull() ].shape[0]  / df.shape[0] * 100

In [0]:

{k:v for k,v in keys_stat.items() if v> 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]

df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_style_cat'] = df['feat_style'].factorize()[0]

for key in keys:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]
  

In [0]:
df['brand'] = df['brand'].map(lambda x: str(x).lower() ) 
df[ df.brand == df.feat_brand ].shape

(8846, 1002)

In [0]:
feats = ''

In [0]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(['brand_cat'], model)


(-57.286227508008416, 4.184088814048063)

In [0]:
feats_cat = [x for x in df.columns if 'cat' in x] 
feats_cat

['categories',
 'brand_cat',
 'feat_catalog',
 'feat_fabrication',
 'feat_recommended location',
 'feat_clothing category',
 'feat_multi pack indicator',
 'feat_certifications and listings',
 'feat_location - country',
 'feat_location - city/state',
 'feat_shoe category',
 'feat_brand_cat',
 'feat_color_cat',
 'feat_gender_cat',
 'feat_manufacturer part number_cat',
 'feat_material_cat',
 'feat_sport_cat',
 'feat_style_cat',
 'feat_wind resistant_cat',
 'feat_date first available_cat',
 'feat_ul safety listing_cat',
 'feat_shirt size_cat',
 'feat_vehicle type_cat',
 'feat_waterproof_cat',
 'feat_hunting_cat',
 'feat_color/finish family_cat',
 'feat_lining material_cat',
 'feat_general warranty_cat',
 'feat_materials_cat',
 'feat_fits most screen size_cat',
 'feat_power type_cat',
 'feat_expandable_cat',
 'feat_picture_cat',
 'feat_parts_cat',
 'feat_design_cat',
 'feat_contained battery type_cat',
 'feat_nickel free_cat',
 'feat_chain length_cat',
 'feat_enhanced visibility (reflective

In [0]:
df['weight'].unique()

array([nan, '3.0 lbs', '9 g', '1.45 lbs', '0.45 lbs', '1.0 lbs',
       '0.23 lbs', '5.0 lbs', '5.5 lbs', '7.45 lbs', '4.0 lbs',
       '2.7969 lbs', '3.9 lbs', '4.6 pounds', '2.1 lbs', '1.1057 lbs',
       '15.0 lbs', '2.4 ounces', '454 g', '0.105 lbs', '9.1 ounces',
       '4.8 lbs', '6.1 lbs', '6.5 lbs', '1.1041 lbs', '1.3 Kg', '91 g',
       '20.0 lbs', '6.0 lbs', '386 g', '0.81 lbs', '4.5 lbs',
       '0.5 ounces', '2.0 lbs', '3.13 lbs', '5.9 lbs', '6.15 lbs',
       '1 pounds', '1.95 lbs', '2.15 lbs', '2 pounds', '2.1 pounds',
       '14 Kg', '0.4788 lbs', '10.0 lbs', '0.38 lbs', '2.5 lbs',
       '68.912 lbs', '45 g', '13.09 lbs', '2.5 pounds', '0.21 lbs',
       '16.75 lbs', '6.3 lbs', '272 g', '1.8 Kg', '2.8 pounds', '0.1 lbs',
       '5.05 lbs', '0.28 lbs', '76.08 lbs', '0.15 lbs', '200 g',
       '7.8 pounds', '399 g', '4.95 lbs', '64.144 lbs', '24 pounds',
       '73.696 lbs', '1.6 lbs', '6.6 ounces', '5 g', '1.2 Kg', '862 g',
       '3.05 lb', '8.6 ounces', '3.6 lbs', '71.

In [0]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_shape_cat', 'feat_style_cat', 'feat_sport_cat'  ]


model = RandomForestRegressor(max_depth=5, n_estimators=100)
result = run_model(feats, model)

In [0]:
X = df[feats].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X, y)

print(result)
perm = PermutationImportance(m, random_state=1).fit(X, y);
eli5.show_weights(perm, feature_names=feats)

(-57.146752288955575, 4.297626688851055)


Weight,Feature
0.2613  ± 0.0108,brand_cat
0.1061  ± 0.0071,feat_material_cat
0.0230  ± 0.0023,feat_gender_cat
0.0210  ± 0.0011,feat_brand_cat
0.0137  ± 0.0010,feat_shape_cat
0.0042  ± 0.0007,feat_style_cat
0.0003  ± 0.0000,feat_sport_cat


In [0]:
df['brand'].value_counts(normalize=True)

nike                    0.097210
puma                    0.033315
ralph lauren            0.028775
vans                    0.021116
new balance             0.020295
                          ...   
elastogel               0.000055
silver jeans            0.000055
kr3w                    0.000055
reef footwear           0.000055
adidas by raf simons    0.000055
Name: brand, Length: 1732, dtype: float64

In [0]:
df[ df['brand'] == 'nike' ].features_parsed.sample(5).values

array([{'season': 'all-season', 'material': 'synthetic', 'gender': 'men', 'shoe size': '11', 'size': '11', 'color': 'gray', 'model': '704923 004', 'manufacturer part number': '704923 004', 'brand': 'nike', 'age group': 'adult'},
       {'season': 'all-season', 'material': 'synthetic', 'gender': 'men', 'shoe size': '8.5', 'color': 'blue', 'manufacturer part number': '554954 400', 'brand': 'nike', 'age group': 'adult'},
       {'gender': 'boys', 'shoe category': "boys' shoes", 'brand': 'nike', 'color': 'bright crimson/atomic orange/black', 'model': '827281-680', 'casual & dress shoe style': 'fashion sneakers'},
       {'sport': 'soccer', 'condition': 'new with box', 'type': 'cleats'},
       {'sport': 'football', 'condition': 'new without box', 'type': 'cleats'}],
      dtype=object)

In [0]:
keys_stat = {}
for key in keys:
  keys_stat[key] = df[ False == df[get_name_feat(key)].isnull() ].shape[0]  / df.shape[0] * 100

In [46]:
ls

[0m[01;34mdata[0m/  HelloGithub.ipynb  LICENSE  [01;34mmatrix_one[0m/  README.md


In [47]:
!git add matrix_one/Day5_new.ipynb

fatal: pathspec 'matrix_one/Day5_new.ipynb' did not match any files


In [45]:
!git init

[master 2360a7f] DAY 5
 1 file changed, 1 insertion(+), 1 deletion(-)
 rewrite matrix_one/Day3.ipynb (95%)


In [0]:
!git commit -m "DAY 5"

In [0]:
!git config --global user.email "dworakowsky@gmail.com"
!git config --global user.name "Pawel"

In [0]:
!git push -u origin master


Branch 'master' set up to track remote branch 'master' from 'origin'.
Everything up-to-date
