In [0]:
#!pip install eli5

In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance 


from ast import literal_eval
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [4]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [5]:
ls 

[0m[01;34mdata[0m/  HelloGithub.ipynb  LICENSE  [01;34mmatrix_one[0m/  README.md


In [0]:
df = pd.read_csv("data/men_shoes.csv", low_memory=False)

In [0]:
def run_model(feats, model=DecisionTreeRegressor(max_depth=5)):
  x = df[ feats ].values
  y = df['prices_amountmin'].values

  scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores) 

In [0]:
df['brand_cat'] = df['brand'].map(lambda x: str(x).lower()).factorize()[0]

In [9]:
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [10]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'],model)

(-57.31783843165656, 4.181246596160967)

In [11]:
df.features.values
one_str = '{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}'
literal_eval(one_str)[0]['key']

'Gender'

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan' : return output_dict
  feats =  literal_eval(x.replace('\\"','"'))
  for item in feats: 
    key = item['key'].lower().strip()
    value = item['value'][0].lower().strip()
    output_dict[key] = value
  return output_dict

df['features_parsed']  = df['features'].map(parse_features)


In [0]:
def get_name_feat(key):
  return 'feat_' + key



In [14]:
keys = set()

df['features_parsed'].map(lambda x: keys.update(x.keys()))

len(keys)

476

In [15]:
for key in tqdm_notebook(keys):

  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan)

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [16]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_flame resistant', 'feat_interior pockets', 'feat_heat zones',
       'feat_number of batteries included', 'feat_bridge width',
       'feat_case material', 'feat_protects against', 'feat_fastener',
       'feat_lens material:', 'feat_power reserve'],
      dtype='object', length=526)

In [17]:
key_stat = {}
for key in tqdm_notebook(keys):
  key_stat[key] = df[ False == df[get_name_feat(key)].isnull() ].shape[0] / df.shape[0] * 100

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [18]:
{k:v for k,v in key_stat.items() if v > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]

df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_condition_cat'] = df['feat_condition'].factorize()[0]



In [0]:
feats_cat = [ x for x in df.columns if 'feat_' in x] 
for x in feats_cat:
  df[x +'_cat'] = df[x].factorize()[0]

In [21]:
df['brand'] = df['brand'].map(lambda x : str(x).lower().strip())
df[df['brand'] == df['feat_brand']].shape[0]

8846

In [0]:
feats = ['brand_cat','feat_color_cat', 'feat_gender_cat','feat_style_cat', 'feat_material_cat','feat_adjustable_cat','feat_weight_cat','feat_gender_cat','feat_resizable_cat'] 
model = RandomForestRegressor(max_depth=5,n_estimators=100)



resuts = run_model(feats,model)

In [23]:
x = df [feats].values
y = df['prices_amountmin'].values 
m = RandomForestRegressor(max_depth=5,n_estimators=100,random_state=0)
m.fit(x,y)
print(resuts)
perm = PermutationImportance(m, random_state=1).fit(x,y);
eli5.show_weights(perm,feature_names =feats)

(-57.09170252082473, 4.183538651214804)


Weight,Feature
0.2570  ± 0.0111,brand_cat
0.1046  ± 0.0082,feat_material_cat
0.0130  ± 0.0009,feat_weight_cat
0.0125  ± 0.0021,feat_gender_cat
0.0089  ± 0.0008,feat_gender_cat
0.0078  ± 0.0012,feat_style_cat
0.0076  ± 0.0024,feat_adjustable_cat
0.0051  ± 0.0024,feat_resizable_cat
0.0038  ± 0.0006,feat_color_cat


In [24]:
df['weight'].unique()

array([nan, '3.0 lbs', '9 g', '1.45 lbs', '0.45 lbs', '1.0 lbs',
       '0.23 lbs', '5.0 lbs', '5.5 lbs', '7.45 lbs', '4.0 lbs',
       '2.7969 lbs', '3.9 lbs', '4.6 pounds', '2.1 lbs', '1.1057 lbs',
       '15.0 lbs', '2.4 ounces', '454 g', '0.105 lbs', '9.1 ounces',
       '4.8 lbs', '6.1 lbs', '6.5 lbs', '1.1041 lbs', '1.3 Kg', '91 g',
       '20.0 lbs', '6.0 lbs', '386 g', '0.81 lbs', '4.5 lbs',
       '0.5 ounces', '2.0 lbs', '3.13 lbs', '5.9 lbs', '6.15 lbs',
       '1 pounds', '1.95 lbs', '2.15 lbs', '2 pounds', '2.1 pounds',
       '14 Kg', '0.4788 lbs', '10.0 lbs', '0.38 lbs', '2.5 lbs',
       '68.912 lbs', '45 g', '13.09 lbs', '2.5 pounds', '0.21 lbs',
       '16.75 lbs', '6.3 lbs', '272 g', '1.8 Kg', '2.8 pounds', '0.1 lbs',
       '5.05 lbs', '0.28 lbs', '76.08 lbs', '0.15 lbs', '200 g',
       '7.8 pounds', '399 g', '4.95 lbs', '64.144 lbs', '24 pounds',
       '73.696 lbs', '1.6 lbs', '6.6 ounces', '5 g', '1.2 Kg', '862 g',
       '3.05 lb', '8.6 ounces', '3.6 lbs', '71.

In [0]:
def normalize_weight(values):
  if str(values) == 'nan' : return -1
  numb, unit = str(values).lower().split(' ')
  unit = unit.strip()
  numb = np.float(numb)
  if unit in ['lbs','lb','pounds']:
    return  numb * 453.59237
  elif unit == 'ounces':
    return numb * 28.3495231

  elif unit == 'kg':
    return numb * 1000
  else:
    return -2 

df['weight_normalize'] = df['weight'].map(normalize_weight)


In [0]:
df[df['weight_normalize'] != -1].shape
feats += ['weight_normalize']


In [27]:
feats

['brand_cat',
 'feat_color_cat',
 'feat_gender_cat',
 'feat_style_cat',
 'feat_material_cat',
 'feat_adjustable_cat',
 'feat_weight_cat',
 'feat_gender_cat',
 'feat_resizable_cat',
 'weight_normalize']

In [30]:
x = df [feats].values
y = df['prices_amountmin'].values 
m = RandomForestRegressor(max_depth=5,n_estimators=100,random_state=0)
m.fit(x,y)
print("Mean: {0}, Std: {1}".format(resuts[0],resuts[1]))
perm = PermutationImportance(m, random_state=1).fit(x,y);
eli5.show_weights(perm,feature_names =feats)

Mean: -57.09170252082473, Std: 4.183538651214804


Weight,Feature
0.2575  ± 0.0063,brand_cat
0.0988  ± 0.0100,feat_material_cat
0.0134  ± 0.0010,feat_weight_cat
0.0110  ± 0.0009,feat_gender_cat
0.0100  ± 0.0034,feat_adjustable_cat
0.0078  ± 0.0005,feat_gender_cat
0.0070  ± 0.0009,feat_style_cat
0.0052  ± 0.0010,feat_color_cat
0.0050  ± 0.0009,weight_normalize
0.0042  ± 0.0016,feat_resizable_cat
