In [0]:
#cd '/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_one'

In [0]:
#!pip install eli5

In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import eli5
from eli5.sklearn import PermutationImportance
from ast import literal_eval
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [0]:
# Load the dataset from a path relative to the current working directory.
# low_memory=False asks pandas to parse the file in one pass instead of in chunks,
# which avoids mixed-type inference within a column (see the pandas read_csv docs).
df = pd.read_csv('data/men_shoes.csv', low_memory=False)

In [0]:
# Integer-encode the raw columns: factorize() returns (codes, uniques) and only
# the codes are kept, stored next to the source column as '<column>_f'.
for col in ('brand', 'categories', 'manufacturer'):
  df[col + '_f'] = df[col].factorize()[0]

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  """Cross-validate `model` on selected columns of the notebook-global `df`.

  feats: column names of `df` that form the feature matrix.
  model: estimator handed to cross_val_score; defaults to a depth-5 decision
         tree (the default instance is created once, at definition time).

  Returns a (mean, std) tuple of the 'neg_mean_absolute_error' scores, so
  values closer to zero mean a smaller error.
  """
  features = df[feats].values
  target = df['prices_amountmin'].values

  cv_scores = cross_val_score(model, features, target, scoring='neg_mean_absolute_error')
  return np.mean(cv_scores), np.std(cv_scores)

In [0]:
# Placeholder for the list of feature column names; it is reassigned further
# down before it is first passed to run_model.
feats = []

In [0]:
def parse_features(x):
  """Turn one raw 'features' cell into a flat {key: value} dict.

  x: the cell value. Expected to be a string holding a list literal of
     {'key': ..., 'value': [...]} items, possibly with backslash-escaped
     double quotes. NaN, None and empty/blank strings mean "no features".

  Returns a dict mapping each lower-cased, stripped key to the lower-cased,
  stripped first entry of its value list. Items whose value list is empty
  are skipped. When a key occurs more than once, the last occurrence wins.

  Raises whatever literal_eval raises (ValueError / SyntaxError) when the
  text is not a valid literal.
  """
  # Missing cells arrive as NaN (str() == 'nan') or None.
  if x is None or str(x) == 'nan':
    return {}

  # Un-escape \" so the text becomes a parsable literal.
  text = x.replace('\\"', '"')
  if not text.strip():
    return {}

  output_dict = {}
  for item in literal_eval(text):
    values = item['value']
    if not values:
      # Nothing to take the first element from; skip instead of raising IndexError.
      continue
    key = item['key'].lower().strip()
    output_dict[key] = values[0].lower().strip()

  return output_dict
  

# Parse each row's raw 'features' value into a {key: value} dict.
df['features_parsed'] = df['features'].map(parse_features)

In [0]:
# Collect every distinct feature key that occurs in any row.
# A plain loop is used instead of Series.map: the update is a pure side effect,
# and map would build and discard a Series of None.
keys = set()
for parsed in df['features_parsed']:
  keys.update(parsed)  # iterating a dict yields its keys
len(keys)


476

In [0]:

def get_name_feat(key):
  """Return the DataFrame column name used for the parsed feature `key`."""
  prefix = 'feat_'
  return prefix + key

# Give every parsed key its own 'feat_<key>' column (NaN where a row lacks the key).
# Keys are visited in sorted order: the iteration order of a set of strings can
# change from one interpreter run to the next, which would reshuffle the new
# columns (and every feature list derived from df.columns) on each restart.
for key in tqdm_notebook(sorted(keys)):
  df[get_name_feat(key)] = df.features_parsed.map(lambda parsed: parsed.get(key, np.nan))

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [0]:
# For each parsed feature: the percentage of rows in which it is present (non-null).
total_rows = df.shape[0]
keys_stat = {
  key: df[df[get_name_feat(key)].notnull()].shape[0] / total_rows * 100
  for key in keys
}

In [0]:
# Show only the features that are filled in for more than 30% of the rows
# (keys_stat holds percentages).
{k:v for k,v in keys_stat.items() if v > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
# Integer-encode every 'feat_<key>' column as 'feat_<key>_cat' using the codes
# returned by factorize().
# The loop covers all keys, so brand, color, gender, manufacturer part number and
# material need no separate assignments. Sorted order keeps the position of the
# new columns stable across interpreter runs (set iteration order is not).
for key in sorted(keys):
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]

In [0]:
# Candidate feature columns: every column whose name contains the substring 'cat'.
# NOTE(review): this is a substring test, so besides the '*_cat' encodings it also
# matches names such as 'categories' and 'categories_f' — confirm that is intended.
feats_cat = [x for x in df.columns if 'cat' in x]

In [0]:
# Baseline: depth-5 decision tree trained on the encoded brand only.
model = DecisionTreeRegressor(max_depth=5)
run_model(['brand_f'], model)

(-58.38655694633361, 4.223555478221712)

In [0]:
# Same single feature, now with a 100-tree random forest (random_state fixed).
model = RandomForestRegressor(max_depth=15, n_estimators=100, random_state=0)
run_model(['brand_f'], model)

(-48.16257470162026, 3.629773851475144)

In [0]:
# Add three of the encoded parsed features on top of the encoded brand.
feats = ['brand_f', 'feat_gender_cat', 'feat_manufacturer part number_cat', 'feat_material_cat']
model = RandomForestRegressor(max_depth=15, n_estimators=100, random_state=0)
run_model(feats, model)

(-49.15243301681104, 3.7455805588272955)

In [0]:
# Fit one forest on all rows and rank the four features by permutation importance.
# Note: the importances are measured on the same rows the model was fitted on.
feats = ['brand_f', 'feat_gender_cat', 'feat_manufacturer part number_cat', 'feat_material_cat']
X, y = df[feats].values, df['prices_amountmin'].values

# fit() returns the fitted estimator itself, so it can be chained.
m = RandomForestRegressor(max_depth=15, n_estimators=100, random_state=0).fit(X, y)

perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats)

Weight,Feature
0.7380  ± 0.0068,brand_f
0.2796  ± 0.0282,feat_material_cat
0.2145  ± 0.0057,feat_gender_cat
0.0909  ± 0.0053,feat_manufacturer part number_cat


In [0]:
# Numeric-only view of the data with missing values replaced by 0.
# fillna() returns a new frame, so its result must be assigned; calling it as a
# bare statement leaves df2 unchanged.
# NOTE(review): _get_numeric_data() is a private pandas method; consider
# select_dtypes once the exact set of dtypes to keep is confirmed.
df2 = df._get_numeric_data().fillna(0)
feats_cat = [x for x in df2.columns if 'cat' in x]

def run_model2(feats, model = DecisionTreeRegressor(max_depth=5)):
  """Cross-validate `model` on selected columns of the notebook-global `df2`.

  feats: column names of `df2` that form the feature matrix.
  model: estimator handed to cross_val_score; defaults to a depth-5 decision
         tree (the default instance is created once, at definition time).

  Returns a (mean, std) tuple of the 'neg_mean_absolute_error' scores.
  """
  X = df2[feats].values
  y = df2['prices_amountmin'].values

  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

# Depth-5 forest on every numeric column whose name contains 'cat'.
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model2(feats_cat, model)

(-59.10304986740874, 4.015234460446181)

In [0]:
# Fit a depth-5 forest on all candidate columns and rank them by permutation
# importance. The importances are measured on the same rows used for fitting.
X, y = df2[feats_cat].values, df2['prices_amountmin'].values

# fit() returns the fitted estimator itself, so it can be chained.
m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0).fit(X, y)

perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats_cat)

Weight,Feature
0.1414  ± 0.0083,categories_f
0.0948  ± 0.0071,feat_material_cat
0.0132  ± 0.0014,feat_weight_cat
0.0066  ± 0.0004,feat_color_cat
0.0045  ± 0.0001,feat_shoe category_cat
0.0041  ± 0.0005,feat_shoe size_cat
0.0036  ± 0.0015,feat_adjustable_cat
0.0034  ± 0.0003,feat_jacket length_cat
0.0031  ± 0.0002,feat_gender_cat
0.0031  ± 0.0007,feat_resizable_cat


In [0]:
# Hand-picked subset of the encoded columns.
feats_cat2 = [
  'categories_f',
  'feat_material_cat',
  'feat_weight_cat',
  'feat_color_cat',
  'feat_adjustable_cat',
  'feat_shoe category_cat',
  'feat_shoe size_cat',
  'feat_jacket length_cat',
  'feat_gender_cat',
  'feat_case thickness_cat',
  'feat_style_cat',
  'feat_brand_cat',
  'feat_casual & dress shoe style_cat',
  'feat_resizable_cat',
  'feat_frame material_cat',
  'feat_case diameter_cat',
  'feat_item package quantity_cat',
  'feat_age_cat',
  'feat_condition_cat',
  'feat_shoe width_cat',
]

# run_model2 is defined in the cell that builds df2; the identical second
# definition that used to live here only shadowed it and has been removed.
model = RandomForestRegressor(max_depth=15, n_estimators=100, random_state=0)
result = run_model2(feats_cat2, model)

In [0]:
# Print the cross-validation score stored in `result`, then rank the hand-picked
# columns by permutation importance.
# NOTE(review): `result` was computed with max_depth=15, while the forest fitted
# here uses max_depth=5 — confirm the mismatch is intended.
print(result)

X, y = df2[feats_cat2].values, df2['prices_amountmin'].values

# fit() returns the fitted estimator itself, so it can be chained.
m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0).fit(X, y)

perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats_cat2)

(-47.18733341990735, 2.6678437261263874)


Weight,Feature
0.1475  ± 0.0089,categories_f
0.0982  ± 0.0098,feat_material_cat
0.0139  ± 0.0007,feat_weight_cat
0.0074  ± 0.0009,feat_frame material_cat
0.0074  ± 0.0009,feat_case thickness_cat
0.0066  ± 0.0005,feat_color_cat
0.0061  ± 0.0007,feat_shoe category_cat
0.0043  ± 0.0006,feat_shoe size_cat
0.0041  ± 0.0004,feat_style_cat
0.0041  ± 0.0003,feat_gender_cat


In [0]:
#cd '/content/drive/My Drive/Colab Notebooks'

In [0]:
#ls

In [1]:
!echo 'data' > .gitignore
!echo add .gitignore
!add .gitignore

add .gitignore
/bin/bash: add: command not found


In [0]:
# Stage the notebook file.
# NOTE(review): the recorded run of this cell failed with "not a git repository";
# presumably the working directory must be inside the repository first — confirm.
!git add 'Copy of dzien5.ipynb'

fatal: not a git repository (or any parent up to mount point /content)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
