In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm.notebook import tqdm

In [2]:
df = pd.read_csv("../data/men-shoes.csv", low_memory=False)

In [3]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
    x = df[feats].values
    y = df['prices_amountmin'].values

    scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error')
    return np.mean(scores), np.std(scores)

In [94]:
df['brand_cat'] = df['brand'].map(lambda value: str(value).lower()).factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [95]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [96]:
def parse_features(feature) -> dict:
    if str(feature) == 'nan':
        return {}
    
    output = {}
    features = literal_eval(feature.replace('\\"', '"'))
    for item in features:
        key = item['key'].lower().strip()
        value = item['value'][0].lower().strip()
        
        output[key] = value
    
    return output

df['features_parsed'] = df['features'].map(parse_features)


In [97]:
features_keys = set()
df['features_parsed'].map(lambda x: features_keys.update(x.keys()))

def getNameFeat(aKey: str) -> str:
    return f'feat_{aKey}'

for key in tqdm(features_keys):
    df[getNameFeat(key)] = df['features_parsed'].map(lambda feats: feats[key] if key in feats else np.nan)

HBox(children=(FloatProgress(value=0.0, max=476.0), HTML(value='')))




In [149]:
keys_stat = {}
for key in tqdm(features_keys):
    keys_stat[key] = df[False == df[getNameFeat(key)].isnull()].shape[0] / df.shape[0] * 100

HBox(children=(FloatProgress(value=0.0, max=476.0), HTML(value='')))




In [264]:
for key in keys_stat.keys():
    name = f'{getNameFeat(key)}_cat'
    df[name] = df[getNameFeat(key)].factorize()[0]

# keys_stat = {key:value for key,value in keys_stat.items() if value > 5}

# features = ['brand_cat']
for key in keys_stat.keys():
    name = f'{getNameFeat(key)}_cat'
    features.append(name)

features = [
    'brand_cat',
    'feat_color_cat',
    'feat_gender_cat',
    'feat_material_cat',
    'feat_weight_cat',
    'feat_adjustable_cat',
    'feat_shoe category_cat',
    'feat_resizable_cat',
    'feat_movement_cat',
    'feat_band width_cat',
    'feat_fabric material_cat',
    'feat_brand_cat'
    ]

In [265]:
df['brand'] = df['brand'].map(lambda x: str(x).lower())

In [267]:
def fromPoundsToGrams(aWeight: float) -> float:
    return aWeight * 453.59237

def fromKilogramsToGrams(aWeight: float) -> float:
    return aWeight * 1000

def fromOuncesToGrams(aWeight: float) -> float:
    return aWeight * 28.3495231

def convertWeightToGrams(aWeight):
    weight = str(aWeight)
    if str(aWeight) == 'nan':
        return 0
    
    weight = weight.split()
    count = float(weight[0])
    unit = str(weight[1]).lower()
    
    if unit == 'pounds' or unit == 'lbs':
        return fromPoundsToGrams(count)
    elif unit == 'ounces':
        return fromOuncesToGrams(count)
    elif unit == 'kg':
        return fromKilogramsToGrams(count)
    else:
        return count
df['converted_weight_cat'] = df['weight'].map(convertWeightToGrams)
features.append('converted_weight_cat')

In [268]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
result = run_model(['brand_cat'], model)

In [269]:
x = df[features].values
y = df['prices_amountmin'].values

model = RandomForestRegressor(max_depth=5, n_estimators = 100, random_state = 0)
model.fit(x, y)

print(result)
permutation = PermutationImportance(model, random_state = 1).fit(x, y)
eli5.show_weights(permutation, feature_names = features)

(-57.31256185096349, 4.180682562958532)


Weight,Feature
0.2490  ± 0.0101,brand_cat
0.1014  ± 0.0059,feat_material_cat
0.0133  ± 0.0013,feat_weight_cat
0.0114  ± 0.0033,feat_adjustable_cat
0.0105  ± 0.0016,feat_brand_cat
0.0104  ± 0.0005,feat_shoe category_cat
0.0072  ± 0.0008,feat_movement_cat
0.0053  ± 0.0004,feat_band width_cat
0.0052  ± 0.0021,feat_resizable_cat
0.0048  ± 0.0005,feat_gender_cat
