In [1]:
from google.colab import drive
# Directory inside the Colab VM where Google Drive is attached.
ROOT = "/content/drive"
# Attach Drive at ROOT. When Drive is already mounted the call only prints a notice
# (see the recorded output) and suggests force_remount=True for a forced remount.
drive.mount(ROOT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Project directory on the mounted Drive. Built from ROOT so the mount point is
# defined in exactly one place; the resulting path is unchanged.
HOME = ROOT + "/My Drive/Colab Notebooks/dw-matrix"

In [10]:
cd {HOME}

/content/drive/My Drive/Colab Notebooks/dw-matrix


In [11]:
ls

[0m[01;34mdata[0m/  day3.ipynb  day4.ipynb  HelloGithub.ipynb  LICENSE  README.md


In [12]:
!pip install eli5

Requirement already up-to-date: eli5 in /usr/local/lib/python3.6/dist-packages (0.10.1)
Requirement already up-to-date: pandas in /usr/local/lib/python3.6/dist-packages (1.0.1)
Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.22.1)


In [0]:
from ast import literal_eval

import numpy as np
import pandas as pd
import eli5
from eli5 import show_weights
from eli5.sklearn import PermutationImportance

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from tqdm import tqdm_notebook

In [15]:
# Load the price dataset (path is relative to the project directory) and show its shape.
df = pd.read_csv(filepath_or_buffer="data/df-usd-99.csv", low_memory=False)
df.shape

(18280, 48)

In [0]:
def score_model(feats, model=None, data=None, target="prices_amountmin", cv=None):
    """Cross-validate a regressor and summarise its mean absolute error.

    Parameters
    ----------
    feats : sequence of str
        Column names used as model inputs.
    model : estimator, optional
        Regressor passed to ``cross_val_score``. When omitted, a fresh
        ``DecisionTreeRegressor(criterion="mae", max_depth=15)`` is created on
        every call instead of one instance being built at definition time.
    data : DataFrame, optional
        Frame to read ``feats`` and ``target`` from. Defaults to the
        notebook-level ``df`` so existing calls keep working.
    target : str, optional
        Name of the column to predict.
    cv : optional
        Forwarded unchanged to ``cross_val_score``; ``None`` keeps the
        library's default splitting.

    Returns
    -------
    (float, float)
        Mean and standard deviation of the per-fold MAE.
    """
    if model is None:
        # NOTE(review): newer scikit-learn releases spell this criterion
        # "absolute_error"; "mae" matches the version installed above.
        model = DecisionTreeRegressor(criterion="mae", max_depth=15)
    if data is None:
        data = df

    X = data[feats].values
    y = data[target].values
    # The scorer reports negated MAE (higher is better), so flip the sign back.
    fold_mae = -cross_val_score(
        estimator=model, X=X, y=y, scoring="neg_mean_absolute_error", cv=cv
    )
    return np.mean(fold_mae), np.std(fold_mae)

In [0]:
# Encode the brand as integer codes after normalising case and surrounding whitespace,
# then start the feature list with that single column.
brand_normalized = df["brand"].str.lower().str.strip()
df["brand_cat"] = pd.factorize(brand_normalized)[0]
feats = ["brand_cat"]

In [159]:
%%time
print(score_model(feats))

(47.89569556892779, 5.128382840134182)
CPU times: user 9.26 s, sys: 5.65 ms, total: 9.27 s
Wall time: 9.29 s


In [160]:
%%time
rf = RandomForestRegressor(n_estimators=5, criterion="mae", max_depth=15, min_samples_leaf=2, n_jobs=2)
print(score_model(feats, rf))

(46.142964496717724, 4.93104411712265)
CPU times: user 131 ms, sys: 41.2 ms, total: 172 ms
Wall time: 16.6 s


In [161]:
%%time 
from sklearn.neighbors import KNeighborsRegressor
for i in range(2,10,1):
    knn = KNeighborsRegressor(n_neighbors=i, n_jobs=2)
    print(i, score_model(feats, knn))

2 (51.54522647702407, 2.9459280444764406)
3 (50.07817231947483, 3.4184283172140573)
4 (49.217338758205685, 3.720954958940563)
5 (49.13412603938731, 3.3475433446031637)
6 (49.07614752005835, 3.588554912844922)
7 (49.25573874648327, 3.694208710286333)
8 (49.80676114606128, 3.799331235078319)
9 (49.637178033065894, 3.760480299301029)
CPU times: user 621 ms, sys: 26.4 ms, total: 648 ms
Wall time: 1.13 s


In [76]:
# Feature extraction from the raw `features` column: look at five random raw cells first.
df["features"].sample(n=5).values

array(['[{"key":"Material","value":["Plastic"]},{"key":"Gender","value":["Men"]},{"key":"Color","value":["Brown"]},{"key":"Manufacturer Part Number","value":["Does not apply"]},{"key":"Brand","value":["Unique Bargains"]},{"key":"Age Group","value":["Adult"]}]',
       nan,
       '[{"key":"Style","value":["Messenger/Shoulder Bag"]},{"key":"Condition","value":["New with tags"]},{"key":"Material","value":["Leather"]},{"key":"Country of Manufacture","value":["China"]}]',
       '[{"key":"Heel Height","value":["Low (3/4 in. to 1 1/2 in.)"]},{"key":"Material","value":["Canvas"]},{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["8"]},{"key":"Size","value":["8"]},{"key":"Color","value":["Black"]},{"key":"Model","value":["1008648-BLK"]},{"key":"Manufacturer Part Number","value":["1008648-BLK"]},{"key":"Brand","value":["Teva"]},{"key":"Age Group","value":["Adult"]},{"key":"Shoe Width","value":["D(M)"]}]',
       '[{"key":"Material","value":["Synthetic"]},{"key":"Gender","value":["Me

In [90]:
# Parse one random raw cell to see its structure. Missing cells are dropped
# first: the column contains NaN, and literal_eval raises on a NaN input.
line = df["features"].dropna().sample().values
literal_eval(line[0])

[{'key': 'Sports Team', 'value': ['Baltimore Ravens']},
 {'key': 'Brand', 'value': ['Gifts']},
 {'key': 'Color', 'value': ['Other']},
 {'key': 'Sports League', 'value': ['NFL']}]

In [110]:
def parse_features(x):
    """Turn one raw ``features`` cell into a flat ``{key: value}`` dict.

    The cell is expected to hold the text of a list of
    ``{"key": ..., "value": [...]}`` records. The text is lower-cased and
    stripped, each key is stripped again, and the first entry of each value
    list becomes the value.

    Parameters
    ----------
    x : str, float or None
        Raw cell content. ``None``, NaN, the string ``'nan'`` and empty or
        whitespace-only text are all treated as "no features".

    Returns
    -------
    dict
        Mapping of normalised key to normalised first value. Records whose
        value list is empty are skipped instead of raising ``IndexError``.
    """
    # Missing cells arrive as float NaN (NaN != NaN) or None: nothing to parse.
    if x is None or (isinstance(x, float) and x != x):
        return {}

    text = str(x).lower().strip().replace('\\\\"','')
    if not text or text == 'nan':
        return {}

    output_dict = {}
    for item in literal_eval(text):
        values = item["value"]
        if not values:
            # A key with no recorded value carries no information.
            continue
        key = item["key"].lower().strip()
        output_dict[key] = values[0].lower().strip()
    return output_dict

# Parse every raw cell into a dict and preview five random results.
df["features_parsed"] = df["features"].apply(parse_features)
df["features_parsed"].sample(n=5)

7875    {'style': 'messenger/shoulder bag', 'material'...
5681    {'gender': 'men', 'size': '14 m', 'color': 'oa...
9619                                                   {}
6954                                                   {}
7529    {'gender': 'men', 'shoe size': 'm', 'shoe cate...
Name: features_parsed, dtype: object

In [113]:
# Collect every distinct feature key that occurs in any parsed row.
keys = {key for parsed in df["features_parsed"] for key in parsed.keys()}
len(keys)

476

In [0]:
def get_name_feat(key):
    """Return the DataFrame column name used for the parsed feature ``key``."""
    prefix = "feat_"
    return prefix + key

# One column per feature key: the parsed value where the row has that key, NaN otherwise.
for key in keys:
    df[get_name_feat(key)] = df["features_parsed"].map(
        lambda parsed: parsed.get(key, np.nan)
    )

In [0]:
# Percentage of rows in which each feature key is present (non-null).
keys_stat = {
    key: int(df[get_name_feat(key)].notnull().sum()) / df.shape[0] * 100
    for key in keys
}

In [0]:
# Keep only the feature columns that are filled in for more than 5% of the rows.
to_use = ["feat_" + k for k, v in keys_stat.items() if v > 5]

In [137]:
# Display the feature columns that passed the coverage threshold.
to_use

['feat_sport',
 'feat_fabric material',
 'feat_condition',
 'feat_shoe category',
 'feat_material',
 'feat_casual & dress shoe style',
 'feat_age group',
 'feat_model',
 'feat_style',
 'feat_size',
 'feat_brand',
 'feat_shoe width',
 'feat_shipping weight (in pounds)',
 'feat_color',
 'feat_shoe size',
 'feat_manufacturer part number',
 'feat_occasion',
 'feat_heel height',
 'feat_assembled product dimensions (l x w x h)',
 'feat_gender',
 'feat_fabric content']

In [0]:
# Integer-encode every retained feature column into a sibling "<name>_cat" column.
for col in to_use:
    df[f"{col}_cat"] = pd.factorize(df[col])[0]

In [141]:
# Check that the encoded "_cat" columns were appended to the frame.
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_shoe width_cat', 'feat_shipping weight (in pounds)_cat',
       'feat_color_cat', 'feat_shoe size_cat',
       'feat_manufacturer part number_cat', 'feat_occasion_cat',
       'feat_heel height_cat',
       'feat_assembled product dimensions (l x w x h)_cat', 'feat_gender_cat',
       'feat_fabric content_cat'],
      dtype='object', length=548)

In [0]:
# df.drop("brand_cat_cat", inplace=True, axis=1)

In [0]:
# Integer-encode a handful of raw columns the same way as the parsed features.
for source_col in (
    "manufacturernumber",
    "manufacturer",
    "categories",
    "prices_merchant",
    "prices_issale",
):
    df[source_col + "_cat"] = df[source_col].factorize()[0]

In [169]:
# Model inputs are the integer-encoded columns, identified by their "_cat" suffix.
# Matching on the suffix (not on a substring) keeps raw columns such as
# "feat_catalog" out without a hard-coded removal that would raise ValueError
# whenever that column is absent.
features_to_use = [col for col in df.columns if col.endswith("_cat")]
features_to_use

['brand_cat',
 'feat_sport_cat',
 'feat_fabric material_cat',
 'feat_condition_cat',
 'feat_shoe category_cat',
 'feat_material_cat',
 'feat_casual & dress shoe style_cat',
 'feat_age group_cat',
 'feat_model_cat',
 'feat_style_cat',
 'feat_size_cat',
 'feat_brand_cat',
 'feat_shoe width_cat',
 'feat_shipping weight (in pounds)_cat',
 'feat_color_cat',
 'feat_shoe size_cat',
 'feat_manufacturer part number_cat',
 'feat_occasion_cat',
 'feat_heel height_cat',
 'feat_assembled product dimensions (l x w x h)_cat',
 'feat_gender_cat',
 'feat_fabric content_cat',
 'manufacturernumber_cat',
 'manufacturer_cat',
 'categories_cat',
 'prices_merchant_cat',
 'prices_issale_cat']

In [0]:
# Design matrix, target and the forest used for the permutation-importance study.
# random_state is fixed so the fitted forest, and everything derived from it,
# is reproducible.
X = df[features_to_use].values
y = df["prices_amountmin"].values
m = RandomForestRegressor(
    n_estimators=10,
    criterion="mae",
    max_depth=20,
    min_samples_leaf=2,
    n_jobs=2,
    random_state=0,
)

In [172]:
# Fit the forest, then measure how much shuffling each column hurts its score.
# random_state makes the shuffles, and therefore the ranking, repeatable.
# NOTE(review): importances are computed on the same X, y the model was fitted
# on; a held-out split would give a less optimistic picture.
m.fit(X, y)
perm = PermutationImportance(m, random_state=0).fit(X, y)
eli5.show_weights(perm, feature_names=features_to_use)

Weight,Feature
0.5589  ± 0.0231,prices_merchant_cat
0.3026  ± 0.0119,categories_cat
0.2731  ± 0.0081,brand_cat
0.1862  ± 0.0108,feat_material_cat
0.1718  ± 0.0239,prices_issale_cat
0.1563  ± 0.0078,feat_gender_cat
0.1551  ± 0.0086,manufacturernumber_cat
0.0931  ± 0.0025,feat_brand_cat
0.0721  ± 0.0052,feat_style_cat
0.0331  ± 0.0014,manufacturer_cat


In [173]:
# Cross-validated MAE of the forest on every encoded feature.
print(score_model(feats=features_to_use, model=m))

(49.269091876367604, 9.738441245527286)


In [190]:
# Keep the ten features with the highest permutation importance.
importance_table = pd.DataFrame(zip(features_to_use, perm.feature_importances_))
ranked = importance_table.sort_values(1, ascending=False)
better_features = ranked[0].values[:10]
better_features

array(['prices_merchant_cat', 'categories_cat', 'brand_cat',
       'feat_material_cat', 'prices_issale_cat', 'feat_gender_cat',
       'manufacturernumber_cat', 'feat_brand_cat', 'feat_style_cat',
       'manufacturer_cat'], dtype=object)

In [191]:
# Cross-validated MAE of the forest restricted to the ten selected features.
print(score_model(feats=better_features, model=m))

(49.48570484135667, 10.09237176468882)


In [0]:
ls

In [192]:
# Stage this notebook for commit.
# NOTE(review): the recorded run failed with "pathspec 'day5.ipynb' did not match
# any files" and the earlier directory listing shows no day5.ipynb, so the notebook
# presumably had not been saved into the repository yet. Confirm before re-running.
!git add day5.ipynb

fatal: pathspec 'day5.ipynb' did not match any files


In [0]:
!git commit "adding day5 challenge"

In [0]:
# Push the local commits to the repository's configured remote.
!git push