In [49]:
from functools import reduce
from typing import List

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model

from interact.fields import FieldsManager, SparseField
from interact.layers import SparseLinear, AddBias
from interact.utils import to_sequences

from utils import get_dataset, DataSet, cost

In [2]:
train = get_dataset(DataSet.Train)
valid = get_dataset(DataSet.Valid)

In [3]:
train.head(2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,587969,587969,Men's H&M tshirt,1,Men/Tops/T-shirts,H&M,8.0,0,"Light yellow color, NWT"
1,94528,94528,Victoria Secret Vneck lot 3,2,Women/Tops & Blouses/T-Shirts,Victoria's Secret,13.0,1,victoria-s-secret-pink-essential-v-neck-tee vi...


In [4]:
valid.head(2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,537620,537620,AE BUNDLE FOR LISA BOWSER,2,Women/Jeans/Boot Cut,American Eagle,105.0,0,"Size 10 short , Bought these and laundered the..."
1,548690,548690,***FOR TAM ONLY*** IPAD MINI 4 CASE,3,"Electronics/Cell Phones & Accessories/Cases, C...",,10.0,1,No description yet


In [59]:
%%time
unique_categories = reduce(
    lambda x, y: set(x).union(y),
    np.hstack(
        [
            train['category_name'].fillna('').str.split('/').values,
            valid['category_name'].fillna('').str.split('/').values,
        ]
    )
)

CPU times: user 15.9 s, sys: 146 ms, total: 16 s
Wall time: 16 s


In [60]:
len(unique_categories)

947

In [61]:
list(unique_categories.difference(''))[0] == ''

True

In [62]:
category_mapping = {}
for i, category in enumerate(unique_categories):
    if category.strip() == '':
        assert i == 0
        
    category_mapping[category] = i

In [63]:
category_mapping

{'': 0,
 'Feather Beds': 1,
 'Home Decor': 2,
 'Doormats': 3,
 'Air Purifiers': 4,
 "Women's Accessories": 5,
 'Animals': 6,
 'Motorcycle & Powersports': 7,
 'Necklaces': 8,
 'Light': 9,
 'Shopping Cart Covers': 10,
 'Socks': 11,
 'Photography': 12,
 'Sweats & Hoodies': 13,
 'Brass Instruments': 14,
 'Scale Models': 15,
 'Skirt': 16,
 'Lights & Lighting Accessories': 17,
 'Knit Top': 18,
 'Snowboard': 19,
 'Satchel': 20,
 'Tank': 21,
 'Crochet': 22,
 'Health & Baby Care': 23,
 'Just Married': 24,
 'Vacuums & Floor Care': 25,
 'Felted': 26,
 'Pillows': 27,
 'Crib Netting': 28,
 'Bear': 29,
 'Gift Wrap': 30,
 'Backpack Style': 31,
 'Artwork': 32,
 'Wallets': 33,
 'Home Surveillance': 34,
 'Doorstops': 35,
 'Tiered': 36,
 'Bottles': 37,
 'Polo Shirt': 38,
 'Baby & Child Care': 39,
 'Nail Care': 40,
 'Face': 41,
 'Cross Stitch': 42,
 'Fabric Postcard': 43,
 'Suits & Blazers': 44,
 'Biography': 45,
 'Lighting & Studio': 46,
 'Tote': 47,
 "Kids' Bath": 48,
 'Fireplaces & Accessories': 49,
 '

In [64]:
train['category_name'].fillna('').str.split('/').apply(len).max()

5

In [65]:
valid['category_name'].fillna('').str.split('/').apply(len).max()

5

In [66]:
from typing import List

def category_to_vector(category: List[str]):
    v = [0] * 5
    for i, c in enumerate(category):
        v[i] = category_mapping[c]
    return v

In [67]:
X_train_category = train['category_name'].fillna('').str.split('/').apply(category_to_vector)

In [68]:
X_valid_category = valid['category_name'].fillna('').str.split('/').apply(category_to_vector)

In [70]:
X_train_category.iloc[:3]

0    [430, 167, 531, 0, 0]
1    [122, 742, 523, 0, 0]
2    [122, 377, 679, 0, 0]
Name: category_name, dtype: object

In [74]:
X_train_category = np.vstack(X_train_category).astype(np.int32)
X_valid_category = np.vstack(X_valid_category).astype(np.int32)

In [75]:
X_train_category.max()

946

In [76]:
X_valid_category.max()

946

In [77]:
f_category = SparseField(
    name='category', 
    vocabulary_size=946,
    m=5, 
    d=5,
)

In [80]:
i = FieldsManager.fields2inputs([f_category])[0]
sparse_linear = SparseLinear(vocabulary_size=946, alpha=0.001)
o = AddBias()(sparse_linear(i))

In [81]:
model = Model(i, o)
model.compile(optimizer='sgd', loss='mse')

In [82]:
model.fit(
    X_train_category, 
    np.log1p(train['price']),
    epochs=10,
    batch_size=32,
    shuffle=True,
    validation_data=(
        X_valid_category,
        np.log1p(valid['price'])
    ), 
    callbacks=[EarlyStopping()]
)

Train on 1082535 samples, validate on 200000 samples
Epoch 1/10
     32/1082535 [..............................] - ETA: 2:42:50 - loss: 11.2995

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10


<tensorflow.python.keras.callbacks.History at 0x7fc5e77d8080>

In [84]:
y_pred_valid = np.expm1(model.predict(X_valid_category)).flatten()

In [85]:
cost(true=valid['price'], pred=y_pred_valid)

0.6600339717368209

In [86]:
np.mean(
    (np.log1p(valid['price']) - np.log1p(y_pred_valid)) ** 2
)

0.4356448438466824