In [14]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model

from interact.fields import FieldsManager, SparseField
from interact.layers import SparseLinear, AddBias
from interact.utils import to_sequences

from utils import get_dataset, DataSet, cost

In [2]:
train = get_dataset(DataSet.Train)
valid = get_dataset(DataSet.Valid)

In [3]:
train.head(2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,587969,587969,Men's H&M tshirt,1,Men/Tops/T-shirts,H&M,8.0,0,"Light yellow color, NWT"
1,94528,94528,Victoria Secret Vneck lot 3,2,Women/Tops & Blouses/T-Shirts,Victoria's Secret,13.0,1,victoria-s-secret-pink-essential-v-neck-tee vi...


In [4]:
valid.head(2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,537620,537620,AE BUNDLE FOR LISA BOWSER,2,Women/Jeans/Boot Cut,American Eagle,105.0,0,"Size 10 short , Bought these and laundered the..."
1,548690,548690,***FOR TAM ONLY*** IPAD MINI 4 CASE,3,"Electronics/Cell Phones & Accessories/Cases, C...",,10.0,1,No description yet


In [6]:
train['brand_name'].isnull().mean()

0.4266457897435187

In [7]:
n = train.shape[0]
train['brand_name'].value_counts().iloc[:10] / n * 100

Nike                 3.653462
PINK                 3.648750
Victoria's Secret    3.244052
LuLaRoe              2.099978
Apple                1.174927
FOREVER 21           1.028604
Nintendo             1.016133
Lululemon            0.982324
Michael Kors         0.940201
American Eagle       0.885145
Name: brand_name, dtype: float64

In [8]:
train['brand_name'].fillna('Missing', inplace=True)
valid['brand_name'].fillna('Missing', inplace=True)

In [10]:
lb = LabelBinarizer(sparse_output=True)

In [11]:
X_train_brand = lb.fit_transform(train['brand_name'])
X_valid_brand = lb.transform(valid['brand_name'])

In [12]:
X_train_brand

<1082535x4427 sparse matrix of type '<class 'numpy.int64'>'
	with 1082535 stored elements in Compressed Sparse Row format>

In [13]:
X_valid_brand

<200000x4427 sparse matrix of type '<class 'numpy.int64'>'
	with 199750 stored elements in Compressed Sparse Row format>

In [15]:
X_train_brand_seq = to_sequences(X_train_brand, 1)
X_valid_brand_seq = to_sequences(X_valid_brand, 1)

In [28]:
len(lb.classes_)

4427

In [29]:
f_brand = SparseField(
    name='brand', 
    vocabulary_size=len(lb.classes_),
    m=1, 
    d=5,
)

In [30]:
i = FieldsManager.fields2inputs([f_brand])[0]
sparse_linear = SparseLinear(vocabulary_size=len(lb.classes_), alpha=0.001)
o = AddBias()(sparse_linear(i))

In [31]:
model = Model(i, o)
model.compile(optimizer='sgd', loss='mse')

In [32]:
model.fit(
    X_train_brand_seq, 
    np.log1p(train['price']),
    epochs=10,
    batch_size=32,
    shuffle=True,
    validation_data=(
        X_valid_brand_seq,
        np.log1p(valid['price'])
    ), 
    callbacks=[EarlyStopping()]
)

Train on 1082535 samples, validate on 200000 samples
Epoch 1/10
     32/1082535 [..............................] - ETA: 2:42:33 - loss: 8.3111

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10


<tensorflow.python.keras.callbacks.History at 0x7f92da34d0f0>

In [33]:
y_pred_valid = np.expm1(model.predict(X_valid_brand_seq)).flatten()

In [34]:
cost(true=valid['price'], pred=y_pred_valid)

0.6800495665760111

In [35]:
np.mean(
    (np.log1p(valid['price']) - np.log1p(y_pred_valid)) ** 2
)

0.46246741300022054