In [1]:
from functools import reduce
from typing import List

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model

from interact.fields import FieldsManager, SparseField
from interact.layers import SparseLinear, AddBias
from interact.utils import to_sequences

from utils import get_dataset, DataSet, cost

In [2]:
train = get_dataset(DataSet.Train)
valid = get_dataset(DataSet.Valid)

In [3]:
train.head(2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,587969,587969,Men's H&M tshirt,1,Men/Tops/T-shirts,H&M,8.0,0,"Light yellow color, NWT"
1,94528,94528,Victoria Secret Vneck lot 3,2,Women/Tops & Blouses/T-Shirts,Victoria's Secret,13.0,1,victoria-s-secret-pink-essential-v-neck-tee vi...


In [4]:
valid.head(2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,537620,537620,AE BUNDLE FOR LISA BOWSER,2,Women/Jeans/Boot Cut,American Eagle,105.0,0,"Size 10 short , Bought these and laundered the..."
1,548690,548690,***FOR TAM ONLY*** IPAD MINI 4 CASE,3,"Electronics/Cell Phones & Accessories/Cases, C...",,10.0,1,No description yet


In [6]:
cv_desc = CountVectorizer(min_df=10)

cv_desc.fit(np.hstack([
    train["item_description"].fillna('').values,
    valid["item_description"].fillna('').values
]))

X_name_train = cv_desc.transform(train["item_description"].fillna(''))
X_name_valid = cv_desc.transform(valid["item_description"].fillna(''))

In [7]:
X_name_train

<1082535x30164 sparse matrix of type '<class 'numpy.int64'>'
	with 22138898 stored elements in Compressed Sparse Row format>

In [9]:
X_name_train.sum(axis=1).max()

194

In [10]:
X_name_train.sum(axis=1).mean()

24.015639217207756

In [12]:
vocabulary_size = len(cv_desc.vocabulary_)

In [13]:
vocabulary_size

30164

In [15]:
f_desc = SparseField(
    name='description', 
    vocabulary_size=vocabulary_size,
    m=194, 
    d=5,
)

In [16]:
f_desc

{'name': 'description', 'vocabulary_size': 30164, 'm': 194, 'd': 5, 'dtype': 'int32'}

In [17]:
i = FieldsManager.fields2inputs([f_desc])[0]
sparse_linear = SparseLinear(vocabulary_size=vocabulary_size, alpha=0.001)
o = AddBias()(sparse_linear(i))

In [18]:
X_desc_train_seq = to_sequences(X_name_train, seq_len=194)
X_desc_valid_seq = to_sequences(X_name_valid, seq_len=194)

In [24]:
model = Model(i, o)
model.compile(optimizer='sgd', loss='mse')

In [25]:
model.fit(
    X_desc_train_seq, 
    np.log1p(train['price']),
    epochs=10,
    batch_size=32,
    shuffle=True,
    validation_data=(
        X_desc_valid_seq,
        np.log1p(valid['price'])
    ), 
    callbacks=[EarlyStopping()]
)

Train on 1082535 samples, validate on 200000 samples
Epoch 1/10
     32/1082535 [..............................] - ETA: 2:45:43 - loss: 10.5112

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10


<tensorflow.python.keras.callbacks.History at 0x7fc5921120b8>

In [27]:
y_pred_valid = np.expm1(model.predict(X_desc_valid_seq)).flatten()

In [28]:
cost(true=valid['price'], pred=y_pred_valid)

0.6336271746254887