In [1]:
from functools import reduce
from typing import List

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Add
from tensorflow.keras.models import Model

from interact.fields import FieldsManager, SparseField
from interact.layers import SparseLinear, AddBias
from interact.utils import to_sequences

from utils import get_dataset, DataSet, cost, MercariTranformer

In [2]:
# Load the training and validation splits through the project's dataset helper
# (training split first, then validation).
train, valid = (get_dataset(split) for split in (DataSet.Train, DataSet.Valid))

In [3]:
# Both bag-of-words vectorizers drop tokens that occur in fewer than MIN_DF
# documents.
# NOTE(review): which text column each positional vectorizer is applied to is
# decided inside MercariTranformer — confirm against utils.py.
MIN_DF = 10
mercari_transformer = MercariTranformer(
    CountVectorizer(min_df=MIN_DF),
    CountVectorizer(min_df=MIN_DF),
)

In [4]:
%%time
mercari_transformer.fit(pd.concat([train, valid], axis=0, ignore_index=True))

CPU times: user 53.2 s, sys: 1.25 s, total: 54.4 s
Wall time: 54.4 s


In [5]:
%%time
# Run the fitted transformer over the training split. The result is indexed by
# column name further down (e.g. d_train["name"], d_train["category_name"]).
d_train = mercari_transformer.transform(train)

CPU times: user 1min 57s, sys: 577 ms, total: 1min 58s
Wall time: 1min 58s


In [6]:
%%time
# Run the same fitted transformer over the validation split. The result is
# indexed by column name further down (e.g. d_valid["name"]).
d_valid = mercari_transformer.transform(valid)

CPU times: user 21 s, sys: 24.2 ms, total: 21 s
Wall time: 21 s


In [7]:
# Field specifications, one SparseField per model input.
# NOTE(review): the vocabulary sizes are hard-coded; presumably they were read
# off the fitted transformer and have to be updated whenever it is refit —
# TODO confirm and derive them programmatically.
# NOTE(review): `m` equals the width of the matching input array built later
# (10 / 169 / 1 for the sequence inputs, 1 for the reshaped columns, 5 for
# category); `d` is the same for every field — its exact meaning is defined by
# interact.fields.SparseField.
FIELD_D = 5

f_name = SparseField(name='name', vocabulary_size=16543, m=10, d=FIELD_D)

f_description = SparseField(name='description', vocabulary_size=30164, m=169, d=FIELD_D)

f_brand = SparseField(name='brand', vocabulary_size=4658, m=1, d=FIELD_D)

f_condition = SparseField(name='item_condition_id', vocabulary_size=5, m=1, d=FIELD_D)

f_shipping = SparseField(name='shipping', vocabulary_size=2, m=1, d=FIELD_D)

f_category = SparseField(name='category', vocabulary_size=946, m=5, d=FIELD_D)

In [None]:
# Order matters: the model inputs are created from this list, and the
# train/valid input lists are assembled in the same order
# (name, description, brand, condition, shipping, category).
fields = [f_name, f_description, f_brand, f_condition, f_shipping, f_category]

In [None]:
# Ask FieldsManager to validate the field list before it is used to build the
# model inputs (presumably raises on an invalid configuration — confirm).
FieldsManager.validate_fields(fields)

In [None]:
# Create the model inputs from the field list; `inputs` is later zipped with
# `fields`, so it is expected to hold one entry per field in the same order.
inputs = FieldsManager.fields2inputs(fields)

In [12]:
# Build one linear term per (input, field) pair.
# NOTE(review): 0.001 is passed as the third positional argument of
# input2linear — presumably a regularization strength; confirm in
# interact.fields.
linear_terms = [
    FieldsManager.input2linear(field_input, field_spec, 0.001)
    for field_input, field_spec in zip(inputs, fields)
]

In [13]:
# Display the per-field linear-term tensors built above.
linear_terms

[<tf.Tensor 'sparse_linear/Identity:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'sparse_linear_1/Identity:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'sparse_linear_2/Identity:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'sparse_linear_3/Identity:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'sparse_linear_4/Identity:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'sparse_linear_5/Identity:0' shape=(None, 1) dtype=float32>]

In [14]:
# Sum the per-field linear terms and add a bias on top.
# A single term is passed through unchanged because the Keras Add layer
# requires at least two input tensors.
bias_layer = AddBias()
if len(linear_terms) > 1:
    summed_terms = Add()(linear_terms)
else:
    summed_terms = linear_terms[0]
linear_part = bias_layer(summed_terms)

In [15]:
# Display the output tensor of the linear part (sum of terms plus bias).
linear_part

<tf.Tensor 'add_bias/Identity:0' shape=(None, 1) dtype=float32>

In [16]:
# Wrap the graph in a Keras model: field inputs in, linear score out.
model = Model(inputs=inputs, outputs=linear_part)
# Plain SGD on mean squared error.
model.compile(loss='mse', optimizer='sgd')

In [17]:
train_inputs = []
train_inputs.append(to_sequences(d_train["name"], 10))
train_inputs.append(to_sequences(d_train["item_description"], 169))
train_inputs.append(to_sequences(d_train["brand_name"], 1))
train_inputs.append((d_train["item_condition_id"].argmax(axis=1) + 1).reshape((-1, 1)))
train_inputs.append((d_train["shipping"] + 1).values.reshape((-1, 1)))
train_inputs.append(d_train["category_name"])

In [18]:
valid_inputs = []
valid_inputs.append(to_sequences(d_valid["name"], 10))
valid_inputs.append(to_sequences(d_valid["item_description"], 169))
valid_inputs.append(to_sequences(d_valid["brand_name"], 1))
valid_inputs.append((d_valid["item_condition_id"].argmax(axis=1) + 1).reshape((-1, 1)))
valid_inputs.append((d_valid["shipping"] + 1).values.reshape((-1, 1)))
valid_inputs.append(d_valid["category_name"])

In [19]:
# Sanity check: shape of the last (category) training input.
train_inputs[-1].shape

(1082535, 5)

In [20]:
# Sanity check: shape of the last (category) validation input.
valid_inputs[-1].shape

(200000, 5)

In [21]:
# Train on log1p(price); predictions are mapped back with expm1 afterwards.
model.fit(
    x=train_inputs,
    y=np.log1p(train['price']),
    validation_data=(valid_inputs, np.log1p(valid['price'])),
    batch_size=32,
    epochs=10,
    shuffle=True,
    # EarlyStopping with its default settings decides when to stop before
    # the 10-epoch cap.
    callbacks=[EarlyStopping()],
)

Train on 1082535 samples, validate on 200000 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10


<tensorflow.python.keras.callbacks.History at 0x7f8143bfe208>

In [22]:
# Flatten the (n, 1) model output to 1-D, then undo the log1p target transform
# to get predictions on the original price scale.
y_pred = np.expm1(model.predict(valid_inputs).flatten())

In [41]:
# Score the validation predictions with the project's cost metric.
# NOTE(review): this cell's execution count (41) is out of sequence with the
# prediction cell, so the recorded score presumably comes from an earlier
# run — re-run or record which configuration produced it.
cost(pred=y_pred, true=valid['price'])

0.6095028814710424

In [51]:
# Score the validation predictions with the project's cost metric.
# NOTE(review): this cell's execution count (51) is out of sequence with the
# prediction cell, so the recorded score presumably comes from an earlier
# run — re-run or record which configuration produced it.
cost(pred=y_pred, true=valid['price'])

0.5613514051755778

In [61]:
# Score the validation predictions with the project's cost metric.
# NOTE(review): this cell's execution count (61) is out of sequence with the
# prediction cell, so the recorded score presumably comes from an earlier
# run — re-run or record which configuration produced it.
cost(pred=y_pred, true=valid['price'])

0.546719594823619

In [22]:
# Score the validation predictions with the project's cost metric.
# NOTE(review): this cell shares execution count 22 with the prediction cell,
# so the recorded score presumably comes from a different run — re-run or
# record which configuration produced it.
cost(pred=y_pred, true=valid['price'])

0.5455225279000435

In [22]:
# Score the validation predictions with the project's cost metric.
# NOTE(review): this cell shares execution count 22 with the prediction cell,
# so the recorded score presumably comes from a different run — re-run or
# record which configuration produced it.
cost(pred=y_pred, true=valid['price'])

0.5404183729077965

In [23]:
# Score the validation predictions (original price scale) against the true
# validation prices with the project's cost metric.
cost(pred=y_pred, true=valid['price'])

0.5219109465479098