In [23]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model

from interact.fields import FieldsManager, SparseField
from interact.layers import SparseLinear, AddBias
from interact.utils import to_sequences

from utils import get_dataset, DataSet, cost

In [2]:
# Fetch the Train and Valid datasets through the project helper, in that order.
train, valid = (get_dataset(split) for split in (DataSet.Train, DataSet.Valid))

In [3]:
# Peek at the first two training rows to check the column layout.
train.head(n=2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,587969,587969,Men's H&M tshirt,1,Men/Tops/T-shirts,H&M,8.0,0,"Light yellow color, NWT"
1,94528,94528,Victoria Secret Vneck lot 3,2,Women/Tops & Blouses/T-Shirts,Victoria's Secret,13.0,1,victoria-s-secret-pink-essential-v-neck-tee vi...


In [4]:
# Peek at the first two validation rows; columns should match the training frame.
valid.head(n=2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,537620,537620,AE BUNDLE FOR LISA BOWSER,2,Women/Jeans/Boot Cut,American Eagle,105.0,0,"Size 10 short , Bought these and laundered the..."
1,548690,548690,***FOR TAM ONLY*** IPAD MINI 4 CASE,3,"Electronics/Cell Phones & Accessories/Cases, C...",,10.0,1,No description yet


In [5]:
# Bag-of-words vocabulary for the item `name` column.
# The vocabulary is learned from the names of BOTH splits stacked together;
# min_df=10 is passed to CountVectorizer as the minimum document frequency.
all_names = np.concatenate([train["name"].values, valid["name"].values])
cv_name = CountVectorizer(min_df=10).fit(all_names)

# Encode each split with the shared vocabulary.
X_name_train = cv_name.transform(train["name"])
X_name_valid = cv_name.transform(valid["name"])

In [11]:
# Largest number of distinct vocabulary terms present in any single row.
# The maximum is taken over BOTH splits because m_name is later used as
# `seq_len` when converting the training AND the validation matrices to
# sequences; deriving it from the training matrix alone would leave any
# longer validation row uncovered.
# NOTE(review): how to_sequences treats rows longer than seq_len is not
# visible here -- confirm.
m_name = int(max(
    (X_name_train > 0).sum(axis=1).max(),
    (X_name_valid > 0).sum(axis=1).max(),
))
m_name

10

In [12]:
# Number of entries in the fitted vectorizer's vocabulary.
vocabulary_size = len(cv_name.vocabulary_)

In [13]:
# Display the vocabulary size computed from the fitted vectorizer.
vocabulary_size

16543

In [16]:
# Field descriptor for the sparse `name` feature.
#   vocabulary_size -- size of the CountVectorizer vocabulary
#   m               -- per-row token count bound computed above (m_name)
#   d               -- presumably the per-term vector dimension; TODO confirm
#                      against interact.fields.SparseField
f_name = SparseField(name='name', vocabulary_size=vocabulary_size, m=m_name, d=5)

In [17]:
# Display the field descriptor to verify its settings.
f_name

{'name': 'name', 'vocabulary_size': 16543, 'm': 10, 'd': 5, 'dtype': 'int32'}

In [18]:
# Model graph: field input -> SparseLinear -> AddBias.
# `i` and `o` are the model's input and output tensors used when the Model is built.
# NOTE(review): alpha=0.001 is passed to both calls; it looks like a
# regularisation strength -- confirm against the interact library.
field_inputs = FieldsManager.fields2inputs([f_name], alpha=0.001)
i = field_inputs[0]
sparse_linear = SparseLinear(vocabulary_size=vocabulary_size, alpha=0.001)
add_bias = AddBias()
linear_term = sparse_linear(i)
o = add_bias(linear_term)

In [19]:
# Convert both count matrices to sequences of length m_name using the
# project's to_sequences helper (train first, then valid).
X_name_train_seq, X_name_valid_seq = (
    to_sequences(counts, seq_len=m_name)
    for counts in (X_name_train, X_name_valid)
)

In [20]:
# Wrap the input/output tensors in a Keras Model and train it with plain SGD
# on a mean-squared-error objective.
model = Model(inputs=i, outputs=o)
model.compile(loss='mse', optimizer='sgd')

In [22]:
# Targets are log1p(price), so the loss is measured in log-price space.
y_train_log = np.log1p(train['price'])
y_valid_log = np.log1p(valid['price'])

# Train for up to 10 epochs; EarlyStopping (default settings) may stop sooner
# based on the validation data supplied here.
model.fit(
    x=X_name_train_seq,
    y=y_train_log,
    batch_size=32,
    epochs=10,
    shuffle=True,
    validation_data=(X_name_valid_seq, y_valid_log),
    callbacks=[EarlyStopping()],
)

Train on 1082535 samples, validate on 200000 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10


<tensorflow.python.keras.callbacks.History at 0x7f727ec78320>

In [25]:
# The model was fit on log1p(price); expm1 maps its predictions back to the
# price scale, and flatten() yields a 1-D array.
log_pred_valid = model.predict(X_name_valid_seq)
y_pred_valid = np.expm1(log_pred_valid).flatten()

In [26]:
# Score the validation predictions against the true prices with the project's
# `cost` metric (its definition lives in utils and is not visible here).
cost(true=valid['price'], pred=y_pred_valid)

0.6102968168658583

In [29]:
import pandas as pd

In [30]:
# Every distinct label that appears at any level of the "/"-separated
# category_name paths in the training set, in order of first appearance.
# Missing values are dropped before the string cast: casting NaN with
# astype("str") yields the literal text "nan", which would otherwise be
# reported as if it were a real category.
category_paths = train["category_name"].dropna().unique().astype("str")
unique_categories = pd.Series("/".join(category_paths).split("/")).unique()

In [31]:
# Display the distinct category labels extracted from the training set.
unique_categories

array(['Men', 'Tops', 'T-shirts', 'Women', 'Tops & Blouses', 'T-Shirts',
       'Athletic Apparel', 'Sports Bras', 'Shirts & Tops', 'Kids', 'Toys',
       'Action Figures & Statues', 'Underwear', 'Bras', 'Beauty',
       'Skin Care', 'Hands & Nails', "Women's Handbags",
       'Totes & Shoppers', 'Jewelry', 'Necklaces', 'Other', 'Automotive',
       'Car Care', 'Shoes', 'Boots', 'Home', 'Artwork',
       'Posters & Prints', "Men's Accessories", 'Arts & Crafts',
       'Sweats & Hoodies', 'Sweatshirt, Pullover', 'Makeup', 'Eyes',
       "Women's Accessories", 'Wallets', 'Watches', 'Electronics',
       'Cell Phones & Accessories', 'Cases, Covers & Skins', 'Dresses',
       'Full-Length', 'Handmade', 'Accessories', 'Hair', 'Clothing',
       'Lingerie', 'Face', 'Backpack Style', 'Books',
       'Literature & Fiction', 'Pants, Tights, Leggings', 'Bath',
       'Bathroom Accessories', 'Vintage & Collectibles', 'Toy', 'Animal',
       'Jerseys', 'Girls (4+)', 'Tops & T-Shirts', 'Tank, Cami'