In [1]:
import pandas as pd
import numpy as np

from collections import Counter

import os
import gc
import math

from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
%%time
def to_categorical(dataset):
    """Cast the text-valued category columns of `dataset` to pandas 'category' dtype.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or as an expression.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'category_name' and 'brand_name'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # 'item_condition_id' is deliberately not converted here (the conversion
    # was disabled in the original code).
    for column in ('category_name', 'brand_name'):
        dataset[column] = dataset[column].astype('category')
    return dataset
    
# Load the raw train/test tables (tab-separated files).
df_train = pd.read_csv('../input/train.tsv', sep='\t')
df_test = pd.read_csv('../input/test.tsv', sep='\t')

# Regression target: log(1 + price).
df_train['target'] = np.log1p(df_train['price'])

# Replace missing text fields with explicit placeholder tokens.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained form operates on a
# temporary Series, is deprecated in recent pandas, and leaves the frame
# untouched when Copy-on-Write is enabled.
fill_values = {
    "category_name": "unk_category",
    "item_description": "missing",
    "brand_name": "unk_brand",
    "name": "unk_name",
}
for frame in (df_train, df_test):
    for column, placeholder in fill_values.items():
        frame[column] = frame[column].fillna(placeholder)

df_train = to_categorical(df_train)
df_test = to_categorical(df_test)

print("Handling categorical variables...")
le = LabelEncoder()

# Count category occurrences over train and test together.
category = np.hstack([df_train.category_name, df_test.category_name])

c = Counter(category)
# Every category seen fewer than 100 times is folded into "unk_category".
category_dict = {name: "unk_category" for name, count in c.items() if count < 100}

df_train.category_name = df_train.category_name.apply(lambda x: category_dict.get(x, x))
df_test.category_name = df_test.category_name.apply(lambda x: category_dict.get(x, x))

# Integer-encode categories; the encoder is fitted on train+test so both
# frames share one label space.
le.fit(np.hstack([df_train.category_name, df_test.category_name]))
df_train['category'] = le.transform(df_train.category_name)
df_test['category'] = le.transform(df_test.category_name)

# Same for brands; the raw brand column is dropped once encoded.
le.fit(np.hstack([df_train.brand_name, df_test.brand_name]))
df_train['brand'] = le.transform(df_train.brand_name)
df_test['brand'] = le.transform(df_test.brand_name)
del le, df_train['brand_name'], df_test['brand_name']

Handling categorical variables...
CPU times: user 19.7 s, sys: 1.08 s, total: 20.8 s
Wall time: 20.8 s


In [3]:
%%time
lmtz = WordNetLemmatizer()


def _lemmatize_text(text):
    """Lemmatize every whitespace-separated token of `text` and re-join with single spaces."""
    return " ".join(lmtz.lemmatize(token) for token in text.split())


# Apply the same normalisation to the item descriptions of both frames.
df_train.item_description = df_train.item_description.apply(_lemmatize_text)
df_test.item_description = df_test.item_description.apply(_lemmatize_text)

CPU times: user 4min 19s, sys: 527 ms, total: 4min 20s
Wall time: 4min 20s


In [4]:
%%time
# One text per training row: description, name and category joined by spaces.
document = list(
    df_train.item_description.values
    + " " + df_train.name.values
    + " " + df_train.category_name.values
)

CPU times: user 1.64 s, sys: 445 ms, total: 2.09 s
Wall time: 2.08 s


In [5]:
%%time
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

# Wrap each row's whitespace tokens together with a unique tag of the form
# "label_<row position>".
documents = [
    LabeledSentence(sentence.split(), ["label" + '_%s' % item_no])
    for item_no, sentence in enumerate(document)
]

# train the model
model_doc2vec = Doc2Vec(
    documents,
    size=150,
    window=4,
    min_count=10,
    seed=11,
    sample=1e-4,
    alpha=0.01,
)

  


CPU times: user 13min 5s, sys: 1min 17s, total: 14min 22s
Wall time: 7min 43s


In [15]:
from keras.preprocessing.text import Tokenizer
# Vocabulary is fitted on the lower-cased training text of all three fields.
raw_text = np.hstack([
    df_train.category_name.str.lower(),
    df_train.item_description.str.lower(),
    df_train.name.str.lower(),
])

tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)
print("Transforming text to seq...")
# Add a "seq_<column>" column of token-id lists to both frames for each text
# field (train first, then test, field by field).
for text_col in ("category_name", "item_description", "name"):
    for frame in (df_train, df_test):
        frame["seq_" + text_col] = tok_raw.texts_to_sequences(frame[text_col].str.lower())
df_train.head(3)

Transforming text to seq...


Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,target,category,brand,seq_text,seq_item_description,seq_name,seq_category_name
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,10.0,1,No description yet,2.397895,371,5287,"[73, 44, 70, 76]","[13, 86, 102]","[2478, 8729, 7993, 70, 91, 7, 205]","[73, 44, 70, 76]"
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,52.0,0,This keyboard is in great condition and work l...,3.970292,67,3889,"[61, 946, 878, 3457, 2066]","[33, 2684, 11, 8, 51, 18, 1, 258, 65, 20, 1226...","[10654, 25140, 16087, 2684]","[61, 946, 878, 3457, 2066]"
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,10.0,1,Adorable top with a hint of lace and a key hol...,2.397895,623,4588,"[2, 44, 72, 277]","[708, 69, 10, 3, 4599, 12, 239, 1, 3, 893, 577...","[7595, 10467, 277]","[2, 44, 72, 277]"


In [16]:
def _max_token_id(*seq_columns):
    """Return the largest token id found in any sequence of the given columns.

    Each column is an iterable of token-id lists. Empty lists are skipped;
    0 is returned when no column contains any token.
    """
    return max(
        (max(seq) for column in seq_columns for seq in column if len(seq) > 0),
        default=0,
    )


# Embedding input sizes: one more than the largest integer code in train/test.
MAX_CATEGORY = np.max([df_train.category.max(), df_test.category.max()])+1
MAX_BRAND = np.max([df_train.brand.max(), df_test.brand.max()])+1
MAX_CONDITION = np.max([df_train.item_condition_id.max(),
                        df_test.item_condition_id.max()])+1
# Padded sequence lengths for the name and description token sequences.
MAX_NAME_SEQ = 20
MAX_ITEM_DESC_SEQ = 60
# Largest token id over every sequence column, plus the original margin of 3.
# `Series.max()` on a column of lists returns the lexicographically greatest
# list, so taking np.max of that list does not give the global maximum id;
# the helper scans all sequences instead.
MAX_TEXT = _max_token_id(df_train.seq_name,
                         df_test.seq_name,
                         df_train.seq_category_name,
                         df_test.seq_category_name,
                         df_train.seq_item_description,
                         df_test.seq_item_description)+3

In [8]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between `y` and `y_pred`.

    Computes sqrt(mean((log(1 + y_pred) - log(1 + y)) ** 2)).

    Both inputs are converted to flat float arrays first, so values are paired
    by position. This matters for pandas Series: `y[i]` on a Series is a label
    lookup and fails or misaligns when the index is not 0..n-1.

    Parameters
    ----------
    y : array-like
        True values, each greater than -1.
    y_pred : array-like
        Predicted values, same length as `y`, each greater than -1.

    Returns
    -------
    float

    Raises
    ------
    AssertionError
        If the inputs differ in length.
    ValueError
        If any value is <= -1 (the logarithm is undefined there).
    ZeroDivisionError
        If the inputs are empty.
    """
    assert len(y) == len(y_pred)
    y_true = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(y_pred, dtype=float).ravel()
    # Keep raising on out-of-domain values instead of silently returning nan.
    if np.any(y_true <= -1.0) or np.any(y_hat <= -1.0):
        raise ValueError("math domain error")
    squared_log_errors = (np.log1p(y_hat) - np.log1p(y_true)) ** 2.0
    return (float(squared_log_errors.sum()) * (1.0/len(y_true))) ** 0.5

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 1% of the training rows as a local validation set (fixed seed).
dtrain, dtest = train_test_split(
    df_train,
    train_size=0.99,
    random_state=11,
)



In [11]:
%%time
from keras.layers import Input, Dropout, Dense, Activation, concatenate, Embedding, Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping#, TensorBoard
from keras import backend as K
from keras import optimizers
from keras import initializers

def get_keras_data(dataset):
    """Build the dict of named input arrays for the Keras model from `dataset`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns category, brand, item_condition_id, shipping,
        seq_name and seq_item_description.

    Returns
    -------
    dict
        Keys 'category', 'brand', 'item_condition', 'num_vars', 'name' and
        'item_desc'. 'num_vars' stacks the shipping flag next to the doc2vec
        vectors selected by `dataset.index`.

    Notes
    -----
    The doc2vec lookup uses `dataset.index` as positions into the trained
    document vectors, so this assumes the index holds the row numbers the
    doc2vec corpus was built from - TODO confirm before calling it on any
    frame other than a split of df_train.
    """
    # Imported here because no cell in the notebook imports pad_sequences;
    # without this the function raises NameError on a fresh kernel.
    from keras.preprocessing.sequence import pad_sequences

    X = {
        "category": np.array(dataset.category),
        'brand': np.array(dataset.brand),
        "item_condition": np.array(dataset.item_condition_id),
        "num_vars": np.column_stack((np.array(dataset.shipping),
                                     model_doc2vec.docvecs.doctag_syn0[dataset.index])),
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
        'item_desc': pad_sequences(dataset.seq_item_description,
                                   maxlen=MAX_ITEM_DESC_SEQ),
    }
    return X
    
# Model inputs for the training split and the held-out validation split.
X, X_valid = (get_keras_data(part) for part in (dtrain, dtest))

def get_model():
    """Build and compile the feed-forward regression network.

    The three integer-coded inputs (category, brand, item condition) are each
    embedded and flattened, then concatenated with the dense 'num_vars' block
    and passed through three ReLU layers with dropout, ending in a single
    linear output. The model is compiled with MSE loss and a default Adam
    optimizer.

    The width of the 'num_vars' input is read from the module-level dict `X`,
    so `X` must exist before this function is called.
    """
    # Inputs
    inputs = [
        Input(shape=[1], name="category"),
        Input(shape=[1], name="brand"),
        Input(shape=[1], name="item_condition"),
        Input(shape=[X["num_vars"].shape[1]], name="num_vars"),
    ]
    cat_in, brand_in, cond_in, numeric_in = inputs

    # Embedding layers for the integer-coded inputs
    embedded = [
        Embedding(MAX_CATEGORY, 32)(cat_in),
        Embedding(MAX_BRAND, 128)(brand_in),
        Embedding(MAX_CONDITION, 2)(cond_in),
    ]

    # Main trunk: flattened embeddings followed by the numeric features
    hidden = concatenate([Flatten()(emb) for emb in embedded] + [numeric_in])
    for units, drop_rate in ((1024, 0.4), (512, 0.4), (64, 0.05)):
        hidden = Dense(units, activation="relu")(hidden)
        hidden = Dropout(drop_rate)(hidden)

    prediction = Dense(1, activation="linear")(hidden)

    net = Model(inputs, prediction)
    net.compile(loss="mse", optimizer=optimizers.Adam())
    return net

def eval_model(model):
    """Score `model` on the held-out split and return its RMSLE.

    Predictions for the module-level `X_valid` are mapped back from log1p
    space with expm1 and compared against `dtest.price`. The score is printed
    and returned.
    """
    predicted_prices = np.expm1(model.predict(X_valid))[:, 0]
    actual_prices = np.array(dtest.price.values)

    score = rmsle(actual_prices, predicted_prices)
    print("RMSLE error on dev test: "+str(score))
    return score

def exp_decay(init, fin, steps):
    """Decay value derived from a start rate, an end rate and a step count."""
    return (init/fin)**(1/(steps-1)) - 1


# Training schedule
epochs = 2
BATCH_SIZE = 512 * 6
steps = (len(X['category']) // BATCH_SIZE) * epochs
# NOTE(review): lr_init < lr_fin makes init/fin < 1, so the computed decay is
# negative - confirm this is intended.
lr_init, lr_fin = 0.001, 0.009
lr_decay = exp_decay(lr_init, lr_fin, steps)

# Build the network and push the schedule values into its optimizer.
model = get_model()
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

# Fit on the training split; Keras holds out a further 1% for validation.
# (A ModelCheckpoint("embed_NN_.check", save_best_only=True) callback was
# previously tried here and is currently disabled.)
history = model.fit(
    X,
    dtrain.target,
    epochs=epochs,
    batch_size=BATCH_SIZE,
    validation_split=0.01,
    verbose=1,
)

v_rmsle = eval_model(model)

Train on 1453031 samples, validate on 14678 samples
Epoch 1/2
Epoch 2/2
RMSLE error on dev test: 0.5285650761160501
CPU times: user 25min 1s, sys: 1min 59s, total: 27min 1s
Wall time: 4min 10s
