In [1]:
import re
from time import time
from collections import Counter

import tensorflow as tf
import pandas as pd
import numpy as np

from nltk.stem.porter import PorterStemmer
from fastcache import clru_cache as lru_cache

from sklearn.model_selection import ShuffleSplit
from sklearn import metrics

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook())

  from ._conv import register_converters as _register_converters


A Jupyter Widget

In [2]:
t_start = time()

stemmer = PorterStemmer()

def rmse(y_true, y_pred):
    return np.sqrt(metrics.mean_squared_error(y_true, y_pred))

@lru_cache(1024)
def stem(s):
    return stemmer.stem(s)

whitespace = re.compile(r'\s+')
non_letter = re.compile(r'\W+')

def tokenize(text):
    text = text.lower()
    text = non_letter.sub(' ', text)

    tokens = []

    for t in text.split():
        #t = stem(t)
        tokens.append(t)

    return tokens

class Tokenizer:
    def __init__(self, min_df=10, tokenizer=str.split):
        self.min_df = min_df
        self.tokenizer = tokenizer
        self.doc_freq = None
        self.vocab = None
        self.vocab_idx = None
        self.max_len = None

    def fit_transform(self, texts):
        tokenized = []
        doc_freq = Counter()
        n = len(texts)

        for text in texts:
            sentence = self.tokenizer(text)
            tokenized.append(sentence)
            doc_freq.update(set(sentence))

        vocab = sorted([t for (t, c) in doc_freq.items() if c >= self.min_df])
        vocab_idx = {t: (i + 1) for (i, t) in enumerate(vocab)}
        doc_freq = [doc_freq[t] for t in vocab]

        self.doc_freq = doc_freq
        self.vocab = vocab
        self.vocab_idx = vocab_idx

        max_len = 0
        result_list = []
        for text in tokenized:
            text = self.text_to_idx(text)
            max_len = max(max_len, len(text))
            result_list.append(text)

        self.max_len = max_len
        result = np.zeros(shape=(n, max_len), dtype=np.int32)
        for i in range(n):
            text = result_list[i]
            result[i, :len(text)] = text

        return result    

    def text_to_idx(self, tokenized):
        return [self.vocab_idx[t] for t in tokenized if t in self.vocab_idx]

    def transform(self, texts):
        n = len(texts)
        result = np.zeros(shape=(n, self.max_len), dtype=np.int32)

        for i in range(n):
            text = self.tokenizer(texts[i])
            text = self.text_to_idx(text)[:self.max_len]
            result[i, :len(text)] = text

        return result
    
    def vocabulary_size(self):
        return len(self.vocab) + 1



In [3]:
%%time
print('reading train data...')
df_train = pd.read_csv('../input/train.tsv', sep='\t')
df_train = df_train[df_train.price != 0].reset_index(drop=True)

price = df_train.pop('price')
y = np.log1p(price.values).reshape(-1,1)
mean = y.mean()
std = y.std()
ynorm = (y - mean) / std
ynorm = ynorm.reshape(-1, 1)

reading train data...
CPU times: user 4.15 s, sys: 289 ms, total: 4.44 s
Wall time: 4.47 s


In [4]:
%%time
df_train.name.fillna('unkname', inplace=True)
df_train.category_name.fillna('unk_cat', inplace=True)
df_train.brand_name.fillna('unk_brand', inplace=True)
df_train.item_description.fillna('nodesc', inplace=True)

CPU times: user 297 ms, sys: 1.25 ms, total: 298 ms
Wall time: 297 ms


In [5]:
%%time
#impute brand names using flashtext
def get_brands(x, keyword_processor):
    matches = keyword_processor.extract_keywords(' '.join(x))
    if len(matches) > 0:
        return matches[0]
    else:
        return None
    
from flashtext import KeywordProcessor
all_brands = df_train.loc[df_train.brand_name.value_counts().values > 3, 'brand_name'].tolist()
#all_brands.remove('always')

keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keywords_from_list(all_brands)
print(len(keyword_processor))

tmp = df_train[['name','item_description']].progress_apply(lambda x: get_brands(x, keyword_processor), axis=1).fillna("unk_brand")
missing_indices = df_train['brand_name'] == 'unk_brand'
df_train.loc[missing_indices, 'brand_name'] = tmp[missing_indices]
df_train.brand_name.fillna("unk_brand", inplace=True)

  0%|          | 414/1481661 [00:00<05:57, 4139.88it/s]

389


100%|██████████| 1481661/1481661 [01:13<00:00, 20173.63it/s]


CPU times: user 1min 13s, sys: 220 ms, total: 1min 14s
Wall time: 1min 13s


In [6]:
#bmiss = df_train.loc[df_train.brand_name == 'unk_brand'].index
#sum([1 if len(x)> 0 else 0 for x in tmp[bmiss]])
#
#c=Counter(df_train.brand_name.tolist())
#c.most_common(600)

#c=Counter([x[0] for x in tmp if len(x)>0])
#c.most_common(300)

In [7]:
%%time
print('processing category...')

def paths(tokens):
    all_paths = ['/'.join(tokens[0:(i+1)]) for i in range(len(tokens))]
    return ' '.join(all_paths)

@lru_cache(1024)
def cat_process(cat):
    cat = cat.lower()
    cat = whitespace.sub('', cat)
    split = cat.split('/')
    return paths(split)

df_train.category_name = df_train.category_name.apply(cat_process)

cat_tok = Tokenizer(min_df=50)
X_cat = cat_tok.fit_transform(df_train.category_name)
cat_voc_size = cat_tok.vocabulary_size()
print(cat_voc_size)

processing category...
853
CPU times: user 10.5 s, sys: 180 ms, total: 10.6 s
Wall time: 10.6 s


In [8]:
%%time
print('processing title...')

name_tok = Tokenizer(min_df=5, tokenizer=tokenize)
X_name = name_tok.fit_transform(df_train.name)
name_voc_size = name_tok.vocabulary_size()
print(name_voc_size)

processing title...
25587
CPU times: user 16.5 s, sys: 220 ms, total: 16.8 s
Wall time: 16.8 s


In [9]:
%%time
print('processing description...')

desc_num_col = 40
desc_tok = Tokenizer(min_df=10, tokenizer=tokenize)
X_desc = desc_tok.fit_transform(df_train.item_description)
X_desc = X_desc[:, :desc_num_col]
desc_voc_size = desc_tok.vocabulary_size()
print(desc_voc_size)

processing description...
32139
CPU times: user 47.9 s, sys: 932 ms, total: 48.8 s
Wall time: 48.8 s


In [10]:
%%time
print('processing brand...')

df_train.brand_name = df_train.brand_name.str.lower()
df_train.brand_name = df_train.brand_name.str.replace(' ', '_')

brand_cnt = Counter(df_train.brand_name[df_train.brand_name != 'unk_brand'])
brands = sorted(b for (b, c) in brand_cnt.items() if c >= 3)
brands_idx = {b: (i + 1) for (i, b) in enumerate(brands)}

X_brand = df_train.brand_name.apply(lambda b: brands_idx.get(b, 0))
X_brand = X_brand.values.reshape(-1, 1) 
brand_voc_size = len(brands) + 1
print(brand_voc_size)

processing brand...
3002
CPU times: user 1.63 s, sys: 16 ms, total: 1.65 s
Wall time: 1.65 s


In [11]:
%%time
print('processing other features...')

X_item_cond = (df_train.item_condition_id - 1).astype('uint8').values.reshape(-1, 1)
X_shipping = df_train.shipping.astype('float32').values.reshape(-1, 1)

processing other features...
CPU times: user 8.19 ms, sys: 55.7 ms, total: 63.8 ms
Wall time: 101 ms


In [12]:
print('defining the model...')

def prepare_batches(seq, step):
    n = len(seq)
    res = []
    for i in range(0, n, step):
        res.append(seq[i:i+step])
    return res

def conv1d(inputs, num_filters, filter_size, padding='same'):
    he_std = np.sqrt(2 / (filter_size * num_filters))
    out = tf.layers.conv1d(
        inputs=inputs, filters=num_filters, padding=padding,
        kernel_size=filter_size,
        activation=tf.nn.relu, 
        kernel_initializer=tf.random_normal_initializer(stddev=he_std))
    return out

def dense(X, size, reg=0.0, activation=None):
    he_std = np.sqrt(2 / int(X.shape[1]))
    out = tf.layers.dense(X, units=size, activation=activation, 
                     kernel_initializer=tf.random_normal_initializer(stddev=he_std),
                     kernel_regularizer=tf.contrib.layers.l2_regularizer(reg))
    return out

def embed(inputs, size, dim):
    std = np.sqrt(2 / dim)
    emb = tf.Variable(tf.random_uniform([size, dim], -std, std))
    lookup = tf.nn.embedding_lookup(emb, inputs)
    return lookup

defining the model...


In [36]:
name_embeddings_dim = 128
name_seq_len = X_name.shape[1]
desc_embeddings_dim = 128
desc_seq_len = X_desc.shape[1]

brand_embeddings_dim = 64

cat_embeddings_dim = 32
cat_seq_len = X_cat.shape[1]


graph = tf.Graph()
graph.seed = 1

with graph.as_default():
    place_name = tf.placeholder(tf.int32, shape=(None, name_seq_len))
    place_desc = tf.placeholder(tf.int32, shape=(None, desc_seq_len))
    place_brand = tf.placeholder(tf.int32, shape=(None, 1))
    place_cat = tf.placeholder(tf.int32, shape=(None, cat_seq_len))
    place_ship = tf.placeholder(tf.float32, shape=(None, 1))
    place_cond = tf.placeholder(tf.uint8, shape=(None, 1))

    place_y = tf.placeholder(dtype=tf.float32, shape=(None, 1))

    place_lr = tf.placeholder(tf.float32, shape=(), )

    name = embed(place_name, name_voc_size, name_embeddings_dim)
    desc = embed(place_desc, desc_voc_size, desc_embeddings_dim)
    brand = embed(place_brand, brand_voc_size, brand_embeddings_dim)
    cat = embed(place_cat, cat_voc_size, cat_embeddings_dim)

    name = conv1d(name, num_filters=20, filter_size=3)
    name = tf.layers.dropout(name, rate=0.05)
    name = tf.layers.average_pooling1d(name, pool_size=name_seq_len, strides=1, padding='valid')
    name = tf.contrib.layers.flatten(name)
    print(name.shape)

    desc = conv1d(desc, num_filters=20, filter_size=3)
    desc = tf.layers.dropout(desc, rate=0.1)
    desc = tf.layers.average_pooling1d(desc, pool_size=desc_seq_len, strides=1, padding='valid')

    desc = tf.contrib.layers.flatten(desc)
    print(desc.shape)

    brand = tf.contrib.layers.flatten(brand)
    print(brand.shape)

    cat = tf.layers.average_pooling1d(cat, pool_size=cat_seq_len, strides=1, padding='valid')
    cat = tf.contrib.layers.flatten(cat)
    print(cat.shape)
    
    ship = place_ship
    print(ship.shape)

    cond = tf.one_hot(place_cond, 5)
    cond = tf.contrib.layers.flatten(cond)
    print(cond.shape)

    out = tf.concat([name, desc, brand, cat, ship, cond], axis=1)
    print('concatenated dim:', out.shape)
    #out = tf.contrib.layers.batch_norm(out, decay=0.9)
    out = dense(out, 256, activation=tf.nn.relu)
    out = tf.layers.dropout(out, rate=0.0)
    #out = dense(out, 64, activation=tf.nn.relu)
    #out = tf.layers.dropout(out, rate=0.03)
    #out = tf.contrib.layers.batch_norm(out, decay=0.9)
    out = dense(out, 1)

    loss = tf.losses.mean_squared_error(place_y, out)
    rmse = tf.sqrt(loss)
    opt = tf.train.AdamOptimizer(learning_rate=place_lr)
    train_step = opt.minimize(loss)

    init = tf.global_variables_initializer()

session = tf.Session(config=None, graph=graph)
session.run(init)

(?, 20)
(?, 20)
(?, 64)
(?, 32)
(?, 1)
(?, 5)
concatenated dim: (?, 142)
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.


In [37]:
print('training the model...')

train_idx, val_idx= list(ShuffleSplit(1, test_size=0.05, random_state=2).split(X_name))[0]
lr_init=0.004
lr_decay=0.0012
lr = lr_init
for i in range(4):
    t0 = time()
    np.random.seed(i)
    np.random.shuffle(train_idx)
    batches = prepare_batches(train_idx, 500)

    #if i <= 2:
    #    lr = 0.006
    #else:
    #    lr = 0.001
    lr = lr_init - lr_decay*i
    print(lr)
    for j, idx in enumerate(batches):
        feed_dict = {
            place_name: X_name[idx],
            place_desc: X_desc[idx],
            place_brand: X_brand[idx],
            place_cat: X_cat[idx],
            place_cond: X_item_cond[idx],
            place_ship: X_shipping[idx],
            place_y: y[idx],
            place_lr: lr,
        }
        session.run(train_step, feed_dict=feed_dict)

    took = time() - t0
    print('Training epoch %d took %.3fs' % (i, took))
    val_batches = prepare_batches(val_idx, 5000)
    y_pred = np.zeros(len(X_name))
    for idx in val_batches:
        feed_dict = {
            place_name: X_name[idx],
            place_desc: X_desc[idx],
            place_brand: X_brand[idx],
            place_cat: X_cat[idx],
            place_cond: X_item_cond[idx],
            place_ship: X_shipping[idx],
        }
        batch_pred = session.run(out, feed_dict=feed_dict)
        y_pred[idx] = batch_pred[:, 0]
    y_pred_val = y_pred[val_idx]
    y_true_val = y[val_idx][:,0]
    print("Validation rmse is ", np.sqrt(metrics.mean_squared_error(y_true_val, y_pred_val)))

training the model...
0.004
Training epoch 0 took 19.590s
Validation rmse is  0.4353417382652538
0.0028000000000000004
Training epoch 1 took 19.414s
Validation rmse is  0.42810988971528074
0.0016000000000000003
Training epoch 2 took 19.629s
Validation rmse is  0.4169708445792268
0.0004000000000000002
Training epoch 3 took 19.797s
Validation rmse is  0.4158965267751313


In [56]:
np.sqrt(metrics.mean_squared_error(y_true_val[np.abs(y_true_val- y_pred_val) <= 1.5], y_pred_val[np.abs(y_true_val- y_pred_val) <= 1.5]))

0.396548787979571

In [57]:
df_train.iloc[val_idx].loc[np.abs(y_true_val- y_pred_val) > 1.5].category_name.value_counts()

other other/other other/other/other                                                                                                                                              12
electronics electronics/videogames&consoles electronics/videogames&consoles/games                                                                                                11
women women/jewelry women/jewelry/necklaces                                                                                                                                      10
women women/jewelry women/jewelry/bracelets                                                                                                                                      10
women women/jewelry women/jewelry/rings                                                                                                                                           9
beauty beauty/skincare beauty/skincare/face                                                         

In [58]:
tmp = df_train.iloc[val_idx].loc[np.abs(y_true_val- y_pred_val) > 1.5]

In [None]:
tmp.loc[tmp.category_name == 