This Notebook is based on the [kernel](https://www.kaggle.com/lopuhin/eli5-for-mercari/notebook)

In [1]:
import eli5
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_log_error
import re
pd.set_option("max_colwidth", 500)

In [2]:
df_train = pd.read_table('../input/train.tsv')

In [3]:
y_train = np.log1p(df_train['price'])
df_train['category_name'] = df_train['category_name'].fillna('Other').astype(str)
df_train['brand_name'] = df_train['brand_name'].fillna('missing').astype(str)
df_train['shipping'] = df_train['shipping'].astype(str)
df_train['item_condition_id'] = df_train['item_condition_id'].astype(str)
df_train['item_description'] = df_train['item_description'].fillna('None')

In [34]:
def text_normalization(text_series):
    text_series = text_series.str.lower()
    text_series = text_series.str.replace(r'[\W+]', " ")
    text_series =  text_series.str.replace(r't-shirt|t-shirts', "tshirt")
    text_series =  text_series.str.replace(r'boot cut', "bootcut")
    text_series =  text_series.str.replace(r'16 gb', "16gb")
    text_series =  text_series.str.replace(r'32 gb', "32gb")
    text_series =  text_series.str.replace(r'64 gb', "64gb")
    text_series =  text_series.str.replace(r'128 gb', "12gb")
    text_series =  text_series.str.replace(r'250 gb', "250gb")
    text_series =  text_series.str.replace(r'500 gb', "500gb")
    text_series = text_series.str.replace(r'new with tags|nwt|new with tag|bnwt', "new_with_tags")
    text_series = text_series.str.replace(r'never wore|never used|neve used|never worn|never been worn', "never_used")
    text_series = text_series.str.replace(r'nwot|new without tag|new without tags', "new_without_tags")
    text_series = text_series.str.replace(r'your best offer|or best offer|orbestoffer|obo', "or_best_offer")
    text_series = text_series.str.replace(r'brand new', "brand_new")
    text_series = text_series.str.replace(r'hatchimals', "hatchimal")
    text_series = text_series.str.replace(r'hover board', "hoverboard")
    text_series = text_series.str.replace(r'weighs', "weights")
    text_series = text_series.str.replace(r'mk purses', "mk_purses")
    text_series = text_series.str.replace(r'pop stack', "pop_stack")
    text_series = text_series.str.replace(r'alexis brittar', "alexis_brittar")
    text_series = text_series.str.replace(r'14k|14kt|14 carat|14 k|14-karat|14-carat|14 gold', "14_carat")
    text_series = text_series.str.replace(r'10k|10kt|10 carat|10 k|10-karat|10-carat|10 gold', "10_carat")
    text_series = text_series.str.replace(r'24k|24kt|24 carat|24 k|24-karat|24-carat|24 gold', "24_carat")
    text_series = text_series.str.replace(r'18k|18kt|18 carat|18 k|18-karat|18-carat|18 gold', "18_carat")
    text_series = text_series.str.replace(r'dock a tot', "dockatot")
    text_series = text_series.str.replace(r'007 taz', "007taz")
    text_series = text_series.str.replace(r'playstation vr', "playstation_vr")
    text_series = text_series.str.replace(r'lil', "playstation_vr")
    return text_series

In [35]:
%%time
df_train.item_description = text_normalization(df_train.item_description)
df_train.name = text_normalization(df_train.name)

CPU times: user 1min 47s, sys: 1.36 s, total: 1min 48s
Wall time: 1min 49s


In [36]:
%%time
default_preprocessor = CountVectorizer().build_preprocessor()
def build_preprocessor(field):
    field_idx = list(df_train.columns).index(field)
    return lambda x: default_preprocessor(x[field_idx])
    
vectorizer = FeatureUnion([
    ('name', CountVectorizer(
        ngram_range=(1, 2),
        min_df = 5,
        preprocessor=build_preprocessor('name'))),
    ('category_name', CountVectorizer(
        token_pattern='.+',
        min_df = 5,
        preprocessor=build_preprocessor('category_name'))),
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('brand_name'))),
    ('shipping', CountVectorizer(
        token_pattern='\d+',
        min_df = 5,
        preprocessor=build_preprocessor('shipping'))),
    ('item_condition_id', CountVectorizer(
        token_pattern='\d+',
        preprocessor=build_preprocessor('item_condition_id'))),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 2),
        min_df = 5,
        preprocessor=build_preprocessor('item_description'))),
])
X_train = vectorizer.fit_transform(df_train.values)

CPU times: user 2min 57s, sys: 6.37 s, total: 3min 4s
Wall time: 3min 4s


In [37]:
X_train.shape

(1482535, 787838)

In [7]:
from scipy.sparse import csr_matrix

In [39]:
X_train = X_train.tocsr()

In [40]:
import threading
# the following functions allow for a parallelized batch generator
class threadsafe_iter(object):
    """
    Takes an iterator/generator and makes it thread-safe by
    serializing call to the `next` method of given iterator/generator.
    """
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()
    def __iter__(self):
        return self

    def __next__(self):
        with self.lock:
            return next(self.it)

def threadsafe_generator(f):
    """
    A decorator that takes a generator function and makes it thread-safe.
    """
    def g(*a, **kw):
        return threadsafe_iter(f(*a, **kw))
    return g

@threadsafe_generator
def batch_generator(X, y, batch_size):
    samples_per_epoch = X.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    shuffle_index = np.arange(np.shape(y)[0])
    np.random.shuffle(shuffle_index)
    X =  X[shuffle_index, :]
    y =  y[shuffle_index]
    while 1:
        index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[index_batch,:].tocsr()
        y_batch = y[index_batch]
        counter += 1
        yield(X_batch,y_batch)
        if (counter < number_of_batches):
            np.random.shuffle(shuffle_index)
            counter=0
# def batch_generator(X_data, y_data, batch_size):
#     samples_per_epoch = X_data.shape[0]
#     number_of_batches = samples_per_epoch/batch_size
#     counter=0
#     index = np.arange(np.shape(y_data)[0])
#     while 1:
#         index_batch = index[batch_size*counter:batch_size*(counter+1)]
#         X_batch = X_data[index_batch,:].tocsr()
#         y_batch = y_data[index_batch]
#         counter += 1
#         yield np.array(X_batch),y_batch
#         if (counter > number_of_batches):
#             counter=0
            
            
@threadsafe_generator
def batch_generator_x(X_data,batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(X_data)[0])
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        counter += 1
        yield np.array(X_batch)
        if (counter > number_of_batches):
            counter=0

In [None]:
from keras.layers import Dense, Dropout, BatchNormalization, Activation, Input, Flatten, concatenate, Embedding, advanced_activations
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras.regularizers import l2
from keras import backend as K
from keras import optimizers
from keras import initializers
from keras import regularizers
import multiprocessing

np.random.seed(11)

n_workers = multiprocessing.cpu_count()
print("Number of workers....", n_workers)
            
def get_model():
    
    #Input
    inp_layer = Input(shape = [X_train.shape[1]], sparse= True)

    #main_layer
   # main_l = BatchNormalization()(inp_layer)
    main_l = Dense(60, activation = "relu", kernel_initializer = "he_normal")(inp_layer)
#    main_l = Dropout(dr_1)(Dense(50, activation = "relu", kernel_initializer = "he_normal")(main_l))
    main_l = BatchNormalization()(main_l)
    #main_l = Dropout(dr_1)(Dense(2048, activation = "relu")(inp_layer))
    main_l = Dropout(dr_2)(Dense(10, activation = "relu", kernel_initializer = "he_normal")(main_l))
#    main_l = Dropout(dr_3)(Dense(64, activation = "relu", kernel_initializer = "he_normal")(main_l))
    
    #output
    output = Dense(1, activation = "linear")(main_l)
    
    #model
    model = Model(inp_layer, output)
    
    opt = optimizers.Adam(clipnorm=1.)
    model.compile(loss = "mse", optimizer = opt)
    
    return model

epochs = 2
dr_1 = dr_2 = dr_3 = 0.15
batch_size = 512*4
model = get_model()
K.set_value(model.optimizer.lr, 0.005)
K.set_value(model.optimizer.decay, 0.005)

history = model.fit_generator(generator=batch_generator(X_train, y_train, batch_size),
                    workers=n_workers, 
                    steps_per_epoch=X_train.shape[0]//batch_size, 
                    max_queue_size=128,
                    epochs=epochs, 
                    verbose=1,
                   )

Number of workers.... 8
Epoch 1/2
152/723 [=====>........................] - ETA: 7:46 - loss: 0.9981