In [1]:
import copy
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import sys
import os
import pickle
import hashlib
import string
import unicodedata
import re
from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook())

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, QuantileTransformer
import lightgbm as lgb
from sklearn import metrics
import gc

from collections import defaultdict, OrderedDict, Counter
#from nltk.corpus import stopwords
#from spacy.lang.en.stop_words import STOP_WORDS
from itertools import chain

# from __future__ import print_function
np.random.seed(786)  # for reproducibility
from keras.models import Sequential, Model, load_model
from keras.layers import *
from keras.optimizers import *
from keras.utils import np_utils
from keras.layers.convolutional import Convolution1D, MaxPooling1D, ZeroPadding1D, AveragePooling1D
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import Callback, ModelCheckpoint
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier,  KerasRegressor

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#Functions we need - Feature Selector, Fasttext_Estimator, Preprocessing Transformer, Binary_Encoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from pandas.api.types import is_numeric_dtype, is_string_dtype
from scipy.sparse.csr import csr_matrix
from sklearn.metrics import mean_squared_error, make_scorer

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_sklearn = make_scorer(rmse, greater_is_better=False)    
    
# the following functions allow for a parallelized batch generator
class threadsafe_iter(object):
    """
    Takes an iterator/generator and makes it thread-safe by
    serializing call to the `next` method of given iterator/generator.
    """
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()
    def __iter__(self):
        return self

    def __next__(self):
        with self.lock:
            return next(self.it)

def threadsafe_generator(f):
    """
    A decorator that takes a generator function and makes it thread-safe.
    """
    def g(*a, **kw):
        return threadsafe_iter(f(*a, **kw))
    return g

@threadsafe_generator
def batch_generator(X_data, y_data, batch_size):
    
    #index = np.random.permutation(X_data.shape[0])    
    #X_data = X_data[index]
    #y_data = y_data[index]
    
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    #idx = 1
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        y_batch = y_data[index_batch]
        counter += 1
        yield np.array(X_batch),y_batch
        #print("")
        #print(X_batch.shape)
        #print("")
        #print('generator yielded a batch %d' % idx)
        #idx += 1
        if (counter > number_of_batches):
            counter=0
            
            
@threadsafe_generator
def batch_generator_x(X_data,batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(X_data)[0])
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        counter += 1
        yield np.array(X_batch)
        if (counter > number_of_batches):
            counter=0

In [3]:
num_partitions = 30
num_cores = 16
from multiprocessing import Pool, cpu_count
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
stop_words = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 
              'there', 'about', 'once', 'during', 'out', 'very', 'having', 
              'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 
              'its', 'yours', 'such', 'into', 'most', 'itself', 'other', 
              'off', 'is', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 
              'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 
              'through', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 
              'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 
              'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before', 'them',
              'same', 'and', 'been', 'have', 'in', 'will', 'does', 'yourselves', 
              'then', 'that', 'because', 'what', 'over', 'why’, ‘so', 'can', 'did',
              'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only',
              'myself', 'which', 'those', 'i','after', 'few', 'whom', 'being', 'if', 
              'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']

def unicodeToAscii(s):
    return  unicodedata.normalize('NFKC', s)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"'", r"", s)
    s = re.sub(r"[.!?':;,]", r" ", s)
    s = re.sub(r"-", r"", s)
    s = re.sub(r"[^0-9a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"0", r"zero", s)
    s = re.sub(r"1", r"one", s)
    s = re.sub(r"2", r"two", s)
    s = re.sub(r"3", r"three", s)
    s = re.sub(r"4", r"four", s)
    s = re.sub(r"5", r"five", s)
    s = re.sub(r"6", r"six", s)
    s = re.sub(r"7", r"seven", s)
    s = re.sub(r"8", r"eight", s)
    #s = re.sub(r"/s/s", r"/s", s)
    return s

def _normalize_and_ngrams(sent, ngrams):
    input_list = normalizeString(sent).split()
    input_list = [word for word in input_list if word not in stop_words]
    s = input_list.copy()
    for i in range(2, ngrams+1):
        s += [' '.join(input_list[j:j+i]) for j in range(len(input_list)-i + 1)]
        #s += list((zip(*[input_list[j:] for j in range(i)])))
    return s

#tmp = "I am not a dance'r and i am a 6ixy   c-o:d;er programmer"
#print(normalizeString(tmp))
#print(_normalize_and_ngrams(tmp, 3))

class Vocab_topwords():
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.index2word = {}
        
    def fit_data(self, data, col, ngrams=3, max_features=50000):
        c = Counter(list(chain.from_iterable(data[col].tolist())))
        for i, (w, count) in enumerate(c.most_common(max_features)):
            self.word2index[w] = i
        return
    

            
            
def prepareVocab(name, data, max_features):
    vocab = Vocab_topwords(name)
    vocab.fit_data(data, name, max_features=max_features)
    
    print("Counted words:")
    print(vocab.name, len(vocab.word2index))
    return vocab

def indexesFromSentence(vocab, tokens, ngrams, max_len):
    num_list = []
    for i, item in enumerate(tokens):
        if len(num_list) == max_len:
            break
        elif item in vocab.word2index:
            num_list.append(vocab.word2index[item])
        else:
            continue
        
    if len(num_list) < max_len :
        num_list += [0]*(max_len - len(num_list) )
        
    return num_list

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def get_cat_1(x): return str(x).split('/')[0]
def get_cat_2(x): return str(x).split('/')[1] if len(str(x).split('/')) > 1 else -1
def get_cat_3(x): return ' '.join(str(x).split('/')[2:]) if len(str(x).split('/')) > 2 else -1

def applycat1(df): 
    df['cat1'] = df['category_name'].progress_apply(get_cat_1)
    return df

def applycat2(df): 
    df['cat2'] = df['category_name'].progress_apply(get_cat_2)
    return df

def applycat3(df): 
    df['cat3'] = df['category_name'].progress_apply(get_cat_3)
    return df

def norm3grams(s): return _normalize_and_ngrams(s, 3)

def applyname(series): return series.progress_apply(norm3grams)

def index2sent1(x, name_vocab): return indexesFromSentence(name_vocab, x, 3, 10)

def name2index(series): return series.progress_apply(lambda x: index2sent1(x, name_vocab))

def norm2grams(s): return _normalize_and_ngrams(s, 1)

def applydesc(series):return series.progress_apply(norm2grams)

def index2sent2(x, desc_vocab): return indexesFromSentence(desc_vocab, x, 1, 50)

def desc2index(series): return series.progress_apply(lambda x: index2sent2(x, desc_vocab))

def read_data(in_path, out_path):
    if False and os.path.exists(os.path.join(out_path, 'train_2.pkl')) and os.path.exists(os.path.join(out_path, 'test_2.pkl')):
        train_data = pd.read_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data  = pd.read_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data
    
    else:
        train_data = pd.read_table(os.path.join(in_path, 'train.tsv'))
        test_data  = pd.read_table(os.path.join(in_path, 'test.tsv'))
    
        train_rows = len(train_data)
        data = pd.concat([train_data, test_data], ignore_index=True)
        
        data['name'] = data['name'].astype(str)
        data['item_description'] = data['item_description'].astype(str)
        
        #ddata = dd.from_pandas(data, 4)

        
        data = applycat1(data)
        data = applycat2(data)
        data = applycat3(data)
        data.fillna(-1, inplace=True)
        cat_cols = ['category_name', 'brand_name', 'item_condition_id', 'cat1', 'cat2', 'cat3']
        print("Label enoding categoricals")
        for col in cat_cols:
            data[col] = LabelEncoder().fit_transform(data[col].astype(str)).astype(np.int32)
            
        print("Tokenizing text columns")
        data['name'] = parallelize_dataframe(data['name'], applyname)
        print("Preparing vocabs")
        global name_vocab
        name_vocab = prepareVocab('name', data[['name']], 50000)
        data['name'] = name2index(data['name'])
        del name_vocab
        
        print("Transforming text to sequences")
        data['item_description'] = parallelize_dataframe(data['item_description'], applydesc)
        global desc_vocab
        desc_vocab = prepareVocab('item_description', data[['item_description']], 100000)
        data['item_description'] = desc2index(data['item_description'])
        del desc_vocab
        
        train_data = data.iloc[: train_rows, :]
        train_data = train_data.loc[(train_data.price >= 1) & (train_data.price <= 2100), :].reset_index(drop=True)
        test_data  = data.iloc[train_rows: , :].reset_index(drop=True)
        
        del train_data['test_id']
        del test_data['train_id']
        del data
        gc.collect()
        print("Writing out new pickles dataframes")
        train_data.to_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data.to_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data

In [4]:
%%time
train_data, test_data = read_data("../input", "./")

100%|██████████| 2175894/2175894 [00:02<00:00, 826506.36it/s]
100%|██████████| 2175894/2175894 [00:03<00:00, 630439.43it/s]
100%|██████████| 2175894/2175894 [00:03<00:00, 580924.16it/s]


Label enoding categoricals
Tokenizing text columns


100%|██████████| 72530/72530 [00:03<00:00, 23790.67it/s]
100%|██████████| 72530/72530 [00:03<00:00, 23150.42it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22581.22it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22843.36it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22968.39it/s]
100%|██████████| 72530/72530 [00:03<00:00, 21695.02it/s]
100%|██████████| 72530/72530 [00:03<00:00, 21988.37it/s]
100%|██████████| 72530/72530 [00:03<00:00, 21651.73it/s]
100%|██████████| 72530/72530 [00:03<00:00, 23512.84it/s]
 97%|█████████▋| 70605/72530 [00:03<00:00, 20270.24it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22421.63it/s]
100%|██████████| 72530/72530 [00:03<00:00, 21427.91it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22181.12it/s]
100%|██████████| 72530/72530 [00:03<00:00, 20776.13it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22882.67it/s]
100%|██████████| 72530/72530 [00:03<00:00, 21452.73it/s]
100%|██████████| 72530/72530 [00:02<00:00, 30976.15it/s]
100%|██████████| 72530/72530 [0

Preparing vocabs


  1%|          | 13379/2175894 [00:00<00:16, 133785.02it/s]

Counted words:
name 50000


100%|██████████| 2175894/2175894 [00:13<00:00, 164640.72it/s]


Transforming text to sequences


100%|██████████| 72530/72530 [00:07<00:00, 10005.93it/s]
100%|██████████| 72530/72530 [00:07<00:00, 10209.21it/s]
100%|██████████| 72530/72530 [00:07<00:00, 9986.49it/s]]
100%|██████████| 72530/72530 [00:07<00:00, 9962.38it/s] 
100%|██████████| 72530/72530 [00:07<00:00, 9848.44it/s] 
100%|██████████| 72530/72530 [00:07<00:00, 10017.17it/s]
100%|██████████| 72530/72530 [00:07<00:00, 9839.57it/s] 
100%|██████████| 72530/72530 [00:07<00:00, 9630.90it/s]]
100%|██████████| 72530/72530 [00:07<00:00, 9715.35it/s] 
100%|██████████| 72530/72530 [00:07<00:00, 9990.75it/s]
100%|██████████| 72530/72530 [00:07<00:00, 9598.46it/s] 
100%|██████████| 72530/72530 [00:07<00:00, 9495.60it/s]
100%|██████████| 72530/72530 [00:07<00:00, 9717.20it/s]
100%|██████████| 72530/72530 [00:07<00:00, 9698.85it/s]
  2%|▏         | 1564/72530 [00:00<00:08, 8144.62it/s]]
100%|██████████| 72530/72530 [00:07<00:00, 9092.62it/s]
100%|██████████| 72530/72530 [00:04<00:00, 14620.21it/s]
100%|██████████| 72530/72530 [00:04<0

Counted words:
item_description 100000


100%|██████████| 2175894/2175894 [00:20<00:00, 106609.57it/s]


Writing out new pickles dataframes
CPU times: user 1min 50s, sys: 5.12 s, total: 1min 55s
Wall time: 2min 7s


In [5]:
cvlist = list(KFold(n_splits=10).split(train_data, train_data.price))
print(train_data.shape, test_data.shape)
train_data.head()

(1481661, 11) (693359, 11)


Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,train_id,cat1,cat2,cat3
0,2,830,2,"[3, 32, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[4084, 9421, 10256, 125, 16, 3, 55, 20981, 326...",10.0,1,0.0,5,103,774
1,3890,87,2,"[2820, 15, 8, 133, 26, 800, 2, 29, 2, 2633, 54...","[12669, 34664, 2087, 0, 0, 0, 0, 0, 0, 0]",52.0,0,1.0,1,31,216
2,4589,1278,0,"[445, 48, 4458, 2, 190, 872, 978, 53, 1650, 23...","[208, 0, 0, 0, 0, 0, 0, 0, 0, 0]",10.0,1,2.0,9,104,98
3,2,504,0,"[0, 30, 134, 6481, 127, 10, 965, 1378, 94, 208...","[119, 2139, 44620, 0, 0, 0, 0, 0, 0, 0]",35.0,1,3.0,3,56,411
4,2,1205,0,"[647, 5978, 2, 1564, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[6132, 46, 1122, 145, 9686, 1480, 0, 0, 0, 0]",44.0,0,4.0,9,59,543


In [6]:
from sklearn.base import BaseEstimator, RegressorMixin
class EM_NNRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, embed_cols=None, dense_cols=None, embed_dims=None, 
                 text_embed_cols=None, text_embed_seq_lens=None, 
                 text_embed_dims=None,
                 num_layers=2, multiprocess=False,
                layer_activations=None, layer_dims=None,layer_dropouts=None, epochs=20, batchsize=32,
                optimizer_kwargs=None, val_size=0.1, verbose=1, seed=1):
        
        self.embed_cols = embed_cols
        self.dense_cols = dense_cols
        self.embed_dims = embed_dims
        self.text_embed_cols = text_embed_cols
        self.text_embed_dims = text_embed_dims
        #self.text_embed_tokenizers = text_embed_tokenizers
        self.text_embed_seq_lens = text_embed_seq_lens
        self.dense_dims = None
        self.num_layers = num_layers
        self.layer_dims = layer_dims
        self.layer_activations = layer_activations
        self.layer_dropouts = layer_dropouts
        self.epochs = epochs
        self.batchsize = batchsize
        self.optimizer_kwargs = optimizer_kwargs
        self.val_size = val_size
        self.verbose = verbose
        self.multiprocess = multiprocess
        self.seed = seed
        self.model = None
        if self.dense_cols:
            self.dense_dims = len(self.dense_cols)
            
    def _splitX(self, X):
        X_splits = []
        
        if self.embed_cols:
            for col in self.embed_cols :
                X_splits.append(np.asarray(X[col]))
                
        if self.text_embed_cols:
            for i, col in enumerate(self.text_embed_cols):
                X_splits.append(np.asarray([*X[col].values]))
                
        if self.dense_cols:
            X_splits.append(X[self.dense_cols].values.reshape(X.shape[0], -1))
            
        return X_splits
    
    
    def _build_model(self):
        np.random.seed(786)
        model_inputs = []
        model_layers = []
        
        if self.embed_cols:
            for col, dim in zip(self.embed_cols, self.embed_dims):
                x1 = Input( shape=(1,), name=col)
                model_inputs.append(x1)
                x1 = Embedding(input_dim=dim[0], output_dim=dim[1],)(x1)
                #x1 = Dropout(0.1)(x1)
                x1 = Reshape(target_shape=(dim[1],))(x1)
                model_layers.append(x1)
                
        if self.text_embed_cols:
            for col, dim, seq_len in zip(self.text_embed_cols, 
                                                self.text_embed_dims, 
                                                self.text_embed_seq_lens):
                x3 = Input( shape=(seq_len,))
                model_inputs.append(x3)
                x3 = Embedding(input_dim=dim[0], output_dim=dim[1], input_length=seq_len,)(x3)
                #x3 = Conv1D(16, return_sequences=True)(x3)
                x3 = GlobalAveragePooling1D()(x3)
                x3 = Reshape(target_shape=(dim[1],))(x3)
                model_layers.append(x3)
                
        if self.dense_cols:
            x2 = Input( shape=(self.dense_dims, ), name='dense_cols')
            model_inputs.append(x2)
            model_layers.append(x2)
        print(model_layers)
        x = concatenate(model_layers)
        
        if self.num_layers > 0:
            for dim, drops in zip(self.layer_dims, self.layer_dropouts):
                x = BatchNormalization()(x)
                x = Dropout(rate=drops, seed=self.seed)(x)
                x = Dense(dim, kernel_initializer='he_normal')(x)
                #x = Dense(dim, activation='selu', kernel_initializer='he_normal')(x)
                x = LeakyReLU()(x)
        
        x = BatchNormalization()(x)
        x = Dropout(0.05, seed=self.seed)(x)
        output = Dense(1, activation='linear', kernel_initializer='he_normal')(x)
        
        model = Model(inputs=model_inputs, outputs=output)
        #print(model.summary())
        adam = RMSprop(lr=0.001, decay=0.001)
        #adam = Adam(lr=0.001, decay=1e-4)
        model.compile(optimizer=adam, loss='mean_squared_error')
        
        return model 
    
    
    def fit(self, X, y):
        self.model = self._build_model()
        if self.val_size > 0:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_size, random_state=self.seed)
            print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
            
            callbacks= [ModelCheckpoint("embed_NN_"+str(self.seed)+".check", save_best_only=True, verbose=1)]
            if self.multiprocess == False:
                self.model.fit(self._splitX(X_train), y_train, batch_size=self.batchsize, epochs=self.epochs,
                               verbose=self.verbose,
                              validation_data=(self._splitX(X_val), y_val), shuffle=True,
                              callbacks=callbacks)
            else:
                X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_size, random_state=1)

        else:
            self.model.fit(self._splitX(X), y, batch_size=self.batchsize, epochs=self.epochs,
               verbose=self.verbose, shuffle=True,)

        
        return self
    
    def predict(self, X, y=None):
        
        if self.model:
            self.model = load_model("embed_NN_"+str(self.seed)+".check")
            y_hat = self.model.predict(self._splitX(X))
        else:
            raise ValueError("Model not fit yet")
            
        return y_hat

In [17]:
nnet = EM_NNRegressor(embed_cols=['brand_name','category_name','item_condition_id', 'cat1', 'cat2', 'cat3'], 
                      embed_dims=[(6000, 40),(1500, 30), (5,4), (15,4), (120, 10), (900, 15)],
                      text_embed_cols=['name', 'item_description'],
                      text_embed_dims=[(50000, 70), (100000, 70)],
                      text_embed_seq_lens =[10, 50], 
                      dense_cols=['shipping'],
                      epochs=6,
                      batchsize=2048,
                      num_layers = 1,
                      layer_dropouts=[0.1],
                      layer_dims=[200],
                      val_size=0.05
                     )


In [18]:
#n_workers = multiprocessing.cpu_count()
#batch_size = 32

In [19]:
#scores = cross_val_score(nnet, train_data, np.log1p(train_data.price), scoring=rmse_sklearn, cv=cvlist, verbose =10)
#print(scores, np.mean(scores))
#for seed in range(1):
nnet.set_params(seed=1).fit(train_data, np.log1p(train_data.price))

[<tf.Tensor 'reshape_25/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_26/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_27/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_28/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_29/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_30/Reshape:0' shape=(?, 15) dtype=float32>, <tf.Tensor 'reshape_31/Reshape:0' shape=(?, 70) dtype=float32>, <tf.Tensor 'reshape_32/Reshape:0' shape=(?, 70) dtype=float32>, <tf.Tensor 'dense_cols_4:0' shape=(?, 1) dtype=float32>]
(1407577, 11) (74084, 11) (1407577,) (74084,)
Train on 1407577 samples, validate on 74084 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


EM_NNRegressor(batchsize=2048, dense_cols=['shipping'],
        embed_cols=['brand_name', 'category_name', 'item_condition_id', 'cat1', 'cat2', 'cat3'],
        embed_dims=[(6000, 40), (1500, 30), (5, 4), (15, 4), (120, 10), (900, 15)],
        epochs=6, layer_activations=None, layer_dims=[200],
        layer_dropouts=[0.1], multiprocess=False, num_layers=1,
        optimizer_kwargs=None, seed=1,
        text_embed_cols=['name', 'item_description'],
        text_embed_dims=[(50000, 70), (100000, 70)],
        text_embed_seq_lens=[10, 50], val_size=0.05, verbose=1)

In [20]:
nn_preds = nnet.predict(test_data)

In [21]:
nnet.model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
brand_name (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
category_name (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
item_condi

In [22]:
def get_embeds(data, col, max_features, max_len, embed_dim, model, layer_num):
    embed_model = Sequential()
    embed_model.add(Embedding(max_features, embed_dim, input_length=max_len, weights=model.model.layers[layer_num].get_weights()))
    embed_model.add(GlobalAveragePooling1D())
    #name_embed_model.compile(optimizer='adam', loss='mse')

    embeddings = embed_model.predict(np.asarray([*data[col]]), batch_size=2500)
    return pd.DataFrame(embeddings, columns=[col+'_'+str(i) for i in range(embeddings.shape[1])])

def get_embeds2(data, col, max_features, max_len, embed_dim, model, layer_num):
    embed_model = Sequential()
    embed_model.add(Embedding(max_features, embed_dim, input_length=max_len, weights=model.model.layers[layer_num].get_weights()))
    #name_embed_model.compile(optimizer='adam', loss='mse')

    embeddings = embed_model.predict(np.asarray(data[col]), batch_size=2500).reshape(len(data), embed_dim)
    print(embeddings.shape)
    return pd.DataFrame(embeddings, columns=[col+'_'+str(i) for i in range(embeddings.shape[1])])

In [24]:
name_train_df = get_embeds(train_data, 'name', 50000, 10, 70, nnet, 8)
desc_train_df = get_embeds(train_data, 'item_description', 100000, 50, 70, nnet, 9)
brand_train_df = get_embeds2(train_data, 'brand_name', 6000, 1, 40, nnet, 10)
category_train_df = get_embeds2(train_data, 'category_name', 1500, 1, 30, nnet, 11)
condition_train_df = get_embeds2(train_data, 'item_condition_id', 5, 1, 4, nnet, 12)
cat1_train_df = get_embeds2(train_data, 'cat1', 15, 1, 4, nnet, 13)
cat2_train_df = get_embeds2(train_data, 'cat2', 120, 1, 10, nnet, 14)
cat3_train_df = get_embeds2(train_data, 'cat3', 900, 1, 15, nnet, 15)
print(name_train_df.shape, desc_train_df.shape)

(1481661, 40)
(1481661, 30)
(1481661, 4)
(1481661, 4)
(1481661, 10)
(1481661, 15)
(1481661, 70) (1481661, 70)


In [25]:
gc.collect()

19896

In [26]:
def embed_df(data, name_df, desc_df, brand_df, category_df, cat1_df, cat2_df, cat3_df, condition_df):
    data = data[['shipping', 'price', 'id']]
    data = pd.concat([data, name_df, desc_df, brand_df, category_df, 
                      condition_df, cat1_df, cat2_df, cat3_df], axis=1)
    return data

In [27]:
train_data['id'] = train_data['train_id']

In [28]:
train2 = embed_df(train_data, name_train_df, desc_train_df, brand_train_df, category_train_df,
                 cat1_train_df, cat2_train_df, cat3_train_df, condition_train_df)
train2.head()

Unnamed: 0,shipping,price,id,name_0,name_1,name_2,name_3,name_4,name_5,name_6,...,cat3_5,cat3_6,cat3_7,cat3_8,cat3_9,cat3_10,cat3_11,cat3_12,cat3_13,cat3_14
0,1,10.0,0.0,0.005436,-0.002834,-0.00062,-0.01432,-0.018339,0.011182,0.037989,...,-0.034057,0.0113,-0.049982,0.015964,-0.012996,0.06573,-0.072058,0.039482,0.013817,0.03786
1,0,52.0,1.0,-0.003587,-0.006315,0.008053,-0.01168,-0.004148,-0.005934,0.002647,...,-0.058369,0.026939,0.016904,0.038118,-0.004726,0.017666,0.049773,0.017442,-0.039765,0.034741
2,1,10.0,2.0,0.013486,-0.009317,-0.000395,-0.02065,0.001478,-0.00568,0.010997,...,0.005834,-0.019087,-0.030805,0.06307,0.007766,-0.035301,0.014713,-0.014809,0.047389,-0.007801
3,1,35.0,3.0,0.009502,-0.001996,0.005002,-0.016129,0.001382,-0.01397,-0.005269,...,-0.02068,0.057065,0.048931,0.014143,-0.013839,-0.005063,-0.016318,0.024021,0.055805,-0.019555
4,0,44.0,4.0,-0.008966,0.003316,0.001357,-0.021735,-0.001377,0.003187,0.008127,...,-0.03526,0.046141,-0.003048,-0.020771,0.009553,0.036631,0.057082,0.017131,-0.029955,-0.056519


In [29]:
#del train2, train_data
gc.collect()

182

In [31]:
import lightgbm as lgb
from sklearn.preprocessing import RobustScaler, QuantileTransformer
lgb1 = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=6000, num_leaves=15, n_jobs=4)

dtrain, dvalid = train_test_split(train2, test_size=0.05, random_state=1)
feats = [col for col in train2.columns if col not in ['price','id']]
#scaler = QuantileTransformer(output_distribution='normal')

Xtrain = dtrain[feats]
Xvalid = dvalid[feats]
y = np.log1p(dtrain['price'].values)
yvalid = np.log1p(dvalid['price'].values)
lgb1.fit(Xtrain, y, eval_set=[(Xtrain, y), (Xvalid, yvalid)], eval_metric='l2_root', verbose=10, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds.
[10]	training's rmse: 0.620978	valid_1's rmse: 0.625592
[20]	training's rmse: 0.557749	valid_1's rmse: 0.564743
[30]	training's rmse: 0.522137	valid_1's rmse: 0.530665
[40]	training's rmse: 0.500487	valid_1's rmse: 0.510183
[50]	training's rmse: 0.485791	valid_1's rmse: 0.496141
[60]	training's rmse: 0.47564	valid_1's rmse: 0.486761
[70]	training's rmse: 0.468834	valid_1's rmse: 0.480691
[80]	training's rmse: 0.463613	valid_1's rmse: 0.476041
[90]	training's rmse: 0.459809	valid_1's rmse: 0.472817
[100]	training's rmse: 0.456954	valid_1's rmse: 0.470382
[110]	training's rmse: 0.454698	valid_1's rmse: 0.46848
[120]	training's rmse: 0.452728	valid_1's rmse: 0.466801
[130]	training's rmse: 0.451057	valid_1's rmse: 0.465346
[140]	training's rmse: 0.449584	valid_1's rmse: 0.464085
[150]	training's rmse: 0.448289	valid_1's rmse: 0.463015
[160]	training's rmse: 0.447054	valid_1's rmse: 0.461941
[170]	training's rmse: 0.445895	vali

[1440]	training's rmse: 0.409402	valid_1's rmse: 0.434388
[1450]	training's rmse: 0.409261	valid_1's rmse: 0.434301
[1460]	training's rmse: 0.409168	valid_1's rmse: 0.434228
[1470]	training's rmse: 0.409028	valid_1's rmse: 0.434129
[1480]	training's rmse: 0.408907	valid_1's rmse: 0.434074
[1490]	training's rmse: 0.408804	valid_1's rmse: 0.434014
[1500]	training's rmse: 0.408684	valid_1's rmse: 0.433944
[1510]	training's rmse: 0.408578	valid_1's rmse: 0.433875
[1520]	training's rmse: 0.408461	valid_1's rmse: 0.433811
[1530]	training's rmse: 0.408352	valid_1's rmse: 0.433729
[1540]	training's rmse: 0.408249	valid_1's rmse: 0.433658
[1550]	training's rmse: 0.408102	valid_1's rmse: 0.433582
[1560]	training's rmse: 0.407978	valid_1's rmse: 0.433495
[1570]	training's rmse: 0.40788	valid_1's rmse: 0.433442
[1580]	training's rmse: 0.407767	valid_1's rmse: 0.43338
[1590]	training's rmse: 0.407678	valid_1's rmse: 0.433336
[1600]	training's rmse: 0.407576	valid_1's rmse: 0.433268
[1610]	training'

[2860]	training's rmse: 0.397181	valid_1's rmse: 0.427951
[2870]	training's rmse: 0.397102	valid_1's rmse: 0.427907
[2880]	training's rmse: 0.397043	valid_1's rmse: 0.427874
[2890]	training's rmse: 0.396976	valid_1's rmse: 0.427847
[2900]	training's rmse: 0.396912	valid_1's rmse: 0.427826
[2910]	training's rmse: 0.396854	valid_1's rmse: 0.427791
[2920]	training's rmse: 0.396785	valid_1's rmse: 0.427762
[2930]	training's rmse: 0.396724	valid_1's rmse: 0.427717
[2940]	training's rmse: 0.396664	valid_1's rmse: 0.427688
[2950]	training's rmse: 0.396604	valid_1's rmse: 0.427661
[2960]	training's rmse: 0.396541	valid_1's rmse: 0.427633
[2970]	training's rmse: 0.39647	valid_1's rmse: 0.427592
[2980]	training's rmse: 0.396401	valid_1's rmse: 0.42755
[2990]	training's rmse: 0.396342	valid_1's rmse: 0.42753
[3000]	training's rmse: 0.396282	valid_1's rmse: 0.42751
[3010]	training's rmse: 0.39622	valid_1's rmse: 0.427476
[3020]	training's rmse: 0.396166	valid_1's rmse: 0.427456
[3030]	training's r

[4280]	training's rmse: 0.389122	valid_1's rmse: 0.424532
[4290]	training's rmse: 0.389066	valid_1's rmse: 0.424502
[4300]	training's rmse: 0.389028	valid_1's rmse: 0.424497
[4310]	training's rmse: 0.38898	valid_1's rmse: 0.424481
[4320]	training's rmse: 0.388935	valid_1's rmse: 0.424455
[4330]	training's rmse: 0.388896	valid_1's rmse: 0.424444
[4340]	training's rmse: 0.388844	valid_1's rmse: 0.424434
[4350]	training's rmse: 0.388806	valid_1's rmse: 0.424426
[4360]	training's rmse: 0.388761	valid_1's rmse: 0.424415
[4370]	training's rmse: 0.388714	valid_1's rmse: 0.424395
[4380]	training's rmse: 0.388678	valid_1's rmse: 0.424395
[4390]	training's rmse: 0.388633	valid_1's rmse: 0.424377
[4400]	training's rmse: 0.388584	valid_1's rmse: 0.424347
[4410]	training's rmse: 0.388533	valid_1's rmse: 0.424329
[4420]	training's rmse: 0.388494	valid_1's rmse: 0.424313
[4430]	training's rmse: 0.388443	valid_1's rmse: 0.424306
[4440]	training's rmse: 0.388397	valid_1's rmse: 0.424292
[4450]	training

[5700]	training's rmse: 0.38296	valid_1's rmse: 0.422484
[5710]	training's rmse: 0.382922	valid_1's rmse: 0.422468
[5720]	training's rmse: 0.382877	valid_1's rmse: 0.422447
[5730]	training's rmse: 0.382844	valid_1's rmse: 0.422439
[5740]	training's rmse: 0.382807	valid_1's rmse: 0.42243
[5750]	training's rmse: 0.382767	valid_1's rmse: 0.422414
[5760]	training's rmse: 0.38273	valid_1's rmse: 0.422409
[5770]	training's rmse: 0.382687	valid_1's rmse: 0.422395
[5780]	training's rmse: 0.382646	valid_1's rmse: 0.422377
[5790]	training's rmse: 0.382614	valid_1's rmse: 0.422366
[5800]	training's rmse: 0.382567	valid_1's rmse: 0.422347
[5810]	training's rmse: 0.382522	valid_1's rmse: 0.422343
[5820]	training's rmse: 0.38248	valid_1's rmse: 0.422342
[5830]	training's rmse: 0.382442	valid_1's rmse: 0.422335
[5840]	training's rmse: 0.382399	valid_1's rmse: 0.422327
[5850]	training's rmse: 0.382354	valid_1's rmse: 0.422317
[5860]	training's rmse: 0.382318	valid_1's rmse: 0.422316
[5870]	training's 

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=6000,
       n_jobs=4, num_leaves=8, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [None]:
from sklearn.linear_model import Ridge
rd = Ridge()
rd.fit(Xtrain, y)
rmse(yvalid, rd.predict(Xvalid))

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 20))
lgb.plot_importance(lgb1, ax=ax)
plt.show()

In [None]:
#Checking sequence lengths
#sns.distplot(data.item_description.apply(lambda x: len(str(x).split())))
test_data['id'] = test_data['test_id']
del name_train_df, desc_train_df, train_data, brand_train_df, condition_train_df
name_test_df = get_embeds(test_data, 'name', 50000, 10, 70, nnet, 8)
desc_test_df = get_embeds(test_data, 'item_description', 100000, 80, 70, nnet, 9)
brand_test_df = get_embeds2(test_data, 'brand_name', 6000, 1, 40, nnet, 10)
category_test_df = get_embeds2(test_data, 'category_name', 1500, 1, 30, nnet, 11)
condition_test_df = get_embeds2(test_data, 'item_condition_id', 5, 1, 4, nnet, 12)
cat1_test_df = get_embeds2(test_data, 'cat1', 15, 1, 4, nnet, 13)
cat2_test_df = get_embeds2(test_data, 'cat2', 120, 1, 10, nnet, 14)
cat3_test_df = get_embeds2(test_data, 'cat3', 900, 1, 15, nnet, 15)

train2 = embed_df(test_data, name_test_df, desc_test_df, brand_test_df, category_test_df,
                 cat1_test_df, cat2_test_df, cat3_test_df, condition_test_df)
test_preds = np.expm1(lgb1.predict(test2[feats]))

print("Write out submission")
sub = pd.DataFrame({'test_id': test_data['id'].values})
sub['price']= test_preds
sub['test_id'] =sub['test_id'].astype(np.int32)
sub['price'] = sub['price'].clip(3, 2000)
sub.to_csv("embed_lgb.csv", index=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.distplot(np.log1p(train_data['price']))
sns.distplot(np.log1p(test_preds))
plt.show()