### Experiment 2
* Initialize with pretrained weights
    - Try different pretrained weights

In [1]:
import copy
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import sys
import os
import re
import threading
import multiprocessing

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
import lightgbm as lgb
from sklearn import metrics
import gc

# from __future__ import print_function
np.random.seed(786)  # for reproducibility

from keras.models import Sequential, Model, load_model
from keras.layers import *
from keras.optimizers import *
from keras.utils import np_utils
from keras.layers.convolutional import Convolution1D, MaxPooling1D, ZeroPadding1D, AveragePooling1D
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import Callback, ModelCheckpoint
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier,  KerasRegressor
#Some classes
#Functions we need - Feature Selector, Fasttext_Estimator, Preprocessing Transformer, Binary_Encoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from pandas.api.types import is_numeric_dtype, is_string_dtype
from scipy.sparse.csr import csr_matrix
from sklearn.metrics import mean_squared_error, make_scorer

def rmse(y_true, y_pred):
    print(np.min(y_pred), np.max(y_pred))
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_sklearn = make_scorer(rmse, greater_is_better=False)

def get_obj_cols(df):
    """Return columns with object dtypes"""
    obj_cols = []
    for idx, dt in enumerate(df.dtypes):
        if dt == 'object':
            obj_cols.append(df.columns.values[idx])

    return obj_cols


def convert_input(X):
    """if input not a dataframe convert it to one"""
    if not isinstance(X, pd.DataFrame):
        if isinstance(X, list):
            X = pd.DataFrame(np.array(X))
        elif isinstance(X, (np.generic, np.ndarray)):
            X = pd.DataFrame(X)
        elif isinstance(X, csr_matrix):
            X = pd.SparseDataFrame(X)
        else:
            raise ValueError('Unexpected input type: %s' % (str(type(X))))

        #X = X.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    return X

class FeatureSelector(BaseEstimator, TransformerMixin):
    """ Class to do subset of features in sklearn pipeline"""
    def __init__(self, cols=None, return_df=True, verbose=0):
        self.cols = cols
        self.return_df = return_df
        self.verbose = verbose
        
    def fit(self, X, y=None):
        #Do nothing
        return self
    
    def transform(self, X, y=None):
        #if the input dataset isn't already a dataframe, convert it to one
        X = X.copy(deep=True)
        X = convert_input(X)
        X = X.loc[:, self.col]
        
        if self.verbose:
            print("Selecting columns are {}".format(self.col))
        if self.return_df:
            return X
        else:
            return X.values
        
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, thresh=0, func=np.mean, add_to_orig=False):
        self.cols = cols
        self.thresh = thresh
        self.func = func
        self.add_to_orig = add_to_orig
    
    #@numba.jit        
    def fit(self, X, y):
        self.prior = self.func(y)
        self._dict = {}
        for col in self.cols:
            if isinstance(col, (list, tuple)):
                print('here')
                tmp_df = X.loc[: ,col]
                col = tuple(col)
            else:
                tmp_df = X.loc[: ,[col]]
            tmp_df['y'] = y
            print(tmp_df.columns)
            #tmp_df = pd.DataFrame({'eval_col':X[col].values, 'y':y})
            if isinstance(col, (list, tuple)):
                print('here')
                col = tuple(col)
            self._dict[col] = tmp_df.groupby(col)['y'].apply(lambda x: 
                                self.func(x) if len(x) >= self.thresh  else self.prior).to_dict()
                                
            del tmp_df
        return self
    #@numba.jit
    def transform(self, X, y=None):
        X_transformed = []
        for col in self.cols:
            
            if isinstance(col, (list, tuple)):
                tmp_df = X.loc[:, col]
                enc = tmp_df[col].apply(lambda x: self._dict[tuple(col)][tuple(x)]
                                                                     if tuple(x) in self._dict[tuple(col)]
                                                                     else self.prior, axis=1).values
            else:
                tmp_df = X.loc[:, [col]]
                enc = tmp_df[col].apply(lambda x: self._dict[col][x]
                                                                     if x in self._dict[col]
                                                                     else self.prior).values
            del tmp_df
            X_transformed.append(enc)
        
        X_transformed = np.vstack(X_transformed).T
        
        if self.add_to_orig:
            return np.concatenate((X.values, X_transformed), axis=1)
            
        else:
            return X_transformed
        
def isiphonecase(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                                (series.str.contains('case', flags=re.IGNORECASE)) )
def isiphone6(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('6|six', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) )

def isiphone6p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('6|six', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) )

def isiphone5(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('5|five', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) )

def isiphone5p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('5|five', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) )

def isiphone7(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('7|seven', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) )

def isiphone7p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('7|seven', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) )

#Data reading function
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.)

def read_data(in_path, out_path):
    if False and os.path.exists(os.path.join(out_path, 'train_2.pkl')) and os.path.exists(os.path.join(out_path, 'test_2.pkl')):
        train_data = pd.read_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data  = pd.read_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data
    
    else:
        train_data = pd.read_table(os.path.join(in_path, 'train.tsv'))
        test_data  = pd.read_table(os.path.join(in_path, 'test.tsv'))
    
        train_rows = len(train_data)
        data = pd.concat([train_data, test_data], ignore_index=True)
    
        data['cat1'] = data['category_name'].apply(lambda x: str(x).split('/')[0])
        data['cat2'] = data['category_name'].apply(lambda x: str(x).split('/')[1] 
                                                   if len(str(x).split('/')) > 1 else -1)
        data['cat3'] = data['category_name'].apply(lambda x: ' '.join(str(x).split('/')[2:]) 
                                                   if len(str(x).split('/')) > 2 else -1)
        data.fillna(-1, inplace=True)
        
        print("Getting word/char len features")
        data['desc_words'] = data['item_description'].apply(lambda x: len(str(x).split()))
        data['desc_chars'] = data['item_description'].apply(lambda x: len(str(x)))
        data['name_words'] = data['name'].apply(lambda x: len(str(x).split()))
        data['name_chars'] = data['name'].apply(len)
        
        for col in ["desc_words", "desc_chars", "name_words", "name_chars"]:
            data[col]  = data[col]/ data[col].max()
        
        print("Get iphone features")
        data['iphone_case'] = isiphonecase(data['name'])
        data['iphone6'] = isiphone6(data['name'])
        data['iphone6p'] = isiphone6p(data['name'])
        data['iphone5'] = isiphone5(data['name'])
        data['iphone5p'] = isiphone5p(data['name'])
        data['iphone7'] = isiphone7(data['name'])
        data['iphone7p'] = isiphone7p(data['name'])
        data['unlocked_phone'] = data.name.str.contains('unlocked', flags=re.IGNORECASE)
        cat_cols = ['category_name', 'brand_name', 'cat1', 'cat2', 'cat3', 'item_condition_id']
        for col in cat_cols:
            data[col] = LabelEncoder().fit_transform(data[col].astype(str)) + 1
            
        print("Get count features")
        target_enc1 = TargetEncoder(cols=['brand_name'], func=len)
        data['brand_counts'] = target_enc1.fit_transform(data[['brand_name']], data.price)
        data['brand_counts'] = data['brand_counts']/data['brand_counts'].max()

        target_enc2 = TargetEncoder(cols=['category_name'], func=len)
        data['cat_counts'] = target_enc2.fit_transform(data[['category_name']], data.price)
        data['cat_counts'] = data['cat_counts']/data['cat_counts'].max()
        
        target_enc3 = TargetEncoder(cols=['cat1'], func=len)
        data['cat1_counts'] = target_enc3.fit_transform(data[['cat1']], data.price)
        data['cat1_counts'] = data['cat1_counts']/data['cat1_counts'].max()
        
        target_enc4 = TargetEncoder(cols=['cat2'], func=len)
        data['cat2_counts'] = target_enc4.fit_transform(data[['cat2']], data.price)
        data['cat2_counts'] = data['cat2_counts']/data['cat2_counts'].max()
        
        target_enc5 = TargetEncoder(cols=['cat3'], func=len)
        data['cat3_counts'] = target_enc5.fit_transform(data[['cat3']], data.price)
        data['cat3_counts'] = data['cat3_counts']/data['cat3_counts'].max()
        #tkn_desc = Tokenizer(50000)   
        
        data['item_desc2gram'] = data.item_description.apply(lambda x: add_ngrams(x, 2))
        print("Tokenizing data")
        tok_name  = Tokenizer(20000)
        tok_name.fit_on_texts(data['name'].astype(str))
        
        tok_desc= Tokenizer(100000)
        tok_desc.fit_on_texts(data['item_description'].astype(str))

        tok_desc2 = Tokenizer(20000)
        tok_desc2.fit_on_texts(data['item_desc2gram'].astype(str))
        
        data["name"] = list(zip(sequence.pad_sequences(tok_name.texts_to_sequences(data.name.astype(str)),
                                         maxlen=7, padding='post', truncating='post')))
        
        data["item_description"] = list(zip(sequence.pad_sequences(tok_desc.texts_to_sequences(data.item_description.astype(str)),
                                         maxlen=70, padding='post', truncating='post')))
        
        data["item_desc2gram"] = list(zip(sequence.pad_sequences(tok_desc2.texts_to_sequences(data.item_desc2gram.astype(str)),
                                         maxlen=30, padding='post', truncating='post')))
        #tkn_desc = Tokenizer(50000)
        #tkn_desc.fit_on_texts(data.item_description.astype(str))
        #data['desc_seq'] = pad_sequences(tkn_desc.texts_to_sequences(data.item_description.astype(str)),
        #                                 maxlen=100, padding='post', truncating='post')
        
        #tkn_name = Tokenizer(4000)
        #tkn_name.fit_on_texts(data.name.astype(str))
        #data['name_seq'] = pad_sequences(tkn_name.texts_to_sequences(data.name.astype(str)),
        #                                 maxlen=6, padding='post', truncating='post')
        
        
        train_data = data.loc[: train_rows - 1, :].reset_index(drop=True)
        train_data = train_data.loc[(train_data.price >= 1) & (train_data.price <= 2000), :].reset_index(drop=True)
        test_data  = data.loc[train_rows: , :].reset_index(drop=True)
        
        del train_data['test_id']
        del test_data['train_id']
        del data 
        test_data['test_id'] = test_data['test_id'].astype(int)
        #train_data.to_pickle(os.path.join(out_path, 'train_2.pkl'))
        #test_data.to_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data, tok_name, tok_desc, tok_desc2
        


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
class EM_NNRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, embed_cols=None, dense_cols=None, embed_dims=None, 
                 text_embed_cols=None, text_embed_seq_lens=None, 
                 text_embed_dims=None, 
                 text_embed_tokenizers=None,
                 num_layers=2, multiprocess=False,
                layer_activations=None, layer_dims=None,layer_dropouts=None, epochs=20, batchsize=32,
                optimizer_kwargs=None, val_size=0.1, verbose=1, seed=1,):
        
        self.embed_cols = embed_cols
        self.dense_cols = dense_cols
        self.embed_dims = embed_dims
        self.text_embed_cols = text_embed_cols
        self.text_embed_dims = text_embed_dims
        self.text_embed_tokenizers = text_embed_tokenizers
        self.text_embed_seq_lens = text_embed_seq_lens
        self.dense_dims = None
        self.num_layers = num_layers
        self.layer_dims = layer_dims
        self.layer_activations = layer_activations
        self.layer_dropouts = layer_dropouts
        self.epochs = epochs
        self.batchsize = batchsize
        self.optimizer_kwargs = optimizer_kwargs
        self.val_size = val_size
        self.verbose = verbose
        self.multiprocess = multiprocess
        self.seed = seed
        #self.optim = optim
        self.model = None
        if self.dense_cols:
            self.dense_dims = len(self.dense_cols)
            
    def _splitX(self, X):
        X_splits = []
        
        if self.embed_cols:
            for col in self.embed_cols :
                X_splits.append(X[col].values.reshape(X.shape[0], -1))
                
        if self.text_embed_cols:
            for i, col in enumerate(self.text_embed_cols):
                #max_features = self.text_embed_dims[i][0]
                #max_len = self.text_embed_seq_lens[i]
                #input_text = X[col].astype(str)
                #x_train = tok.texts_to_sequences(input_text)
                #print(np.mean([len(l) for l in x_train]))
                #x_train = sequence.pad_sequences(x_train, maxlen=max_len)
                #X_splits.append(np.array(x_train).reshape(X.shape[0], -1))
                X_splits.append(np.concatenate(X[col].values))
                
        if self.dense_cols:
            X_splits.append(X[self.dense_cols].values.reshape(X.shape[0], -1))
            
        return X_splits
    
    def get_pretrained_embeddings(self, tok):
        embeddings_index = {}
        f = open(os.path.join('/home/mohsin/Downloads/', 'glove.6B.100d.txt'))
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()
        
        embedding_matrix = np.zeros(self.text_embed_dims[0])
        for word, i in tok.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
        print('Found %s word vectors.' % len(embeddings_index))
        return embedding_matrix
    
    
    def _build_model(self):
        model_inputs = []
        model_layers = []
        
        if self.embed_cols:
            for col, dim in zip(self.embed_cols, self.embed_dims):
                x1 = Input( shape=(1,), name=col)
                model_inputs.append(x1)
                x1 = Embedding(input_dim=dim[0], output_dim=dim[1], )(x1)
                #x1 = Dropout(0.1)(x1)
                x1 = Reshape(target_shape=(dim[1],))(x1)
                model_layers.append(x1)
                
        if self.text_embed_cols:
            for col, dim, seq_len in zip(self.text_embed_cols, 
                                                self.text_embed_dims, 
                                                self.text_embed_seq_lens):
                x3 = Input( shape=(seq_len,))
                model_inputs.append(x3)
                x3 = Embedding(input_dim=dim[0], output_dim=dim[1], input_length=seq_len)(x3)
                x3 = GlobalAveragePooling1D()(x3)
                x3 = Reshape(target_shape=(dim[1],))(x3)
                model_layers.append(x3)
                
        if self.dense_cols:
            x2 = Input( shape=(self.dense_dims, ), name='dense_cols')
            model_inputs.append(x2)
            model_layers.append(x2)
        print(model_layers)
        x = concatenate(model_layers)
        
        if self.num_layers > 0:
            for dim, drops in zip(self.layer_dims, self.layer_dropouts):
                x = BatchNormalization()(x)
                x = Dropout(rate=drops)(x)
                x = Dense(dim, kernel_initializer='he_normal')(x)
                x = LeakyReLU()(x)
        
        x = BatchNormalization()(x)
        x = Dropout(0.05)(x)
        output = Dense(1, activation='linear', kernel_initializer='he_normal')(x)
        
        model = Model(inputs=model_inputs, outputs=output)
        #print(model.summary())
        #adam = Nadam(lr=0.002, schedule_decay=0.02)
        adam = Adam(lr=0.005, decay=0.001)
        #adam = SGD(lr=0.01, nesterov=True, momentum=0.9, decay=0.003)
        #adam = RMSprop(lr=0.01, decay=0.006)
        #adam = self.optim
        model.compile(optimizer=adam, loss='mean_squared_error' )
        
        return model 
    
    
    def fit(self, X, y):
        self.model = self._build_model()
        if self.val_size > 0:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_size, random_state=self.seed)
            print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
            
            callbacks= [ModelCheckpoint("embed_NN_"+str(self.seed)+".check", save_best_only=True, verbose=1)]
            if self.multiprocess == False:
                self.model.fit(self._splitX(X_train), y_train, batch_size=self.batchsize, epochs=self.epochs,
                               verbose=self.verbose,
                              validation_data=(self._splitX(X_val), y_val), shuffle=True,
                              callbacks=callbacks)
            else:
                X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_size, random_state=1)

        else:
            self.model.fit(self._splitX(X), y, batch_size=self.batchsize, epochs=self.epochs,
               verbose=self.verbose, shuffle=True)

        
        return self
    
    def predict(self, X, y=None):
        
        if self.model:
            model = load_model("embed_NN_"+str(self.seed)+".check")
            y_hat = model.predict(self._splitX(X))
        else:
            raise ValueError("Model not fit yet")
            
        return y_hat
        
def add_ngrams(text, ngram=2):
    word_list = str(text).lower().split(' ')
    out_list = [''.join(word_list[i:i+ngram]) for i in range(len(word_list))]
    return ' '.join(out_list[:-1])
        



In [None]:
#Read data
train_data, test_data = read_data("../input", "./")
print(train_data.shape, test_data.shape)
train_data.head()

Getting word/char len features
Get iphone features
Get count features
Index(['brand_name', 'y'], dtype='object')
Index(['category_name', 'y'], dtype='object')
Index(['cat1', 'y'], dtype='object')
Index(['cat2', 'y'], dtype='object')
Index(['cat3', 'y'], dtype='object')
Tokenizing data


In [None]:
X = train_data
y = np.log1p(train_data.price)

cvlist= list(KFold(5, random_state=786).split(X, y))

In [None]:
nnet1 = EM_NNRegressor(embed_cols=['brand_name','category_name','item_condition_id', 'cat1', 'cat2', 'cat3'], 
                  embed_dims=[(6000, 40),(1500, 30), (5,4), (15,4), (120, 10), (900, 20)],
                  text_embed_cols=['name', 'item_description', 'item_desc2gram'],
                  text_embed_dims=[(20000, 50), (100000, 100), (20000, 50)],
                  text_embed_seq_lens =[7, 70, 30],
                  #text_embed_tokenizers = [tok_name, tok_desc, tok_desc2],
                  dense_cols=['shipping', 'desc_words', 'desc_chars', 'name_chars', 'name_words',
                                'iphone_case', 'iphone6', 'iphone6p',
                                'iphone5', 'iphone5p', 'iphone7', 'iphone7p', 'unlocked_phone',
                              'brand_counts', 'cat_counts',
                                   'cat1_counts', 'cat2_counts', 'cat3_counts'
                                  ],
                  epochs=5,
                  batchsize=2048 ,
                  num_layers = 1,
                  layer_dropouts=[0.22],
                  layer_dims=[200],
                  seed=1,
                  val_size=0.025,
                 )

oof_preds1 = cross_val_predict(nnet1, X, y, verbose=10, cv=cvlist)
score = rmse(y, oof_preds1)
print(score)

In [82]:
gc.collect()

19063

In [64]:
nnet2 = EM_NNRegressor(embed_cols=['brand_name','category_name','item_condition_id', 'cat1', 'cat2', 'cat3'], 
                  embed_dims=[(6000, 30),(1500, 25), (5,4), (15,4), (120, 10), (900, 20)],
                  text_embed_cols=['name', 'item_description', 'item_desc2gram'],
                  text_embed_dims=[(20000, 30), (50000, 30), (20000, 30)],
                  text_embed_seq_lens =[7, 70, 30],
                  #text_embed_tokenizers = [tok_name, tok_desc, tok_desc2],
                  dense_cols=['shipping', 'desc_words', 'desc_chars', 'name_chars',
                                'iphone_case', 'iphone6', 'iphone6p',
                                'iphone5', 'iphone5p', 'iphone7', 'iphone7p', 'unlocked_phone',
                              'brand_counts', 'cat_counts',
                                   'cat1_counts', 'cat2_counts', 'cat3_counts'],
                  epochs=4,
                  batchsize=2048 ,
                  num_layers = 1,
                  layer_dropouts=[0.2],
                  layer_dims=[100],
                  seed=2,
                  val_size=0.02
                 )
oof_preds2 = cross_val_predict(nnet1, X, y, verbose=10, cv=cvlist)
score = rmse(y, oof_preds2)
print(score)

[<tf.Tensor 'reshape_298/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_299/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_300/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_301/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_302/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_303/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_304/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_305/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_306/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_59:0' shape=(?, 17) dtype=float32>]
(1155692, 29) (29634, 29) (1155692,) (29634,)
Train on 1155692 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


[<tf.Tensor 'reshape_307/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_308/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_309/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_310/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_311/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_312/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_313/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_314/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_315/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_61:0' shape=(?, 17) dtype=float32>]
(1155692, 29) (29634, 29) (1155692,) (29634,)
Train on 1155692 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.8min remaining:    0.0s


[<tf.Tensor 'reshape_316/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_317/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_318/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_319/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_320/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_321/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_322/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_323/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_324/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_63:0' shape=(?, 17) dtype=float32>]
(1155692, 29) (29634, 29) (1155692,) (29634,)
Train on 1155692 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.3min remaining:    0.0s


[<tf.Tensor 'reshape_325/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_326/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_327/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_328/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_329/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_330/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_331/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_332/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_333/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_65:0' shape=(?, 17) dtype=float32>]
(1155693, 29) (29634, 29) (1155693,) (29634,)
Train on 1155693 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  9.7min remaining:    0.0s


[<tf.Tensor 'reshape_334/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_335/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_336/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_337/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_338/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_339/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_340/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_341/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_342/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_67:0' shape=(?, 17) dtype=float32>]
(1155693, 29) (29634, 29) (1155693,) (29634,)
Train on 1155693 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.47740754 9.648284
0.4204833150009789


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.3min finished


In [65]:
nnet3 = EM_NNRegressor(embed_cols=['brand_name','category_name','item_condition_id', 'cat1', 'cat2', 'cat3'], 
                  embed_dims=[(6000, 30),(1500, 20), (5,4), (15,4), (120, 10), (900, 20)],
                  text_embed_cols=['name', 'item_description', 'item_desc2gram'],
                  text_embed_dims=[(20000, 50), (50000, 50), (20000, 50)],
                  text_embed_seq_lens =[7, 70, 30],
                  #text_embed_tokenizers = [tok_name, tok_desc, tok_desc2],
                  dense_cols=['shipping', 'desc_words', 'desc_chars', 'name_chars',
                                'iphone_case', 'iphone6', 'iphone6p',
                                'iphone5', 'iphone5p', 'iphone7', 'iphone7p', 'unlocked_phone',
                              'brand_counts', 'cat_counts',
                                   'cat1_counts', 'cat2_counts', 'cat3_counts'],
                  epochs=4,
                  batchsize=2048 ,
                  num_layers = 1,
                  layer_dropouts=[0.2],
                  layer_dims=[200],
                  seed=3,
                  val_size=0.02,
                 )
oof_preds3 = cross_val_predict(nnet1, X, y, verbose=10, cv=cvlist)
score = rmse(y, oof_preds3)
print(score)

[<tf.Tensor 'reshape_343/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_344/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_345/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_346/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_347/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_348/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_349/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_350/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_351/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_69:0' shape=(?, 17) dtype=float32>]
(1155692, 29) (29634, 29) (1155692,) (29634,)
Train on 1155692 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.5min remaining:    0.0s


[<tf.Tensor 'reshape_352/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_353/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_354/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_355/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_356/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_357/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_358/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_359/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_360/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_71:0' shape=(?, 17) dtype=float32>]
(1155692, 29) (29634, 29) (1155692,) (29634,)
Train on 1155692 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.0min remaining:    0.0s


[<tf.Tensor 'reshape_361/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_362/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_363/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_364/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_365/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_366/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_367/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_368/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_369/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_73:0' shape=(?, 17) dtype=float32>]
(1155692, 29) (29634, 29) (1155692,) (29634,)
Train on 1155692 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.6min remaining:    0.0s


[<tf.Tensor 'reshape_370/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_371/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_372/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_373/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_374/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_375/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_376/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_377/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_378/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_75:0' shape=(?, 17) dtype=float32>]
(1155693, 29) (29634, 29) (1155693,) (29634,)
Train on 1155693 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 10.1min remaining:    0.0s


[<tf.Tensor 'reshape_379/Reshape:0' shape=(?, 40) dtype=float32>, <tf.Tensor 'reshape_380/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_381/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_382/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_383/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_384/Reshape:0' shape=(?, 20) dtype=float32>, <tf.Tensor 'reshape_385/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_386/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'reshape_387/Reshape:0' shape=(?, 50) dtype=float32>, <tf.Tensor 'dense_cols_77:0' shape=(?, 17) dtype=float32>]
(1155693, 29) (29634, 29) (1155693,) (29634,)
Train on 1155693 samples, validate on 29634 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.5372175 8.981724
0.42130786201521814


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.7min finished


In [66]:
from scipy.stats import hmean, gmean

In [67]:
oof_preds = np.mean(np.hstack((oof_preds1, oof_preds2, oof_preds3)), axis=1)
print(oof_preds.shape)
rmse(y, oof_preds)


(1481658,)
0.4934621 9.394966


0.4137029815802479

In [None]:
#Observations

In [None]:
nnet1.fit(train_data, np.log1p(train_data.price) )
print("Predicting on test data")
test_preds1 = nnet1.predict(test_data)

nnet2.fit(train_data, np.log1p(train_data.price) )
print("Predicting on test data")
test_preds2 = nnet2.predict(test_data)

nnet3.fit(train_data, np.log1p(train_data.price) )
print("Predicting on test data")
test_preds3 = nnet3.predict(test_data)

test_preds = (1/3)*(test_preds1 + test_preds2 + test_preds3)
print("Write out submission")
submission: pd.DataFrame = test_data[['test_id']]
submission['price'] = np.expm1(test_preds)
submission.price = submission.price.clip(3, 2000)
submission.to_csv("embedding_nn_v2.csv", index=False)