In [1]:
import copy
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import sys
import os
import re
import threading
import multiprocessing
import unicodedata
import string
import math

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, RobustScaler, MaxAbsScaler, QuantileTransformer
import lightgbm as lgb
from sklearn import metrics
import gc
from tqdm import tqdm
tqdm.pandas(tqdm)
num_partitions = 8
num_cores = 4
from multiprocessing import Pool, cpu_count
import wordbatch
from  wordbatch.extractors import WordSeq
# from __future__ import print_function
np.random.seed(786)  # for reproducibility

from keras.models import Sequential, Model, load_model
from keras.layers import *
from keras.optimizers import *
from keras.utils import np_utils
from keras.layers.convolutional import Convolution1D, MaxPooling1D, ZeroPadding1D, AveragePooling1D
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import Callback, ModelCheckpoint
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier,  KerasRegressor
#Some classes
#Functions we need - Feature Selector, Fasttext_Estimator, Preprocessing Transformer, Binary_Encoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from pandas.api.types import is_numeric_dtype, is_string_dtype
from scipy.sparse.csr import csr_matrix
from sklearn.metrics import mean_squared_error, make_scorer

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_sklearn = make_scorer(rmse, greater_is_better=False)

def get_obj_cols(df):
    """Return columns with object dtypes"""
    obj_cols = []
    for idx, dt in enumerate(df.dtypes):
        if dt == 'object':
            obj_cols.append(df.columns.values[idx])

    return obj_cols


def convert_input(X):
    """if input not a dataframe convert it to one"""
    if not isinstance(X, pd.DataFrame):
        if isinstance(X, list):
            X = pd.DataFrame(np.array(X))
        elif isinstance(X, (np.generic, np.ndarray)):
            X = pd.DataFrame(X)
        elif isinstance(X, csr_matrix):
            X = pd.SparseDataFrame(X)
        else:
            raise ValueError('Unexpected input type: %s' % (str(type(X))))

        #X = X.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    return X

class FeatureSelector(BaseEstimator, TransformerMixin):
    """ Class to do subset of features in sklearn pipeline"""
    def __init__(self, cols=None, return_df=True, verbose=0):
        self.cols = cols
        self.return_df = return_df
        self.verbose = verbose
        
    def fit(self, X, y=None):
        #Do nothing
        return self
    
    def transform(self, X, y=None):
        #if the input dataset isn't already a dataframe, convert it to one
        X = X.copy(deep=True)
        X = convert_input(X)
        X = X.loc[:, self.col]
        
        if self.verbose:
            print("Selecting columns are {}".format(self.col))
        if self.return_df:
            return X
        else:
            return X.values
    
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """Binary encoding for categorical variables, similar to onehot, 
    but stores categories as binary bitstrings.
    Expects cols to numerical, else throws error
    """
    def __init__(self, verbose=0, cols=None, add_to_df=True, return_df=True):
        self.return_df = return_df
        self.verbose = verbose
        self.cols = cols
        self.add_to_df = add_to_df
        self._dim = None
        self.digits_per_col = {}

    def fit(self, X, y=None, **kwargs):
        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        # first check the type
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)
        #Check if all cols are numeric and no nan's else throws error
        if np.any([is_string_dtype(X[col]) for col in self.cols]):
            raise ValueError("Input contains non-numeric data or is has nan's")

        for col in self.cols:
            self.digits_per_col[col] = self.calc_required_digits(X, col)
            
        return self


    def transform(self, X):
        """Perform the transformation to new categorical data. """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')
        # first check the type
        X = convert_input(X)
        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, ))

        #Check if all cols are numeric and no nan's else throws error
        if np.any([is_string_dtype(X[col]) for col in self.cols]):
            raise ValueError("Input contains non-numeric data ")
        
        X = self.binary(X, cols=self.cols)
        print(X.shape)
        if self.return_df:
            return X
        else:
            return X.values


    def binary(self, X_in, cols=None):
        """
        Binary encoding encodes the integers as binary code with one column per digit.
        """
        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            pass_thru = [col for col in X.columns.values if col not in cols]

        bin_cols = []
        for col in cols:
            # get how many digits we need to represent the classes present
            digits = self.digits_per_col[col]

            # map the ordinal column into a list of these digits, of length digits
            X[col] = X[col].map(lambda x: self.col_transform(x, digits))

            for dig in range(digits):
                X[str(col) + '_%d' % (dig, )] = X[col].map(lambda r: 
                                                int(r[dig]) if r is not None else None)
                bin_cols.append(str(col) + '_%d' % (dig, ))

        if self.add_to_df:
            X = X.reindex(columns=bin_cols + pass_thru)
        else:
            X =  X.reindex(columns=bin_cols)
        return X

        
    @staticmethod
    def calc_required_digits(X, col):
        """
        figure out how many digits we need to represent the classes present
        """
        return int( np.ceil(np.log2(X[col].nunique())) )

    
    @staticmethod
    def col_transform(col, digits):
        """
        The lambda body to transform the column values
        """
        if col is None or float(col) < 0.0:
            return None
        else:
            col = format(col, "0"+str(digits)+'b')
        return col
    
    
    
# the following functions allow for a parallelized batch generator
class threadsafe_iter(object):
    """
    Takes an iterator/generator and makes it thread-safe by
    serializing call to the `next` method of given iterator/generator.
    """
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()
    def __iter__(self):
        return self

    def __next__(self):
        with self.lock:
            return next(self.it)

def threadsafe_generator(f):
    """
    A decorator that takes a generator function and makes it thread-safe.
    """
    def g(*a, **kw):
        return threadsafe_iter(f(*a, **kw))
    return g

@threadsafe_generator
def batch_generator(X_data, y_data, batch_size):
    
    #index = np.random.permutation(X_data.shape[0])    
    #X_data = X_data[index]
    #y_data = y_data[index]
    
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    #idx = 1
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        y_batch = y_data[index_batch]
        counter += 1
        yield np.array(X_batch),y_batch
        #print("")
        #print(X_batch.shape)
        #print("")
        #print('generator yielded a batch %d' % idx)
        #idx += 1
        if (counter > number_of_batches):
            counter=0
            
            
@threadsafe_generator
def batch_generator_x(X_data,batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(X_data)[0])
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        counter += 1
        yield np.array(X_batch)
        if (counter > number_of_batches):
            counter=0
            

class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, thresh=0, func=np.mean, add_to_orig=False):
        self.cols = cols
        self.thresh = thresh
        self.func = func
        self.add_to_orig = add_to_orig
    
    #@numba.jit        
    def fit(self, X, y):
        self.prior = self.func(y)
        self._dict = {}
        for col in self.cols:
            if isinstance(col, (list, tuple)):
                print('here')
                tmp_df = X.loc[: ,col]
                col = tuple(col)
            else:
                tmp_df = X.loc[: ,[col]]
            tmp_df['y'] = y
            print(tmp_df.columns)
            #tmp_df = pd.DataFrame({'eval_col':X[col].values, 'y':y})
            if isinstance(col, (list, tuple)):
                print('here')
                col = tuple(col)
            self._dict[col] = tmp_df.groupby(col)['y'].apply(lambda x: 
                                self.func(x) if len(x) >= self.thresh  else self.prior).to_dict()
                                
            del tmp_df
        return self
    #@numba.jit
    def transform(self, X, y=None):
        X_transformed = []
        for col in self.cols:
            
            if isinstance(col, (list, tuple)):
                tmp_df = X.loc[:, col]
                enc = tmp_df[col].apply(lambda x: self._dict[tuple(col)][tuple(x)]
                                                                     if tuple(x) in self._dict[tuple(col)]
                                                                     else self.prior, axis=1).values
            else:
                tmp_df = X.loc[:, [col]]
                enc = tmp_df[col].apply(lambda x: self._dict[col][x]
                                                                     if x in self._dict[col]
                                                                     else self.prior).values
            del tmp_df
            X_transformed.append(enc)
        
        X_transformed = np.vstack(X_transformed).T
        
        if self.add_to_orig:
            return np.concatenate((X.values, X_transformed), axis=1)
            
        else:
            return X_transformed            
stop_words = ['a', 'an', 'this', 'is', 'the', 'of', 'for']

def unicodeToAscii(s):
    return  unicodedata.normalize('NFKC', s)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"'", r"", s)
    #s = re.sub(r"[.!?':;,]", r" ", s)
    s = re.sub(r"-", r"", s)
    s = re.sub(r"[^0-9a-zA-Z]+", r" ", s)
    s = re.sub(r"lily jade", r"lilyjade", s)
    s = re.sub(r"rae dunn cookie(s){0,1}", r"raedunncookie", s)
    s = re.sub(r"hatchimals", r"hatchimal", s)
    s = re.sub(r"virtual reality", r"vr", s)
    s = re.sub(r" vs ", r" victorias secret ", s)
    s = re.sub(r"google home", r"googlehome", s)
    s = re.sub(r"16 gb", r"16gb ", s)
    s = re.sub(r"256 gb", r"256gb ", s)
    s = re.sub(r"32 gb", r"32gb ", s)
    s = re.sub(r"(?=\w{1,2})iphone ", r"iphone", s)
    s = re.sub(r"(?=\w{1,2})galaxy ", r"galaxy", s)
    s = re.sub("14(k){0,1} gold", '14kgold', s)
    s = re.sub("lululemon bags", 'lululemonbags', s)
    s = re.sub("controller skin", 'controllerskin', s)
    s = re.sub("watch box", 'watchbox', s)
    s = re.sub("blaze band", 'blazeband', s)
    s = re.sub("vault boy", 'vaultboy', s)
    s = re.sub("lash boost", 'lashboost', s)
    s = re.sub("64 g ", '64gb ', s)
    s = re.sub("32 g ", '32gb ', s)
    s = re.sub("go(\s){0,1}pro hero", 'goprohero', s)
    s = re.sub("nmd(s){0,1}(\s){0,1}(r){0,1}(1){0,1}(\s|$)", 'nmdr ', s)
    s = re.sub("private sale", 'privatesale', s)
    s = re.sub("vutton", 'vuitton', s)
    s = re.sub("louis vuitton eva", 'louisvuittoneva', s)
    s = re.sub("apple watch", 'applewatch', s)
    
     
    #s = re.sub(r" 1 ", r" one ", s)
    #s = re.sub(r" 2 ", r" two ", s)
    #s = re.sub(r" 3 ", r" three ", s)
    #s = re.sub(r" 4 ", r" four ", s)
    #s = re.sub(r" 5 ", r" five ", s)
    #s = re.sub(r" 6 ", r" six ", s)
    #s = re.sub(r"7", r"seven", s)
    #s = re.sub(r"8", r"eight", s)
    #s = re.sub(r"/s/s", r"/s", s)
    return s

def _normalize_and_ngrams(sent, ngram):
    input_list = normalizeString(sent).split()
    input_list = [word for word in input_list if word not in stop_words]
    #s = input_list.copy()
    #for i in range(2, ngrams+1):
    #    s += [' '.join(input_list[j:j+i]) for j in range(len(input_list)-i + 1)]
        #s += list((zip(*[input_list[j:] for j in range(i)])))
    s = [''.join(input_list[i:i+ngram]) for i in range(len(input_list))]
    return ' '.join(s[:-1])
            

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
def get_cat_1(x): return str(x).split('/')[0]
def get_cat_2(x): return str(x).split('/')[1] if len(str(x).split('/')) > 1 else -1
def get_cat_3(x): return ' '.join(str(x).split('/')[2:]) if len(str(x).split('/')) > 2 else -1

def applycat1(df): 
    return df['category_name'].progress_apply(get_cat_1)
    

def applycat2(df): 
    return df['category_name'].progress_apply(get_cat_2)
    

def applycat3(df): 
    return df['category_name'].progress_apply(get_cat_3)

def get_words(series): return series.progress_apply(lambda x: len(str(x).split()))

def get_chars(series): return series.progress_apply(lambda x: len(str(x)))

def get_tokens(series): return np.sum(np.array(series.tolist()) > 0, axis=1)

def isphonecase(series): return series.str.contains(' case ', flags=re.IGNORECASE).astype(int)

def isiphone6(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('6|six', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone6p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('6|six', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone5(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('5|five', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone5p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('5|five', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone7(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('7|seven', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone7p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('7|seven', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isunlocked(series): return series.str.contains('unlocked', flags=re.IGNORECASE).astype(int)

def plussigns(series): return series.apply(lambda x: sum([(s == '+') | (s == '➕') for s in str(x)]))

def andsigns(series): return series.apply(lambda x: sum([(s == '&') | (s == ' and ') for s in str(x)]))

def commas(series): return series.apply(lambda x: sum([s == ',' for s in str(x)]))

def add_ngrams(text, ngram=2):
    word_list = normalizeString(text).split(' ')
    out_list = [''.join(word_list[i:i+ngram]) for i in range(len(word_list))]
    return ' '.join(out_list[:-1])

def get_2grams(series): return series.apply(lambda x: _normalize_and_ngrams(str(x), 2))

def norm3grams(s): return _normalize_and_ngrams(s, 3)

def applyname(series): return series.progress_apply(norm3grams)

def index2sent1(x, name_vocab): return indexesFromSentence(name_vocab, x, 3, 10)

def name2index(series): return series.progress_apply(lambda x: index2sent1(x, name_vocab))

def norm2grams(s): return _normalize_and_ngrams(s, 1)

def applydesc(series):return series.progress_apply(norm2grams)

def index2sent2(x, desc_vocab): return indexesFromSentence(desc_vocab, x, 1, 80)

def desc2index(series): return series.progress_apply(lambda x: index2sent2(x, desc_vocab))

def indexesFromSentence(vocab, tokens, ngrams, max_len):
    num_list = []
    for i, item in enumerate(tokens):
        if len(num_list) == max_len:
            break
        elif item in vocab.word2index:
            num_list.append(vocab.word2index[item])
        else:
            continue
        
    if len(num_list) < max_len :
        num_list += [0]*(max_len - len(num_list) )
        
    return num_list

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


def read_data(in_path, out_path):
    if os.path.exists(os.path.join(out_path, 'train_2.pkl')) and os.path.exists(os.path.join(out_path, 'test_2.pkl')):
        train_data = pd.read_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data  = pd.read_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data
    
    else:
        train_data = pd.read_table(os.path.join(in_path, 'train.tsv'))
        test_data  = pd.read_table(os.path.join(in_path, 'test.tsv'))
    
        train_rows = len(train_data)
        data = pd.concat([train_data, test_data], ignore_index=True)
        
        data["item_description"] = data["item_description"].replace("No description yet", "missing")
        
        data['cat1'] = parallelize_dataframe(data[['category_name']], applycat1)
        data['cat2'] = parallelize_dataframe(data[['category_name']], applycat2)
        data['cat3'] = parallelize_dataframe(data[['category_name']], applycat3)
        data.fillna(-1, inplace=True)
        
        print("Getting word/char len features")
        data['desc_words'] = parallelize_dataframe(data['item_description'], get_words)
        data['desc_chars'] = parallelize_dataframe(data['item_description'], get_chars)
        data['name_words'] = parallelize_dataframe(data['name'], get_words)
        data['name_chars'] = parallelize_dataframe(data['name'], get_chars)
        
        
        print("Get iphone features")
        data['iphone_case'] = parallelize_dataframe(data['name'], isphonecase)
        data['iphone6'] = parallelize_dataframe(data['name'], isiphone6)
        data['iphone6p'] = parallelize_dataframe(data['name'], isiphone6p)
        data['iphone5'] = parallelize_dataframe(data['name'], isiphone5)
        data['iphone5p'] = parallelize_dataframe(data['name'], isiphone5p)
        data['iphone7'] = parallelize_dataframe(data['name'], isiphone7)
        data['iphone7p'] = parallelize_dataframe(data['name'], isiphone7p)
        data['unlocked_phone'] = parallelize_dataframe(data['name'], isunlocked)
        
        print("Get brand words")
        wb_brands = wordbatch.WordBatch(normalizeString, n_words=4500)
        wb_brands.fit(data["brand_name"].fillna("missing").astype(str))
        
        print("Label encoding features")
        cat_cols = ['category_name', 'brand_name', 'cat1', 'cat2', 'cat3', 'item_condition_id']
        for col in cat_cols:
            data[col] = LabelEncoder().fit_transform(data[col].astype(str)) + 1
            
        print("Get count features")
        data['brand_counts'] = data.brand_name.map(data["brand_name"].value_counts()).fillna(0).astype(int)

        data['cat_counts'] = data.brand_name.map(data["category_name"].value_counts()).fillna(0).astype(int)
        
        data['cat1_counts'] = data.brand_name.map(data["cat1"].value_counts()).fillna(0).astype(int)

        data['cat2_counts'] = data.brand_name.map(data["cat2"].value_counts()).fillna(0).astype(int)

        data['cat3_counts'] = data.brand_name.map(data["cat3"].value_counts()).fillna(0).astype(int)
  
        
        print("Getting punct related features")
        data["plus_counts"] = parallelize_dataframe(data["item_description"], plussigns)
        data["ands_counts"] = parallelize_dataframe(data["item_description"], andsigns)
        data["comma_counts"] = parallelize_dataframe(data["item_description"], commas)
        data["all_counts"] = data["plus_counts"] + data["ands_counts"] + data["comma_counts"]
        
        #for col in ["name", "item_description"]:
        #    data[col] = data[col].str.replace("'", '').replace('-', '').progress_apply(unicodeToAscii)
        #    data[col] = data[col].progress_apply(remove_puncts)
        
        
        num_cols =  ["desc_words", "desc_chars", "name_words", "name_chars", "plus_counts", 
                    "ands_counts", "comma_counts", "all_counts", "brand_counts", "cat1_counts", 
                   "cat2_counts", "cat3_counts"]
        data[num_cols]  = MaxAbsScaler().fit_transform(data[num_cols])
            
        data["brand_cat"] = data["brand_name"].astype(str) + ' ' + data["category_name"].astype(str)
        data["category_shipping"] = data["category_name"].astype(str) + ' ' + data["shipping"].astype(str)
        
        print("transform brand cat and category_shipping")
        data["brand_cat"] = LabelEncoder().fit_transform(data["brand_cat"])
        data["category_shipping"] = LabelEncoder().fit_transform(data["category_shipping"])
        data['item_desc2gram'] = parallelize_dataframe(data["item_description"], get_2grams)
        
        print("Name to sequences")
        wb_name = wordbatch.WordBatch(normalizeString, n_words=20000)
        wb_name.fit(data["name"])
        
        seq_name = WordSeq(wb_name, {"seq_maxlen": 7,  "seq_truncstart":False, "remove_oovs":True})
        seq_name_desc = WordSeq(wb_name, {"seq_maxlen": 30,  "seq_truncstart":False, "remove_oovs":True})
        seq_brands = WordSeq(wb_brands, {"seq_maxlen": 3,  "seq_truncstart":False, "remove_oovs":True})
        
        data["item_name"] = list(zip(seq_name_desc.transform(wb_name.transform(data["item_description"].astype(str)))))
        data["name_brand"] = list(zip(seq_brands.transform(wb_brands.transform(data["name"].astype(str)))))
        data["name"] = list(zip(seq_name.transform(wb_name.transform(data["name"].astype(str)))))
        
        del wb_name, seq_name, seq_name_desc
        
        print("Desc to sequences")
        wb_desc = wordbatch.WordBatch(normalizeString, n_words=50000, extractor=(WordSeq, {"seq_maxlen": 70,
                                                                                           "seq_truncstart":False,
                                                                                           "remove_oovs":True
                                                                            } ))
        #wb_desc.fit(data["item_description"].astype(str))
        data["desc_brand"] = list(zip(seq_brands.transform(wb_brands.transform(data["item_description"].astype(str)))))
        data["item_description"] = list(zip(wb_desc.fit_transform(data["item_description"].astype(str))))
        del wb_desc
        
        print("Desc 2gram to sequences")
        #wb_desc2 = wordbatch.WordBatch(normalizeString, n_words=20000, extractor=(WordSeq, {"seq_maxlen": 30,
        #                                                                                    "seq_truncstart":False,
        #                                                                                    "remove_oovs":True
        #                                                                    } ))
        #wb_desc2.fit(data["item_desc2gram"].astype(str))
        tok_desc2 = Tokenizer(20000)
        tok_desc2.fit_on_texts(data['item_desc2gram'].astype(str))
        data["item_desc2gram"] = list(zip(sequence.pad_sequences(tok_desc2.texts_to_sequences(data.item_desc2gram.astype(str)),
                                         maxlen=20, padding='post', truncating='post')))
        #del wb_desc2
        
        print("split train test")
        train_data = data.loc[: train_rows - 1, :].reset_index(drop=True)
        train_data = train_data.loc[(train_data.price >= 3), :].reset_index(drop=True)
        test_data  = data.loc[train_rows: , :].reset_index(drop=True)
        
        print("Get target encodings for stuff")
        cvlist = list(KFold(6, random_state=100).split(train_data))
        
        #enc_1 = TargetEncoder(cols=['brand_name'])
        #train_data["brand_mean"] = cross_val_predict(enc_1, train_data[['brand_name']], train_data['price'], cv=cvlist, verbose=10, method='transform', n_jobs=1)
        #test_data["brand_mean"] = enc_1.fit(train_data[['brand_name']], train_data['price']).transform(test_data)
        
        
        #enc_2 = TargetEncoder(cols=['category_name'])
        #train_data["category_mean"] = cross_val_predict(enc_2, train_data[['category_name']], train_data['price'], cv=cvlist, verbose=10, method='transform', n_jobs=1)
        #test_data["category_mean"] = enc_2.fit(train_data[['category_name']], train_data['price']).transform(test_data)
        
        #enc_3 = TargetEncoder(cols=[['brand_name', 'category_name']])
        #train_data["brandcat_mean"] = cross_val_predict(enc_3, train_data[['brand_name', 'category_name']], train_data['price'], cv=cvlist, verbose=10, method='transform', n_jobs=1)
        #test_data["brandcat_mean"] = enc_3.fit(train_data[['brand_name', 'category_name']], train_data['price']).transform(test_data)
        
        #train_data["brandcat_rat"] = train_data["brandcat_mean"]/(1 + train_data["category_mean"])
        #test_data["brandcat_rat"] = test_data["brandcat_mean"]/(1 + test_data["category_mean"])
        
        #train_data["catbrand_rat"] = train_data["brandcat_mean"]/(1 + train_data["category_mean"])
        #test_data["catbrand_rat"] = test_data["brandcat_mean"]/(1 + test_data["brand_mean"])
        
        #enc_4 = TargetEncoder(cols=['brand_name'])
        #train_data["brandvalue"] = cross_val_predict(enc_4, train_data[['brand_name']], train_data["brandcat_rat"], cv=cvlist, verbose=10, method='transform', n_jobs=1)
        #test_data["brandvalue"] = enc_4.fit(train_data[['brand_name']], train_data["brandcat_rat"]).transform(test_data)
        
        #extra_cols = ["brand_mean","category_mean", "brandcat_mean","catbrand_rat", "brandvalue"]
        #scaler = QuantileTransformer()
        #train_data[extra_cols]  = scaler.fit_transform(train_data[extra_cols])
        #train_data[extra_cols]  = scaler.transform(train_data[extra_cols])
        
        print(train_data.head())
        del train_data['test_id']
        del test_data['train_id']
        del data 
        test_data['test_id'] = test_data['test_id'].astype(int)
        train_data.to_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data.to_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data

In [3]:
class EM_NNRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, embed_cols=None, dense_cols=None, embed_dims=None, 
                 text_embed_cols=None, text_embed_seq_lens=None, 
                 text_embed_dims=None, 
                 #text_embed_tokenizers=None,
                 num_layers=2, multiprocess=False,
                layer_activations=None, layer_dims=None,layer_dropouts=None, epochs=20, batchsize=32,
                optimizer_kwargs=None, val_size=0.1, verbose=1, optim=None, seed=1):
        
        self.embed_cols = embed_cols
        self.dense_cols = dense_cols
        self.embed_dims = embed_dims
        self.text_embed_cols = text_embed_cols
        self.text_embed_dims = text_embed_dims
        #self.text_embed_tokenizers = text_embed_tokenizers
        self.text_embed_seq_lens = text_embed_seq_lens
        self.dense_dims = None
        self.num_layers = num_layers
        self.layer_dims = layer_dims
        self.layer_activations = layer_activations
        self.layer_dropouts = layer_dropouts
        self.epochs = epochs
        self.batchsize = batchsize
        self.optimizer_kwargs = optimizer_kwargs
        self.val_size = val_size
        self.verbose = verbose
        self.multiprocess = multiprocess
        self.seed = seed
        self.optim = optim
        self.model = None
        if self.dense_cols:
            self.dense_dims = len(self.dense_cols)
            
    def _splitX(self, X):
        X_splits = []
        
        if self.embed_cols:
            for col in self.embed_cols :
                X_splits.append(X[col].values.reshape(X.shape[0], -1))
                
        if self.text_embed_cols:
            for i, col in enumerate(self.text_embed_cols):
                #max_features = self.text_embed_dims[i][0]
                #max_len = self.text_embed_seq_lens[i]
                #input_text = X[col].astype(str)
                #x_train = tok.texts_to_sequences(input_text)
                #print(np.mean([len(l) for l in x_train]))
                #x_train = sequence.pad_sequences(x_train, maxlen=max_len)
                #X_splits.append(np.array(x_train).reshape(X.shape[0], -1))
                X_splits.append(np.concatenate(X[col].values))
                
        if self.dense_cols:
            X_splits.append(X[self.dense_cols].values.reshape(X.shape[0], -1))
            
        return X_splits
    
    
    def _build_model(self):
        model_inputs = []
        model_layers = []
        
        if self.embed_cols:
            for col, dim in zip(self.embed_cols, self.embed_dims):
                x1 = Input( shape=(1,), name=col)
                model_inputs.append(x1)
                x1 = Embedding(input_dim=dim[0], output_dim=dim[1], )(x1)
                #x1 = Dropout(0.1)(x1)
                x1 = Reshape(target_shape=(dim[1],))(x1)
                model_layers.append(x1)
                
        if self.text_embed_cols:
            for col, dim, seq_len in zip(self.text_embed_cols, 
                                                self.text_embed_dims, 
                                                self.text_embed_seq_lens):
                x3 = Input( shape=(seq_len,))
                model_inputs.append(x3)
                x3 = Embedding(input_dim=dim[0], output_dim=dim[1], input_length=seq_len)(x3)
                x3 = GlobalAveragePooling1D()(x3)
                x3 = Reshape(target_shape=(dim[1],))(x3)
                model_layers.append(x3)
                
        if self.dense_cols:
            x2 = Input( shape=(self.dense_dims, ), name='dense_cols')
            model_inputs.append(x2)
            model_layers.append(x2)
        print(model_layers)
        x = concatenate(model_layers)
        
        if self.num_layers > 0:
            for dim, drops in zip(self.layer_dims, self.layer_dropouts):
                x = BatchNormalization()(x)
                x = Dropout(rate=drops)(x)
                x = Dense(dim, activation='selu', kernel_initializer='he_normal')(x)
                x = PReLU()(x)
        
        x = BatchNormalization()(x)
        x = Dropout(0.03)(x)
        output = Dense(1, activation='linear', kernel_initializer='normal')(x)
        
        model = Model(inputs=model_inputs, outputs=output)
        #print(model.summary())
        adam = self.optim
        #adam = Nadam(lr=0.0012, schedule_decay=0.01)
        model.compile(optimizer=adam, loss='mean_squared_error' )
        
        return model 
    
    
    def fit(self, X, y):
        self.model = self._build_model()
        if self.val_size > 0:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_size, random_state=self.seed)
            print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
            
            callbacks= [ModelCheckpoint("embed_NN_"+str(self.seed)+".check", save_best_only=True, verbose=1)]
            if self.multiprocess == False:
                self.model.fit(self._splitX(X_train), y_train, batch_size=self.batchsize, epochs=self.epochs,
                               verbose=self.verbose,
                              validation_data=(self._splitX(X_val), y_val), shuffle=True,
                              callbacks=callbacks)
            else:
                X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_size, random_state=1)

        else:
            self.model.fit(self._splitX(X), y, batch_size=self.batchsize, epochs=self.epochs,
               verbose=self.verbose, shuffle=True)

        
        return self
    
    def predict(self, X, y=None):
        
        if self.model:
            if (self.val_size > 0):
                model = load_model("embed_NN_"+str(self.seed)+".check")
                y_hat = model.predict(self._splitX(X))
            else:
                y_hat = self.model.predict(self._splitX(X))
        else:
            #y_hat = self.model.predict(self._splitX(X))
            raise ValueError("Model not fit yet")
            
        return y_hat
        
def add_ngrams(text, ngram=2):
    word_list = str(text).lower().split(' ')
    out_list = [''.join(word_list[i:i+ngram]) for i in range(len(word_list))]
    return ' '.join(out_list[:-1])


In [4]:
train_data, test_data = read_data("../input", "./")

In [5]:
train_data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,train_id,cat1,cat2,...,plus_counts,ands_counts,comma_counts,all_counts,brand_cat,category_shipping,item_desc2gram,item_name,name_brand,desc_brand
0,3,831,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([2628, 4723, 5010, 122, 20, 5, 54],)",10.0,1,0.0,6,104,...,0.0,0.0,0.0,0.0,22240,2138,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([306, 1708, 4443],)","([0, 0, 0],)"
1,3891,88,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 5791, 12917, 10230, 1562],)",52.0,0,1.0,2,32,...,0.0,0.0,0.0,0.0,31503,2240,"([25, 4, 305, 1337, 7645, 1015, 1822, 6129, 12...","([1084, 1562, 975, 93, 822, 950, 11, 242, 249,...","([0, 0, 821],)","([803, 658, 2846],)"
2,4590,1279,1,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 0, 0, 0, 195],)",10.0,1,2.0,10,105,...,0.0,0.0,0.008264,0.008264,37612,603,"([624, 4268, 7937, 3321, 235, 2168, 74, 392, 4...","([0, 1354, 19, 76, 169, 24, 85, 11, 169, 646, ...","([0, 0, 0],)","([934, 177, 3323],)"
3,3,505,1,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 0, 118, 1608, 11691],)",35.0,1,3.0,4,57,...,0.0,0.0,0.0,0.0,21883,1426,"([6, 9, 92, 193, 37, 3849, 787, 34, 259, 194, ...","([4, 76, 767, 118, 5196, 2585, 7, 137, 2135, 9...","([0, 794, 2679],)","([86, 794, 243],)"
4,3,1206,1,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 3529, 48, 911, 139],)",44.0,0,4.0,10,60,...,0.0,0.0,0.0,0.0,21352,445,"([2687, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 1821, 1254],)","([0, 2960, 289],)"


In [None]:
for seed in [1,2,3, 786]:
    nnet1 = EM_NNRegressor(embed_cols=['brand_name','category_name','item_condition_id', 'cat1', 'cat2', 'cat3', 
                                       'category_shipping', 
                                       "brand_cat"], 
                      embed_dims=[(5600, 32),(1500, 32), (6,4), (16,4), (121, 8), (900, 16), 
                                  (3500, 32), 
                                  (51000, 32)],
                      text_embed_cols=['name', 'item_description', 'item_desc2gram',  'name_brand', 'desc_brand'],
                      text_embed_dims=[(20002, 32), (50002, 32), (20002, 32), (4501, 16), (4501, 16)],
                      text_embed_seq_lens =[7, 70, 20, 3, 3],
                      #text_embed_tokenizers = [tok_name, tok_desc, tok_desc2],
                      dense_cols=['shipping', 
                                  'desc_words', 'desc_chars', 'name_chars', 'name_words',
                                'iphone_case', 'iphone6', 'iphone6p',
                                'iphone5', 'iphone5p', 'iphone7', 'iphone7p', 'unlocked_phone',
                                  'brand_counts', 'cat_counts',
                                   'cat1_counts', 'cat2_counts', 'cat3_counts',
                                  'plus_counts', 'ands_counts', 'comma_counts', 'all_counts',
                                   #"brand_mean","category_mean", "brandcat_mean","catbrand_rat", "brandvalue"
                                   ],
                      epochs=5,
                      batchsize=2048 ,
                      num_layers = 1,
                      layer_dropouts=[0.12],
                      layer_dims=[256],
                      seed=seed,
                      val_size=0.02,
                      optim=RMSprop(lr=0.009, decay=0.005, clipvalue=2.5, rho=0.9)
                      #optim=Adam(lr=0.004, beta_1=0.9, decay=0.004, clipvalue=2.5)
                     )
    nnet1.fit(train_data, np.log1p(train_data.price) )

[<tf.Tensor 'reshape_183/Reshape:0' shape=(?, 32) dtype=float32>, <tf.Tensor 'reshape_184/Reshape:0' shape=(?, 32) dtype=float32>, <tf.Tensor 'reshape_185/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_186/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_187/Reshape:0' shape=(?, 8) dtype=float32>, <tf.Tensor 'reshape_188/Reshape:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'reshape_189/Reshape:0' shape=(?, 32) dtype=float32>, <tf.Tensor 'reshape_190/Reshape:0' shape=(?, 32) dtype=float32>, <tf.Tensor 'reshape_191/Reshape:0' shape=(?, 32) dtype=float32>, <tf.Tensor 'reshape_192/Reshape:0' shape=(?, 32) dtype=float32>, <tf.Tensor 'reshape_193/Reshape:0' shape=(?, 32) dtype=float32>, <tf.Tensor 'reshape_194/Reshape:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'reshape_195/Reshape:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'dense_cols_14:0' shape=(?, 22) dtype=float32>]
(1452027, 38) (29634, 38) (1452027,) (29634,)
Train on 1452027 samples, validate on 29634 samples


In [None]:
import gc
gc.collect()

In [None]:
##Base
#scores - .1689, .1670, .1706, .1690 - .1689
#scores - .1686, .1660, .1739, .1667 - .1688

#Removing item_name
#scores - .1676, .1685, .1717, .1670 - .1687
#scores - .1676, .1679, .1713, .1662 - .1683

#Removing len features
#scores - .1706, .1681, .1723, .1692 - .1701 -- including again

#Adding normalization (item_name, category_shipping removed and len features included)
#scores - .1690, .1671, .1713, .1679 - .1689
#scores - .1701, .1676, .1727, .1672 - .1694

#desc embed dim 64 --> 32
#scores - .1683, .1674, .1724, .1672 - .1688 (hard to measure change, lets keep it here)

#Introducing grad clip value of 2.5
#scores - .1690, .1670, .1715, .1663 - .1685

#Introducing grad clip value of 1.0 -- no impact i guess
#scores - .1688, .1672, .1714, .1683 - .1689

#Add ncategory shipping and grad clip 1.0
#scores - .1682, .1671, .1718, .1672 - .1686

#Add ncategory shipping and grad clip 5.0
#scores - .1686, .1677, .1726, .1674 - 

#grad clip 2.5 and rho 0.8
#scores - .1702, .1674, .1721, .1675

#grad clip 2.5 and rho 0.9, epislon 1e-5
#scores - .1688, .1674, .1737, .1677

#Adam(lr=0.004, beta_1=0.9, decay=0.004, clipvalue=2.5)
#scores - .1732, .1735, .1763, .1712

#128 --> 256 intermediate layer
#scores - .1693, .1680, .

#128 --> 64 intermediate layer
#scores