In [1]:
import copy
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import sys
import os
import pickle
import hashlib
import string
import unicodedata
import re
from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook())

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, QuantileTransformer
import lightgbm as lgb
from sklearn import metrics
import gc

from collections import defaultdict, OrderedDict, Counter
#from nltk.corpus import stopwords
#from spacy.lang.en.stop_words import STOP_WORDS
from itertools import chain

# from __future__ import print_function
np.random.seed(786)  # for reproducibility

from keras.models import Sequential, Model, load_model
from keras.layers import *
from keras.optimizers import *
from keras.utils import np_utils
from keras.layers.convolutional import Convolution1D, MaxPooling1D, ZeroPadding1D, AveragePooling1D
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import Callback, ModelCheckpoint
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier,  KerasRegressor

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#Functions we need - Feature Selector, Fasttext_Estimator, Preprocessing Transformer, Binary_Encoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from pandas.api.types import is_numeric_dtype, is_string_dtype
from scipy.sparse.csr import csr_matrix
from sklearn.metrics import mean_squared_error, make_scorer

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_sklearn = make_scorer(rmse, greater_is_better=False)    
    
# the following functions allow for a parallelized batch generator
class threadsafe_iter(object):
    """
    Takes an iterator/generator and makes it thread-safe by
    serializing call to the `next` method of given iterator/generator.
    """
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()
    def __iter__(self):
        return self

    def __next__(self):
        with self.lock:
            return next(self.it)

def threadsafe_generator(f):
    """
    A decorator that takes a generator function and makes it thread-safe.
    """
    def g(*a, **kw):
        return threadsafe_iter(f(*a, **kw))
    return g

@threadsafe_generator
def batch_generator(X_data, y_data, batch_size):
    
    #index = np.random.permutation(X_data.shape[0])    
    #X_data = X_data[index]
    #y_data = y_data[index]
    
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    #idx = 1
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        y_batch = y_data[index_batch]
        counter += 1
        yield np.array(X_batch),y_batch
        #print("")
        #print(X_batch.shape)
        #print("")
        #print('generator yielded a batch %d' % idx)
        #idx += 1
        if (counter > number_of_batches):
            counter=0
            
            
@threadsafe_generator
def batch_generator_x(X_data,batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(X_data)[0])
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        counter += 1
        yield np.array(X_batch)
        if (counter > number_of_batches):
            counter=0

In [7]:
num_partitions = 30
num_cores = 16
from multiprocessing import Pool, cpu_count
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
stop_words = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 
              'there', 'about', 'once', 'during', 'out', 'very', 'having', 
              'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 
              'its', 'yours', 'such', 'into', 'most', 'itself', 'other', 
              'off', 'is', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 
              'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 
              'through', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 
              'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 
              'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before', 'them',
              'same', 'and', 'been', 'have', 'in', 'will', 'does', 'yourselves', 
              'then', 'that', 'because', 'what', 'over', 'why’, ‘so', 'can', 'did',
              'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only',
              'myself', 'which', 'those', 'i','after', 'few', 'whom', 'being', 'if', 
              'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']

def unicodeToAscii(s):
    return  unicodedata.normalize('NFKC', s)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"'", r"", s)
    s = re.sub(r"[.!?':;,]", r" ", s)
    s = re.sub(r"-", r"", s)
    s = re.sub(r"[^0-9a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"0", r"zero", s)
    s = re.sub(r"1", r"one", s)
    s = re.sub(r"2", r"two", s)
    s = re.sub(r"3", r"three", s)
    s = re.sub(r"4", r"four", s)
    s = re.sub(r"5", r"five", s)
    s = re.sub(r"6", r"six", s)
    s = re.sub(r"7", r"seven", s)
    s = re.sub(r"8", r"eight", s)
    #s = re.sub(r"/s/s", r"/s", s)
    return s

def _normalize_and_ngrams(sent, ngrams):
    input_list = normalizeString(sent).split()
    input_list = [word for word in input_list if word not in stop_words]
    s = input_list.copy()
    for i in range(2, ngrams+1):
        s += [' '.join(input_list[j:j+i]) for j in range(len(input_list)-i + 1)]
        #s += list((zip(*[input_list[j:] for j in range(i)])))
    return s

#tmp = "I am not a dance'r and i am a 6ixy   c-o:d;er programmer"
#print(normalizeString(tmp))
#print(_normalize_and_ngrams(tmp, 3))

class Vocab_topwords():
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.index2word = {}
        
    def fit_data(self, data, col, ngrams=3, max_features=50000):
        c = Counter(list(chain.from_iterable(data[col].tolist())))
        for i, (w, count) in enumerate(c.most_common(max_features)):
            self.word2index[w] = i
        return
    

            
            
def prepareVocab(name, data, max_features):
    vocab = Vocab_topwords(name)
    vocab.fit_data(data, name, max_features=max_features)
    
    print("Counted words:")
    print(vocab.name, len(vocab.word2index))
    return vocab

def indexesFromSentence(vocab, tokens, ngrams, max_len):
    num_list = []
    for i, item in enumerate(tokens):
        if len(num_list) == max_len:
            break
        elif item in vocab.word2index:
            num_list.append(vocab.word2index[item])
        else:
            continue
        
    if len(num_list) < max_len :
        num_list += [0]*(max_len - len(num_list) )
        
    return num_list

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def get_cat_1(x): return str(x).split('/')[0]
def get_cat_2(x): return str(x).split('/')[1] if len(str(x).split('/')) > 1 else -1
def get_cat_3(x): return ' '.join(str(x).split('/')[2:]) if len(str(x).split('/')) > 2 else -1

def applycat1(df): 
    df['cat1'] = df['category_name'].progress_apply(get_cat_1)
    return df

def applycat2(df): 
    df['cat2'] = df['category_name'].progress_apply(get_cat_2)
    return df

def applycat3(df): 
    df['cat3'] = df['category_name'].progress_apply(get_cat_3)
    return df

def norm3grams(s): return _normalize_and_ngrams(s, 3)

def applyname(series): return series.progress_apply(norm3grams)

def index2sent1(x, name_vocab): return indexesFromSentence(name_vocab, x, 3, 10)

def name2index(series): return series.progress_apply(lambda x: index2sent1(x, name_vocab))

def norm2grams(s): return _normalize_and_ngrams(s, 2)

def applydesc(series):return series.progress_apply(norm2grams)

def index2sent2(x, desc_vocab): return indexesFromSentence(desc_vocab, x, 2, 80)

def desc2index(series): return series.progress_apply(lambda x: index2sent2(x, desc_vocab))

def read_data(in_path, out_path):
    if False and os.path.exists(os.path.join(out_path, 'train_2.pkl')) and os.path.exists(os.path.join(out_path, 'test_2.pkl')):
        train_data = pd.read_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data  = pd.read_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data
    
    else:
        train_data = pd.read_table(os.path.join(in_path, 'train.tsv'))
        test_data  = pd.read_table(os.path.join(in_path, 'test.tsv'))
    
        train_rows = len(train_data)
        data = pd.concat([train_data, test_data], ignore_index=True)
        
        data['name'] = data['name'].astype(str)
        data['item_description'] = data['item_description'].astype(str)
        
        #ddata = dd.from_pandas(data, 4)

        
        data = applycat1(data)
        data = applycat2(data)
        data = applycat3(data)
        data.fillna(-1, inplace=True)
        cat_cols = ['category_name', 'brand_name', 'item_condition_id', 'cat1', 'cat2', 'cat3']
        print("Label enoding categoricals")
        for col in cat_cols:
            data[col] = LabelEncoder().fit_transform(data[col].astype(str)).astype(np.int32)
            
        print("Tokenizing text columns")
        data['name'] = parallelize_dataframe(data['name'], applyname)
        print("Preparing vocabs")
        global name_vocab
        name_vocab = prepareVocab('name', data[['name']], 50000)
        data['name'] = name2index(data['name'])
        del name_vocab
        
        print("Transforming text to sequences")
        data['item_description'] = parallelize_dataframe(data['item_description'], applydesc)
        global desc_vocab
        desc_vocab = prepareVocab('item_description', data[['item_description']], 50000)
        data['item_description'] = desc2index(data['item_description'])
        del desc_vocab
        
        train_data = data.loc[: train_rows - 1, :]
        train_data = train_data.loc[(train_data.price >= 3) & (train_data.price <= 2000), :].reset_index(drop=True)
        test_data  = data.loc[train_rows: , :].reset_index(drop=True)
        
        del train_data['test_id']
        del test_data['train_id']
        del data
        gc.collect()
        print("Writing out new pickles dataframes")
        train_data.to_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data.to_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data

In [8]:
%%time
train_data, test_data = read_data("../input", "./")

100%|██████████| 2175894/2175894 [00:02<00:00, 813412.97it/s]
100%|██████████| 2175894/2175894 [00:03<00:00, 616726.31it/s]
100%|██████████| 2175894/2175894 [00:03<00:00, 574410.68it/s]


Label enoding categoricals
Tokenizing text columns


100%|██████████| 72530/72530 [00:03<00:00, 23079.55it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22854.34it/s]
100%|██████████| 72530/72530 [00:03<00:00, 23605.23it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22775.35it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22318.84it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22948.05it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22705.52it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22278.23it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22934.23it/s]
100%|██████████| 72530/72530 [00:03<00:00, 21508.33it/s]
100%|██████████| 72530/72530 [00:03<00:00, 23358.91it/s]
  4%|▍         | 3003/72530 [00:00<00:04, 15431.23it/s]]
100%|██████████| 72530/72530 [00:03<00:00, 21866.61it/s]
100%|██████████| 72530/72530 [00:03<00:00, 22577.77it/s]
100%|██████████| 72530/72530 [00:03<00:00, 21462.82it/s]
100%|██████████| 72530/72530 [00:03<00:00, 21561.35it/s]
100%|██████████| 72530/72530 [00:02<00:00, 33201.57it/s]
100%|██████████| 72530/72530 [0

Preparing vocabs


  1%|          | 13306/2175894 [00:00<00:16, 133056.00it/s]

Counted words:
name 50000


100%|██████████| 2175894/2175894 [00:15<00:00, 138474.97it/s]


Transforming text to sequences


100%|██████████| 72530/72530 [00:07<00:00, 9361.64it/s]]
100%|██████████| 72530/72530 [00:07<00:00, 9265.97it/s] 
100%|██████████| 72530/72530 [00:08<00:00, 8998.05it/s]
100%|██████████| 72530/72530 [00:07<00:00, 9101.70it/s]
100%|██████████| 72530/72530 [00:08<00:00, 9019.85it/s]
100%|██████████| 72530/72530 [00:08<00:00, 9025.59it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8767.98it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8986.46it/s]
100%|██████████| 72530/72530 [00:07<00:00, 9181.96it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8912.42it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8925.70it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8745.38it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8825.58it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8862.39it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8503.16it/s]
100%|██████████| 72530/72530 [00:08<00:00, 8755.29it/s]
100%|██████████| 72530/72530 [00:05<00:00, 13059.71it/s]
100%|██████████| 72530/72530 [00:05<00:00, 14

Counted words:
item_description 50000


100%|██████████| 2175894/2175894 [00:32<00:00, 67027.47it/s]


Writing out new pickles dataframes
CPU times: user 2min 30s, sys: 8.18 s, total: 2min 38s
Wall time: 2min 51s


In [5]:
cvlist = list(KFold(n_splits=10).split(train_data, train_data.price))
print(train_data.shape, test_data.shape)
train_data.head()

(1481658, 11) (693359, 11)


Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,train_id,cat1,cat2,cat3
0,2,830,2,"[3, 34, 37, 39, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[4084, 9421, 10256, 125, 16, 3, 55, 20981, 326...",10.0,1,0.0,5,103,774
1,3890,87,2,"[4327, 16, 8, 153, 28, 983, 2, 31, 2, 3952, 65...","[12669, 53875, 34664, 2087, 62723, 0, 0, 0, 0, 0]",52.0,0,1.0,1,31,216
2,4589,1278,0,"[523, 53, 8072, 2, 214, 1076, 1227, 61, 2204, ...","[208, 0, 0, 0, 0, 0, 0, 0, 0, 0]",10.0,1,2.0,9,104,98
3,2,504,0,"[0, 32, 154, 14565, 147, 11, 1211, 1797, 109, ...","[119, 2139, 44620, 0, 0, 0, 0, 0, 0, 0]",35.0,1,3.0,3,56,411
4,2,1205,0,"[788, 12732, 2, 2072, 24425, 6156, 0, 0, 0, 0,...","[6132, 46, 1122, 145, 9686, 1480, 93627, 0, 0, 0]",44.0,0,4.0,9,59,543


In [34]:
from sklearn.base import BaseEstimator, RegressorMixin
class EM_NNRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, embed_cols=None, dense_cols=None, embed_dims=None, 
                 text_embed_cols=None, text_embed_seq_lens=None, 
                 text_embed_dims=None,
                 num_layers=2, multiprocess=False,
                layer_activations=None, layer_dims=None,layer_dropouts=None, epochs=20, batchsize=32,
                optimizer_kwargs=None, val_size=0.1, verbose=1, seed=1):
        
        self.embed_cols = embed_cols
        self.dense_cols = dense_cols
        self.embed_dims = embed_dims
        self.text_embed_cols = text_embed_cols
        self.text_embed_dims = text_embed_dims
        #self.text_embed_tokenizers = text_embed_tokenizers
        self.text_embed_seq_lens = text_embed_seq_lens
        self.dense_dims = None
        self.num_layers = num_layers
        self.layer_dims = layer_dims
        self.layer_activations = layer_activations
        self.layer_dropouts = layer_dropouts
        self.epochs = epochs
        self.batchsize = batchsize
        self.optimizer_kwargs = optimizer_kwargs
        self.val_size = val_size
        self.verbose = verbose
        self.multiprocess = multiprocess
        self.seed = seed
        self.model = None
        if self.dense_cols:
            self.dense_dims = len(self.dense_cols)
            
    def _splitX(self, X):
        X_splits = []
        
        if self.embed_cols:
            for col in self.embed_cols :
                X_splits.append(np.asarray(X[col]))
                
        if self.text_embed_cols:
            for i, col in enumerate(self.text_embed_cols):
                X_splits.append(np.asarray([*X[col].values]))
                
        if self.dense_cols:
            X_splits.append(X[self.dense_cols].values.reshape(X.shape[0], -1))
            
        return X_splits
    
    
    def _build_model(self):
        model_inputs = []
        model_layers = []
        
        if self.embed_cols:
            for col, dim in zip(self.embed_cols, self.embed_dims):
                x1 = Input( shape=(1,), name=col)
                model_inputs.append(x1)
                x1 = Embedding(input_dim=dim[0], output_dim=dim[1],)(x1)
                #x1 = Dropout(0.1)(x1)
                x1 = Reshape(target_shape=(dim[1],))(x1)
                model_layers.append(x1)
                
        if self.text_embed_cols:
            for col, dim, seq_len in zip(self.text_embed_cols, 
                                                self.text_embed_dims, 
                                                self.text_embed_seq_lens):
                x3 = Input( shape=(seq_len,))
                model_inputs.append(x3)
                x3 = Embedding(input_dim=dim[0], output_dim=dim[1], input_length=seq_len,)(x3)
                #x3 = Conv1D(16, return_sequences=True)(x3)
                x3 = GlobalAveragePooling1D()(x3)
                x3 = Reshape(target_shape=(dim[1],))(x3)
                model_layers.append(x3)
                
        if self.dense_cols:
            x2 = Input( shape=(self.dense_dims, ), name='dense_cols')
            model_inputs.append(x2)
            model_layers.append(x2)
        print(model_layers)
        x = concatenate(model_layers)
        
        if self.num_layers > 0:
            for dim, drops in zip(self.layer_dims, self.layer_dropouts):
                x = BatchNormalization()(x)
                x = Dropout(rate=drops)(x)
                x = Dense(dim, activation='selu', kernel_initializer='he_normal')(x)
                #x = PReLU()(x)
        
        x = BatchNormalization()(x)
        x = Dropout(0.02)(x)
        output = Dense(1, activation='linear', kernel_initializer='normal')(x)
        
        model = Model(inputs=model_inputs, outputs=output)
        #print(model.summary())
        #adam = RMSprop(lr=0.001, decay=0.001)
        adam = Adam(lr=0.001, decay=1e-4)
        model.compile(optimizer=adam, loss='mean_squared_error' )
        
        return model 
    
    
    def fit(self, X, y):
        self.model = self._build_model()
        if self.val_size > 0:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_size, random_state=self.seed)
            print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
            
            callbacks= [ModelCheckpoint("embed_NN_"+str(self.seed)+".check", save_best_only=True, verbose=1)]
            if self.multiprocess == False:
                self.model.fit(self._splitX(X_train), y_train, batch_size=self.batchsize, epochs=self.epochs,
                               verbose=self.verbose,
                              validation_data=(self._splitX(X_val), y_val), shuffle=True,
                              callbacks=callbacks)
            else:
                X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_size, random_state=1)

        else:
            self.model.fit(self._splitX(X), y, batch_size=self.batchsize, epochs=self.epochs,
               verbose=self.verbose, shuffle=True)

        
        return self
    
    def predict(self, X, y=None):
        
        if self.model:
            self.model = load_model("embed_NN_"+str(self.seed)+".check")
            y_hat = self.model.predict(self._splitX(X))
        else:
            raise ValueError("Model not fit yet")
            
        return y_hat

In [35]:
nnet = EM_NNRegressor(embed_cols=['brand_name','category_name','item_condition_id', 'cat1', 'cat2', 'cat3'], 
                      embed_dims=[(6000, 30),(1500, 15), (5,4), (15,4), (120, 10), (900, 15)],
                      text_embed_cols=['name', 'item_description'],
                      text_embed_dims=[(50000, 80), (50000, 80)],
                      text_embed_seq_lens =[10, 80], 
                      dense_cols=['shipping'],
                      epochs=10,
                      batchsize=2048,
                      num_layers = 1,
                      layer_dropouts=[0.1],
                      layer_dims=[200],
                      val_size=0.05
                     )



In [36]:
#n_workers = multiprocessing.cpu_count()
#batch_size = 32

In [37]:
#scores = cross_val_score(nnet, train_data, np.log1p(train_data.price), scoring=rmse_sklearn, cv=cvlist, verbose =10)
#print(scores, np.mean(scores))
for seed in range(1):
    nnet.set_params(seed=seed).fit(train_data, np.log1p(train_data.price))

[<tf.Tensor 'reshape_48/Reshape:0' shape=(?, 30) dtype=float32>, <tf.Tensor 'reshape_49/Reshape:0' shape=(?, 15) dtype=float32>, <tf.Tensor 'reshape_50/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_51/Reshape:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'reshape_52/Reshape:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'reshape_53/Reshape:0' shape=(?, 15) dtype=float32>, <tf.Tensor 'reshape_54/Reshape:0' shape=(?, 80) dtype=float32>, <tf.Tensor 'reshape_55/Reshape:0' shape=(?, 80) dtype=float32>, <tf.Tensor 'dense_cols_5:0' shape=(?, 1) dtype=float32>]
(1407575, 11) (74083, 11) (1407575,) (74083,)
Train on 1407575 samples, validate on 74083 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:
nnet.model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 10)           0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           (None, 80)           0                                            
__________________________________________________________________________________________________
brand_name (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
category_name (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
item_condi

In [46]:
nnet.model.layers[10]

[<keras.engine.topology.InputLayer at 0x7fbeb917add8>,
 <keras.engine.topology.InputLayer at 0x7fbeb912ef60>,
 <keras.engine.topology.InputLayer at 0x7fbebd4ae5f8>,
 <keras.engine.topology.InputLayer at 0x7fbebd4aef28>,
 <keras.engine.topology.InputLayer at 0x7fbebaa74d68>,
 <keras.engine.topology.InputLayer at 0x7fbebaa98ac8>,
 <keras.engine.topology.InputLayer at 0x7fbebaa95630>,
 <keras.engine.topology.InputLayer at 0x7fbebaa31198>,
 <keras.layers.embeddings.Embedding at 0x7fbeb9193a58>,
 <keras.layers.embeddings.Embedding at 0x7fbeb912e470>,
 <keras.layers.embeddings.Embedding at 0x7fbebd4aedd8>,
 <keras.layers.embeddings.Embedding at 0x7fbebaa749e8>,
 <keras.layers.embeddings.Embedding at 0x7fbebaab5ef0>,
 <keras.layers.embeddings.Embedding at 0x7fbebaa955f8>,
 <keras.layers.embeddings.Embedding at 0x7fbebaa31e80>,
 <keras.layers.embeddings.Embedding at 0x7fbebaa48c50>,
 <keras.layers.pooling.GlobalAveragePooling1D at 0x7fbeb91937b8>,
 <keras.layers.pooling.GlobalAveragePooling1D 

In [10]:
#Checking sequence lengths
#sns.distplot(data.item_description.apply(lambda x: len(str(x).split())))