In [87]:
import re
import nltk
nltk.download('punkt')
import feather
import pandas as pd
from keras.callbacks import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models.fasttext import FastText
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPooling1D, concatenate, BatchNormalization, PReLU
from keras.layers import Reshape, Flatten, Concatenate, SpatialDropout1D, GlobalAveragePooling1D, Multiply
from keras.optimizers import Adam, Optimizer
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold
from pymagnitude import *
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from scipy.stats import rankdata
from gensim.models import word2vec, KeyedVectors


# Module-level stemmers; the embedding-matrix builders below use them to derive
# fallback spellings for words that are missing from the embedding vocabulary.
ps = nltk.stem.PorterStemmer()
lc = nltk.stem.lancaster.LancasterStemmer()
sb = nltk.stem.snowball.SnowballStemmer('english')

# Matches a token that contains at least one ASCII digit.  Compiled once here
# instead of once per token inside the loop.
_CONTAINS_DIGIT_RE = re.compile(r'^.*[0-9]+.*$')


def analyzer_embed(text):
    """Normalise a raw text string into space-separated tokens.

    Steps, in order:
      1. lower-case the text and drop newline / tab characters;
      2. surround every character listed in ``puncts`` with spaces;
      3. replace every key of the module-level ``contraction_mapping`` found
         in the text with its mapped value;
      4. split on single spaces, break tokens containing digits into
         alternating digit / non-digit runs, and drop empty tokens.

    Args:
        text: the input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    text = text.lower()
    text = text.replace('\n', '')  # remove line breaks
    text = text.replace('\t', '')  # remove tabs
    puncts = r',.":)(-!?|;\'$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√。【】'
    for punct in puncts:
        text = text.replace(punct, f' {punct} ')
    # NOTE(review): `contraction_mapping` is defined outside this view.  The
    # apostrophe characters are part of `puncts` and have already been padded
    # with spaces above, so keys that contain an apostrophe can no longer
    # match here -- confirm whether this pass was meant to run first.
    for bad_word in contraction_mapping:
        if bad_word in text:
            text = text.replace(bad_word, contraction_mapping[bad_word])

    words = []
    for word in text.split(' '):  # split on single spaces
        if _CONTAINS_DIGIT_RE.fullmatch(word) is not None:
            # Tokens containing digits are split into digit / non-digit runs.
            words.extend(re.findall(r'(\d+|\D+)', word))
            continue
        if not word:  # skip empty strings produced by consecutive spaces
            continue
        words.append(word)

    return " ".join(words)

def get_score(y_true, y_pred):
    """Return Cohen's kappa with quadratic weights for the given labels and predictions."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa
    
def to_bins(x, borders):
    """Return the index of the first border that ``x`` does not exceed.

    Borders are scanned in the order given (they are not sorted here).  If
    ``x`` is greater than every border, ``len(borders)`` is returned.
    """
    return next(
        (position for position, border in enumerate(borders) if x <= border),
        len(borders),
    )
    
class OptimizedRounder(object):
    """Tune four bin borders so that binned scores maximise quadratic kappa.

    ``fit`` performs a coordinate-wise interval-shrinking search: each border
    is tuned in turn inside its own fixed starting interval while the other
    borders are held at their current values, and the whole sweep is repeated
    ten times.
    """

    def __init__(self):
        # Placeholder; fit() replaces it with {'x': [b0, b1, b2, b3]}.
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        """Return the negated kappa of ``X`` binned with borders ``coef``.

        ``idx`` is accepted for call-site symmetry but is not used.
        """
        binned = np.array([to_bins(score, coef) for score in X])
        return -get_score(y, binned)

    def fit(self, X, y):
        """Search the borders for scores ``X`` against labels ``y``; stores them in ``coef_``."""
        borders = [0.2, 0.4, 0.6, 0.8]
        shrink_from_upper = 0.618
        shrink_from_lower = 1 - shrink_from_upper
        search_ranges = [(0.01, 0.3), (0.15, 0.56), (0.35, 0.75), (0.6, 0.9)]
        for _sweep in range(10):
            for idx, (lower, upper) in enumerate(search_ranges):
                # Evaluate the loss at both ends of the starting interval.
                borders[idx] = lower
                loss_lower = self._loss(borders, X, y, idx)
                borders[idx] = upper
                loss_upper = self._loss(borders, X, y, idx)
                for _step in range(20):
                    # Move whichever end currently has the larger loss.
                    if loss_lower > loss_upper:
                        lower = upper - (upper - lower) * shrink_from_upper
                        borders[idx] = lower
                        loss_lower = self._loss(borders, X, y, idx)
                    else:
                        upper = upper - (upper - lower) * shrink_from_lower
                        borders[idx] = upper
                        loss_upper = self._loss(borders, X, y, idx)
        self.coef_ = {'x': borders}

    def predict(self, X, coef):
        """Bin every score in ``X`` with the borders ``coef`` and return a numpy array."""
        return np.array([to_bins(score, coef) for score in X])

    def coefficients(self):
        """Return the list of borders stored by ``fit``."""
        return self.coef_['x']
    
class StratifiedGroupKFold():
    """Group-aware K-fold splitter that stratifies on each group's mean label.

    Every group is assigned the rounded mean of its labels; within each such
    stratum the rows are distributed over folds with ``GroupKFold`` so that a
    group never appears in more than one fold.
    """

    def __init__(self, n_splits=5):
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, valid_index)`` position arrays for each fold.

        Args:
            X: sequence with one entry per sample (only its length matters).
            y: labels; cast to int before the per-group mean is taken.
            groups: group identifier for each sample.

        Note:
            ``GroupKFold`` raises ``ValueError`` when a stratum contains fewer
            distinct groups than ``n_splits``.
        """
        fold = pd.DataFrame([X, y, groups]).T
        fold.columns = ['X', 'y', 'groups']
        fold['y'] = fold['y'].astype(int)
        # Attach the rounded per-group mean label as the stratification key.
        g = fold.groupby('groups')['y'].agg('mean').reset_index()
        fold = fold.merge(g, how='left', on='groups', suffixes=('', '_mean'))
        fold['y_mean'] = fold['y_mean'].apply(np.round)
        fold['fold_id'] = 0
        for unique_y in fold['y_mean'].unique():
            mask = fold.y_mean == unique_y
            selected = fold[mask].reset_index(drop=True)
            # Use the instance setting; the original read a global `n_splits`
            # that only exists if a later notebook cell has been run.
            cv = GroupKFold(n_splits=self.n_splits)
            for i, (_, valid_index) in enumerate(
                    cv.split(range(len(selected)), y=None, groups=selected['groups'])):
                selected.loc[valid_index, 'fold_id'] = i
            fold.loc[mask, 'fold_id'] = selected['fold_id'].values

        indices = np.arange(len(fold))
        fold_ids = fold['fold_id'].values
        for i in range(self.n_splits):
            yield indices[fold_ids != i], indices[fold_ids == i]
    
def get_keras_data(df, description_embeds):
    """Build the dict of named model inputs from a feature frame.

    Reads the module-level column lists ``numerical``, ``important_numerical``,
    ``dense_cols``, ``inception_cols``, ``categorical_features`` and
    ``important_categorical``.  Each categorical column is added under its own
    column name.

    Args:
        df: DataFrame holding all the columns named in those lists.
        description_embeds: value stored under the ``"description"`` key.
    """
    inputs = dict(
        numerical=df[numerical].values,
        important_numerical=df[important_numerical].values,
        description=description_embeds,
        dense_cols=df[dense_cols],
        inception_cols=df[inception_cols],
    )
    inputs.update({col: df[col] for col in categorical_features + important_categorical})
    return inputs

def rmse(y, y_pred):
    """Keras-backend loss: square root of the mean squared difference over the last axis."""
    squared_error = K.square(y - y_pred)
    return K.sqrt(K.mean(squared_error, axis=-1))

def w2v_fornn(train_text, model, max_len):
    """Tokenise texts and build an embedding matrix from a queryable model.

    Args:
        train_text: iterable of strings to tokenise.
        model: embedding object supporting ``word in model``,
            ``model.query(word)`` and ``model.dim`` (the notebook passes a
            pymagnitude ``Magnitude`` instance).
        max_len: length every token sequence is padded / truncated to.

    Returns:
        Tuple ``(padded_sequences, embedding_matrix, embedding_dim, word_index)``.
        Row ``i`` of the matrix belongs to the word with tokenizer index ``i``;
        row 0 stays all-zero.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list(train_text))
    train_text = tokenizer.texts_to_sequences(train_text)
    train_text = pad_sequences(train_text, maxlen=max_len)
    word_index = tokenizer.word_index

    embedding_dim = model.dim
    embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))

    # Spellings to try, in priority order: as-is, upper-case, capitalised,
    # then the Porter, Lancaster and Snowball stems.
    variants = (lambda w: w, str.upper, str.capitalize, ps.stem, lc.stem, sb.stem)
    for word, i in word_index.items():
        for make_variant in variants:
            candidate = make_variant(word)
            if candidate in model:
                embedding_matrix[i] = model.query(candidate)
                break
        else:
            # No variant is in the vocabulary: use whatever vector the model
            # returns when queried with the original word.
            embedding_matrix[i] = model.query(word)

    return train_text, embedding_matrix, embedding_dim, word_index

def fasttext_fornn(train_text, model, max_len):
    """Tokenise texts and build an embedding matrix from a gensim-style model.

    Args:
        train_text: iterable of strings to tokenise.
        model: embedding object supporting ``word in model``,
            ``model.wv[word]`` and ``model.vector_size``.
        max_len: length every token sequence is padded / truncated to.

    Returns:
        Tuple ``(padded_sequences, embedding_matrix, embedding_dim, word_index)``.
        Words for which no spelling variant is found keep an all-zero row.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list(train_text))
    train_text = tokenizer.texts_to_sequences(train_text)
    train_text = pad_sequences(train_text, maxlen=max_len)
    word_index = tokenizer.word_index

    embedding_dim = model.vector_size
    embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))

    # Spellings to try, in priority order: as-is, upper-case, capitalised,
    # then the Porter, Lancaster and Snowball stems.
    variants = (lambda w: w, str.upper, str.capitalize, ps.stem, lc.stem, sb.stem)
    for word, i in word_index.items():
        for make_variant in variants:
            candidate = make_variant(word)
            if candidate in model:
                embedding_matrix[i] = model.wv[candidate]
                break
        # When nothing matches, the row keeps its initial zeros.

    return train_text, embedding_matrix, embedding_dim, word_index

def self_train_w2v_tonn(train_text, max_len, w2v_params, mode="w2v"):
    """Train word vectors on the given texts and build the embedding matrix.

    Args:
        train_text: iterable of strings; used both to train the vectors and
            to fit the Keras tokenizer.
        max_len: length every token sequence is padded / truncated to.
        w2v_params: keyword arguments forwarded to the gensim model constructor.
        mode: ``"w2v"`` trains ``word2vec.Word2Vec``, ``"fasttext"`` trains
            ``FastText``.

    Returns:
        Tuple ``(padded_sequences, embedding_matrix, embedding_dim, word_index)``.
        Words for which no spelling variant is found keep an all-zero row.

    Raises:
        ValueError: if ``mode`` is not one of the two supported values
            (previously this surfaced later as an unbound-variable error).
    """
    if mode not in ("w2v", "fasttext"):
        raise ValueError(
            "mode must be 'w2v' or 'fasttext', got {!r}".format(mode))

    train_corpus = [text_to_word_sequence(text) for text in train_text]
    if mode == "w2v":
        model = word2vec.Word2Vec(train_corpus, **w2v_params)
    else:
        model = FastText(train_corpus, **w2v_params)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list(train_text))
    train_text = tokenizer.texts_to_sequences(train_text)
    train_text = pad_sequences(train_text, maxlen=max_len)
    word_index = tokenizer.word_index

    embedding_dim = model.vector_size
    embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))

    # Spellings to try, in priority order: as-is, upper-case, capitalised,
    # then the Porter, Lancaster and Snowball stems.
    variants = (lambda w: w, str.upper, str.capitalize, ps.stem, lc.stem, sb.stem)
    for word, i in word_index.items():
        for make_variant in variants:
            candidate = make_variant(word)
            if candidate in model:
                embedding_matrix[i] = model.wv[candidate]
                break
        # When nothing matches, the row keeps its initial zeros.

    return train_text, embedding_matrix, embedding_dim, word_index


[nltk_data] Downloading package punkt to /home/keras/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [88]:
# Columns that are fed to the network through embedding layers.
categorical_features = [
     'Breed1',
     'Breed2',
     'Color1',
     'Color2',
     'Color3',
     'Dewormed',
     'FurLength',
     'Gender',
     'Health',
     'MaturitySize',
     'State',
     'Sterilized',
     'Type',
     'Vaccinated',
     'Type_main_breed',
     'BreedName_main_breed',
     'Type_second_breed',
     'BreedName_second_breed',
]
# Length every description token sequence is padded / truncated to.
max_len=128
# Number of leading entries of `use_cols` that are treated as "important".
n_important = 100

# Feature table; it is split into train / test rows at `len_train` in a later cell,
# so at this point it holds both parts.
X_train = feather.read_dataframe('from_kernel/all_datav7.feather')
# Copy the two breed-rank columns over from another version of the feature table.
ranking = feather.read_dataframe("from_kernel/all_datav17.feather")["BreedDogRank_second"]
X_train["BreedDogRank_second"] = ranking
ranking = feather.read_dataframe("from_kernel/all_datav17.feather")["BreedDogRank_main"]
X_train["BreedDogRank_main"] = ranking
# Row position at which the table is later split into train and test parts.
len_train = 14993

# Feature selection: normalise the gains so they sum to 1 and keep the features
# whose share exceeds 0.0002.
use_cols = pd.read_csv("importance10.csv")
use_cols["gain"] = use_cols["gain"] / use_cols["gain"].sum()
use_cols = list(use_cols[use_cols.gain>0.0002].feature.values)
# Explicitly excluded features (list.remove raises ValueError if one is absent).
use_cols.remove("BreedID_y")
use_cols.remove("BreedDogRank_second")
use_cols.remove("BreedDogRank_main")
# Columns whose name contains "dense" / "inception", excluding their svd / nmf variants.
dense_cols = [c for c in X_train.columns if "dense" in c and "svd" not in c and "nmf" not in c]
inception_cols = [c for c in X_train.columns if "inception" in c and "svd" not in c and "nmf" not in c]
# Everything selected that is neither categorical nor one of the dense / inception columns.
numerical = [c for c in use_cols if c not in categorical_features and c not in inception_cols+dense_cols]
#numerical = [c for c in numerical if c in use_cols]

# Split numerical and categorical features by membership in the first
# `n_important` entries of `use_cols`.
# NOTE(review): this presumes importance10.csv is ordered by decreasing gain -- verify.
important_numerical = [c for c in numerical if c in use_cols[:n_important]]
numerical = [c for c in numerical if c not in use_cols[:n_important]]
important_categorical = [c for c in categorical_features if c in use_cols[:n_important]]
categorical_features = [c for c in categorical_features if c not in use_cols[:n_important]]

# Target values and the rescuer id of the first `len_train` rows of train.csv
# (the rescuer id is used as the CV group key in the training cell).
y =  feather.read_dataframe('../input/X_train.feather')["AdoptionSpeed"].values
rescuer_id = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv').loc[:, 'RescuerID'].iloc[:len_train]

#embedding = '../input/quora-embedding/GoogleNews-vectors-negative300.bin'
#model = KeyedVectors.load_word2vec_format(embedding, binary=True)
#X_desc, embedding_matrix, embedding_dim, word_index  = fasttext_fornn(X_train["Description_Emb"], model, max_len)

# Load the GloVe vectors through pymagnitude and build the padded description
# sequences plus the matching embedding matrix.
embedding = "../input/pymagnitude-data/glove.840B.300d.magnitude"
model = Magnitude(embedding)
X_desc, embedding_matrix, embedding_dim, word_index = w2v_fornn(X_train["Description_Emb"], model, max_len)

"""w2v_params = {
    "size": 300,
    "seed": 0,
    "min_count": 1,
    "workers": 1
}
X_desc, embedding_matrix, embedding_dim, word_index = self_train_w2v_tonn(X_train["Description_bow"], max_len, w2v_params, "fasttext")
"""

'w2v_params = {\n    "size": 300,\n    "seed": 0,\n    "min_count": 1,\n    "workers": 1\n}\nX_desc, embedding_matrix, embedding_dim, word_index = self_train_w2v_tonn(X_train["Description_bow"], max_len, w2v_params, "fasttext")\n'

In [89]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Integer-encode every categorical column.  X_train still holds the train and
# test rows together here, so the encoder is fitted on both.
for c in categorical_features + important_categorical:
    X_train[c] = LabelEncoder().fit_transform(X_train[c])
# Treat +/- infinity as missing.
X_train.replace(np.inf, np.nan, inplace=True)
X_train.replace(-np.inf, np.nan, inplace=True)
# Replace each numerical column by its rank, then standardise the ranks.
X_train[important_numerical+numerical] = StandardScaler().fit_transform(X_train[important_numerical+numerical].rank())
# Fill every remaining missing value with 0.
X_train.fillna(0, inplace=True)

# Split the feature table and the padded description sequences at `len_train`:
# rows before it are the training part, rows from it onward the test part.
X_test = X_train.iloc[len_train:]
X_train = X_train.iloc[:len_train]
X_desc_test = X_desc[len_train:]
X_desc_train = X_desc[:len_train]

In [99]:
from keras.engine import InputSpec
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):
    """Attention pooling over the time axis of a 3D tensor.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756].

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) created with
    `return_sequences=True`.  The feature dimension is taken from the input
    shape in `build`; the number of steps must be passed as `step_dim`.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        :param step_dim: number of timesteps of the input sequence.
        :param W_regularizer: regularizer for the per-feature weight vector.
        :param b_regularizer: regularizer for the per-step bias.
        :param W_constraint: constraint for the weight vector.
        :param b_constraint: constraint for the bias.
        :param bias: whether to add a per-step bias to the scores.
        :param kwargs: forwarded to `Layer.__init__`.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # set in build() from the input shape
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and the optional bias (one per step)."""
        assert len(input_shape) == 3

        # `shape=` is passed by keyword, matching AttentionWeightedAverage in
        # this file, rather than relying on the legacy positional form.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The time axis is collapsed by this layer, so no mask is propagated.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # One scalar score per (sample, step): flatten to 2D, project onto W,
        # then restore the (samples, steps) layout.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked timesteps receive zero weight.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise the weights over the steps; epsilon guards against a zero sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim
    
class CyclicLR(Callback):
    """Callback that cycles the optimizer learning rate between two bounds.

    The learning rate is updated after every batch.  Within a cycle of
    ``2 * step_size`` iterations it rises linearly from ``base_lr`` towards
    ``max_lr`` and falls back again; the amplitude is multiplied by
    ``scale_fn``.

    Args:
        base_lr: lower learning-rate bound.
        max_lr: upper learning-rate bound (before scaling).
        step_size: number of iterations in half a cycle.
        mode: ``'triangular'`` (constant amplitude), ``'triangular2'``
            (amplitude halves every cycle) or ``'exp_range'`` (amplitude
            scaled by ``gamma ** iterations``).  Ignored when ``scale_fn``
            is given.
        gamma: base used by ``'exp_range'``.
        scale_fn: optional custom one-argument scaling function.
        scale_mode: ``'cycle'`` to call ``scale_fn`` with the cycle number;
            any other value calls it with the iteration count.  Only used
            together with a custom ``scale_fn``.

    Raises:
        ValueError: if ``scale_fn`` is None and ``mode`` is not one of the
            three built-in modes.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Previously an unknown mode left scale_fn unset and only
                # failed later inside clr().
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is not given, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current value of ``clr_iterations``."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x goes 1 -> 0 -> 1 over a cycle, so (1 - x) is the triangular wave.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            scale = self.scale_fn(cycle)
        else:
            scale = self.scale_fn(self.clr_iterations)
        return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*scale

    def on_train_begin(self, logs=None):
        # Start from base_lr on a fresh run, otherwise resume the cycle.
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        # NOTE: the first argument is named `epoch` for backward compatibility;
        # it is not used in this method.
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        # Record the learning rate that was used for this batch plus its logs.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())
        
class ShakeShake(Layer):
    """Shake-Shake mixing of two branches.

    Takes a list of two tensors.  In the training phase the forward value is
    ``alpha * x1 + (1 - alpha) * x2`` while gradients flow through the
    ``beta`` mix only (the difference is wrapped in ``stop_gradient``);
    ``alpha`` and ``beta`` are drawn uniformly per sample with shape
    ``(batch, 1, 1, 1)``.  Outside the training phase the two branches are
    averaged with equal weight.
    """

    def __init__(self, **kwargs):
        self.ndim = 2
        super(ShakeShake, self).__init__(**kwargs)

    def build(self, input_shape):
        super(ShakeShake, self).build(input_shape)

    def call(self, x):
        assert isinstance(x, list)
        branch_a, branch_b = x

        # Independent per-sample mixing coefficients for forward and backward pass.
        n_samples = K.shape(branch_a)[0]
        alpha = K.random_uniform((n_samples, 1, 1, 1))
        beta = K.random_uniform((n_samples, 1, 1, 1))

        def training_mix():
            return (
                beta * branch_a
                + (1 - beta) * branch_b
                + K.stop_gradient((alpha - beta) * branch_a + (beta - alpha) * branch_b)
            )

        def inference_mix():
            return 0.5 * branch_a + 0.5 * branch_b

        return K.in_train_phase(training_mix, inference_mix)

    def compute_output_shape(self, input_shape):
        # Output has the shape of the first branch.
        assert isinstance(input_shape, list)
        return input_shape[0]
        
def se_block(input, channels, r=8):
    """Gate ``input`` element-wise with a learned sigmoid mask.

    The mask is produced by two Dense layers: a ReLU bottleneck of
    ``channels // r`` units followed by a sigmoid layer of ``channels`` units.
    ``channels`` must equal the size of the last axis of ``input`` for the
    final multiplication to line up.
    """
    bottleneck = Dense(channels//r, activation="relu")(input)
    gate = Dense(channels, activation="sigmoid")(bottleneck)
    return Multiply()([input, gate])
    
import keras

class SWA(keras.callbacks.Callback):
    """Stochastic weight averaging over the final epochs of a training run.

    Starting with the epoch whose (0-based) index equals ``swa_epoch``, a
    running mean of every weight array of the model is maintained.  When
    training ends, the averaged weights are loaded into the model and saved
    to ``filepath``.

    Args:
        filepath: where the averaged weights are written at the end of training.
        swa_epoch: 0-based index of the first epoch that enters the average.
    """

    def __init__(self, filepath, swa_epoch):
        super(SWA, self).__init__()
        self.filepath = filepath
        self.swa_epoch = swa_epoch
        self.swa_weights = None
        self._n_averaged = 0

    def on_train_begin(self, logs=None):
        self.nb_epoch = self.params['epochs']
        # Start every fit() call with an empty average.
        self.swa_weights = None
        self._n_averaged = 0
        print('Stochastic weight averaging selected for last {} epochs.'
              .format(self.nb_epoch - self.swa_epoch))

    def on_epoch_end(self, epoch, logs=None):
        if epoch < self.swa_epoch:
            return

        current = self.model.get_weights()
        if self.swa_weights is None:
            self.swa_weights = current
            self._n_averaged = 1
            return

        # Running mean over *every* weight array.  The original iterated over
        # model.layers while indexing the flat weight list, so it only updated
        # the first len(layers) arrays (or failed with IndexError).
        count = self._n_averaged
        self.swa_weights = [
            (averaged * count + new) / (count + 1)
            for averaged, new in zip(self.swa_weights, current)
        ]
        self._n_averaged = count + 1

    def on_train_end(self, logs=None):
        if self.swa_weights is None:
            # Training never reached swa_epoch; leave the model untouched
            # instead of failing on a missing average.
            print('Stochastic weight averaging skipped: swa_epoch was not reached.')
            return
        self.model.set_weights(self.swa_weights)
        print('Final model parameters set to stochastic weight average.')
        self.model.save_weights(self.filepath)
        print('Final stochastic averaged weights saved to file.')

def get_model(max_len, embedding_dim, emb_n=4, emb_n_imp=16, dout=.5, weight_decay=0.1):
    """Build and compile the multi-input regression network.

    Besides its arguments the function reads these module-level names:
    ``categorical_features``, ``important_categorical``, ``numerical``,
    ``important_numerical``, ``dense_cols``, ``inception_cols``, ``X_train``
    (for the embedding vocabulary sizes) and ``embedding_matrix`` (frozen
    word-embedding weights).  The input names match the keys produced by
    ``get_keras_data``.

    Args:
        max_len: length of the description token sequences.
        embedding_dim: width of the word-embedding vectors.
        emb_n: embedding width for the ordinary categorical features.
        emb_n_imp: embedding width for the important categorical features.
        dout: base dropout rate (some layers use ``dout / 2``).
        weight_decay: currently unused; kept for interface compatibility.

    Returns:
        A Keras ``Model`` with a single linear output, compiled with the
        Adam optimizer and the ``rmse`` loss.
    """
    # --- categorical branch: one embedding per column ---------------------
    inp_cats = []
    embs = []
    for c in categorical_features:
        inp_cat = Input(shape=[1], name=c)
        inp_cats.append(inp_cat)
        embs.append((Embedding(X_train[c].max()+1, emb_n)(inp_cat)))
    for c in important_categorical:
        inp_cat = Input(shape=[1], name=c)
        inp_cats.append(inp_cat)
        embs.append((Embedding(X_train[c].max()+1, emb_n_imp)(inp_cat)))
    # (An unused duplicate `imp_cats` tensor built from the same embeddings
    # was removed here; it never fed into the model output.)
    cats = Flatten()(concatenate(embs))
    cats = Dense(8, activation="linear")(cats)
    cats = BatchNormalization()(cats)
    cats = PReLU()(cats)
    cats = Dropout(dout/2)(cats)

    # --- numerical branch --------------------------------------------------
    inp_numerical =  Input(shape=(len(numerical),), name="numerical")
    inp_important_numerical = Input(shape=(len(important_numerical),), name="important_numerical")
    nums = concatenate([inp_numerical, inp_important_numerical])
    nums = Dense(32, activation="linear")(nums)
    nums = BatchNormalization()(nums)
    nums = PReLU()(nums)
    nums = Dropout(dout)(nums)

    # --- image-feature branch: "dense" and "inception" columns -------------
    inp_dense =  Input(shape=(len(dense_cols),), name="dense_cols")
    x_dense = Dense(16, activation="linear")(inp_dense)
    x_dense = BatchNormalization()(x_dense)
    x_dense = PReLU()(x_dense)

    inp_inception =  Input(shape=(len(inception_cols),), name="inception_cols")
    x_inception = Dense(16, activation="linear")(inp_inception)
    x_inception = BatchNormalization()(x_inception)
    x_inception = PReLU()(x_inception)

    x_img = concatenate([x_dense, x_inception])
    x_img = Dense(32, activation="linear")(x_img)
    x_img = BatchNormalization()(x_img)
    x_img = PReLU()(x_img)
    x_img = Dropout(dout)(x_img)

    # --- description branch: frozen embeddings -> BiLSTM -> BiGRU ----------
    inp_desc = Input(shape=(max_len, ), name="description")
    emb_desc = Embedding(len(embedding_matrix), embedding_dim, weights=[embedding_matrix], trainable=False)(inp_desc)
    emb_desc = SpatialDropout1D(0.2)(emb_desc)
    x1 = Bidirectional(CuDNNLSTM(32, return_sequences=True))(emb_desc)
    x2 = Bidirectional(CuDNNGRU(32, return_sequences=True))(x1)
    #x2 = Conv1D(64, 1)(x1)

    max_pool2 = GlobalMaxPooling1D()(x2)
    avg_pool2 = GlobalAveragePooling1D()(x2)
    att2 = Attention(max_len)(x2)
    conc = Concatenate()([max_pool2, avg_pool2, att2])
    # Three pooled vectors of 2 * 32 features each.
    conc = se_block(conc,64+64+64)
    conc = BatchNormalization()(conc)

    conc = Dense(32, activation="linear")(conc)
    conc = BatchNormalization()(conc)
    conc = PReLU()(conc)
    conc = Dropout(dout)(conc)

    # --- fusion head ---------------------------------------------------------
    x = concatenate([conc, x_img, nums, cats, inp_important_numerical])
    # Widths: conc 32 + x_img 32 + nums 32 + cats 8 + raw important numericals.
    x = se_block(x,32+32+32+8+len(important_numerical))
    x = BatchNormalization()(x)
    x = Dropout(dout)(x)
    # The raw important numericals are appended once more right before the output.
    x = concatenate([x, inp_important_numerical])
    x = BatchNormalization()(x)
    x = Dropout(dout/2)(x)

    out = Dense(1, activation="linear")(x)

    model = Model(inputs=inp_cats+[inp_numerical, inp_important_numerical, inp_dense, inp_inception, inp_desc], outputs=out)
    model.compile(optimizer="adam", loss=rmse)
    return model

In [100]:
# Cross-validated training of the network, followed by threshold optimisation
# on the out-of-fold predictions.
n_splits=5
avg_valid_kappa = 0  # NOTE(review): never updated in this cell
batch_size=128
epochs = 20
coeffs=None  # NOTE(review): not used in this cell

x_test = get_keras_data(X_test, X_desc_test)
# Accumulators: fold-averaged test predictions and out-of-fold train predictions.
y_nn_test = np.zeros((len(X_test),))
y_nn_oof = np.zeros((X_train.shape[0]))

# Folds are grouped by rescuer so that one rescuer never appears in both the
# training and the validation part of a fold.
cv = StratifiedGroupKFold(n_splits=n_splits)
for fold_id, (train_idx, valid_idx) in enumerate(cv.split(range(len(X_train)), y=y, groups=rescuer_id)): 
    x_train = get_keras_data(X_train.iloc[train_idx], X_desc_train[train_idx])
    x_valid = get_keras_data(X_train.iloc[valid_idx], X_desc_train[valid_idx])
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    # A fresh model is built for every fold.
    model = get_model(max_len, embedding_dim)
    # NOTE(review): step_size is derived from the size of the whole training
    # table, not from len(train_idx) -- confirm this is intended.
    clr_tri = CyclicLR(base_lr=1e-5, max_lr=1e-2, step_size=len(X_train)//batch_size, mode="triangular2")
    ckpt = ModelCheckpoint('model.hdf5', save_best_only=True,
                               monitor='val_loss', mode='min')
    swa = SWA("swa.hdf5", epochs-2)
    history = model.fit(x_train, y_train, batch_size=batch_size, validation_data=(x_valid, y_valid), 
                        epochs=epochs, callbacks=[ckpt, clr_tri, swa])
    # NOTE(review): SWA.on_train_end has just installed the averaged weights,
    # but this reloads the best-val_loss checkpoint, so the predictions below
    # come from 'model.hdf5' rather than from the SWA weights ('swa.hdf5').
    model.load_weights('model.hdf5')
    
    y_pred = model.predict(x_valid, batch_size=1000).reshape(-1,)
    # RMSE is computed on the raw predictions, before the rank transform.
    rmse_ = np.sqrt(mean_squared_error(y_valid, y_pred))
    # Replace predictions by their rank divided by the number of predictions.
    y_pred = rankdata(y_pred)/len(y_pred)
    y_nn_oof[valid_idx] = y_pred
    
    y_pred_test = model.predict(x_test, batch_size=1000).reshape(-1,)
    y_pred_test = rankdata(y_pred_test)/len(y_pred_test)
    # Average the rank-scaled test predictions over the folds.
    y_nn_test += y_pred_test / n_splits
    print("Fold{} rmse={}".format(fold_id, rmse_))

# Fit the bin borders on the out-of-fold predictions and report the resulting
# quadratic-weighted kappa.
optR = OptimizedRounder()
optR.fit(y_nn_oof, y)
coefficients = optR.coefficients()
y_nn_oof_opt = optR.predict(y_nn_oof, coefficients)
score = get_score(y, y_nn_oof_opt)
print(score)

Train on 11992 samples, validate on 3001 samples
Stochastic weight averaging selected for last 2 epochs.
Epoch 1/20
  512/11992 [>.............................] - ETA: 3:37 - loss: 2.6857

  % (delta_t_median, self._delta_t_batch))


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Final model parameters set to stochastic weight average.
Final stochastic averaged weights saved to file.
Fold0 rmse=1.0403989991734457
Train on 11993 samples, validate on 3000 samples
Stochastic weight averaging selected for last 2 epochs.
Epoch 1/20
  512/11993 [>.............................] - ETA: 3:42 - loss: 2.6510

  % (delta_t_median, self._delta_t_batch))


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Stochastic weight averaging selected for last 2 epochs.
Epoch 1/20
  512/11995 [>.............................] - ETA: 3:45 - loss: 2.6936

  % (delta_t_median, self._delta_t_batch))


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Final model parameters set to stochastic weight average.
Final stochastic averaged weights saved to file.
Fold2 rmse=1.0567077347760256
Train on 11995 samples, validate on 2998 samples
Stochastic weight averaging selected for last 2 epochs.
Epoch 1/20
  512/11995 [>.............................] - ETA: 3:51 - loss: 2.6963

  % (delta_t_median, self._delta_t_batch))


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [101]:
# Show the quadratic-weighted kappa stored in `score` by the training cell.
print(score)

0.44622823887866003


In [None]:
0.4481

In [35]:
# Save the out-of-fold and the fold-averaged test predictions as .npy files.
np.save("y_nn_oof_nn10_451selffast.npy", y_nn_oof)
np.save("y_nn_test_nn10_451selffast.npy", y_nn_test)

In [6]:
# Print the layer-by-layer summary of the object currently bound to `model`.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
description (InputLayer)        (None, 128)          0                                            
__________________________________________________________________________________________________
embedding_19 (Embedding)        (None, 128, 300)     7245000     description[0][0]                
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 128, 300)     0           embedding_19[0][0]               
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 128, 64)      85504       spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
bidirectio

In [34]:
# Save the out-of-fold and the fold-averaged test predictions as .npy files.
np.save("y_nn_oof_nn4463.npy", y_nn_oof)
np.save("y_nn_test_nn4463.npy", y_nn_test)

In [68]:
from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K


# https://github.com/bfelbo/DeepMoji/blob/master/deepmoji/attlayer.py
class AttentionWeightedAverage(Layer):
    """
    Attention pooling: collapses the time axis of a 3D input into a weighted
    average of its timesteps.  A single learned vector (one parameter per
    channel) scores each timestep; the scores are normalised with a softmax
    over time.  With ``return_attention=True`` the attention weights are
    returned alongside the pooled output.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(** kwargs)

    def build(self, input_shape):
        """Create the (channels, 1) scoring weight."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        n_channels = input_shape[2]
        self.W = self.add_weight(shape=(n_channels, 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted average of ``x`` over its timesteps."""
        # Score every timestep; the reshape to (batch, steps) avoids an issue
        # with Tensorflow and 1-dimensional weights.
        scores = K.dot(x, self.W)
        dynamic_shape = K.shape(x)
        scores = K.reshape(scores, (dynamic_shape[0], dynamic_shape[1]))

        # Subtracting the row maximum keeps the exponentials numerically stable.
        exp_scores = K.exp(scores - K.max(scores, axis=-1, keepdims=True))

        # Masked timesteps contribute zero weight.
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            exp_scores = exp_scores * mask

        normaliser = K.sum(exp_scores, axis=1, keepdims=True) + K.epsilon()
        att_weights = exp_scores / normaliser
        result = K.sum(x * K.expand_dims(att_weights), axis=1)

        if not self.return_attention:
            return result
        return [result, att_weights]

    def get_output_shape_for(self, input_shape):
        # Alias that delegates to compute_output_shape.
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        pooled_shape = (input_shape[0], input_shape[2])
        if not self.return_attention:
            return pooled_shape
        return [pooled_shape, (input_shape[0], input_shape[1])]

    def compute_mask(self, input, input_mask=None):
        # No mask is propagated; a list of masks maps to a list of Nones.
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        return None