In [1]:
import re
import nltk
nltk.download('punkt')
import feather
import pandas as pd
from keras.callbacks import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPooling1D, concatenate, BatchNormalization
from keras.layers import Reshape, Flatten, Concatenate, SpatialDropout1D, GlobalAveragePooling1D
from keras.optimizers import Adam, Optimizer
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold
from pymagnitude import *
from keras.preprocessing.text import text_to_word_sequence


# Tokens discarded by ``analyzer`` (compared against the stemmed token).
_ANALYZER_STOP_WORDS = frozenset([
    'i', 'a', 'an', 'the', 'to', 'and', 'or', 'if', 'is', 'are', 'am', 'it',
    'this', 'that', 'of', 'from', 'in', 'on',
])
# ASCII symbol ranges that ``analyzer`` turns into spaces.
_ANALYZER_SYMBOLS = re.compile(r'[!-\/:-@[-`{-~]')
# Matches a whole token that contains at least one ASCII digit.
_ANALYZER_HAS_DIGIT = re.compile(r'^.*[0-9]+.*$')


def analyzer(text):
    """Normalise free text into a space-joined string of stemmed tokens.

    Steps: lower-case, turn line breaks, tabs and ASCII symbols into spaces,
    tokenise with ``nltk.word_tokenize``, stem with the English Snowball
    stemmer, then drop tokens that contain a digit, are in the stop-word
    set, or are shorter than two characters.

    Args:
        text: the string to normalise.

    Returns:
        The surviving stems joined by single spaces (possibly empty).
    """
    text = text.lower()
    # Line breaks and tabs become spaces (not empty strings) so the words on
    # either side of them are not glued into a single token.
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = _ANALYZER_SYMBOLS.sub(' ', text)

    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    stems = (stemmer.stem(token) for token in nltk.word_tokenize(text))

    kept = [
        stem for stem in stems
        if len(stem) >= 2
        and stem not in _ANALYZER_STOP_WORDS
        and _ANALYZER_HAS_DIGIT.fullmatch(stem) is None
    ]
    return " ".join(kept)

def get_score(y_true, y_pred):
    """Return Cohen's kappa with quadratic weights for the two label sets."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa
    
def to_bins(x, borders):
    """Return the index of the first border that is >= ``x``.

    Borders are scanned in the order given; when ``x`` exceeds every border
    (or no comparison succeeds) the result is ``len(borders)``.
    """
    first_match = (
        position for position, upper in enumerate(borders) if x <= upper
    )
    return next(first_match, len(borders))
    
class OptimizedRounder(object):
    """Tune the thresholds that turn continuous predictions into class bins.

    ``fit`` adjusts four thresholds, one at a time, so that the binned
    predictions maximise ``get_score`` against the true labels. The result
    is stored in ``coef_`` as ``{'x': [t0, t1, t2, t3]}``.
    """

    def __init__(self):
        # Stays 0 until ``fit`` replaces it with ``{'x': thresholds}``.
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        """Return the negated score of ``X`` binned with thresholds ``coef``.

        ``idx`` is accepted for call-site compatibility and is not used.
        """
        X_p = np.array([to_bins(pred, coef) for pred in X])
        return -get_score(y, X_p)

    def fit(self, X, y):
        """Search the four thresholds and store them in ``coef_``.

        Performs 10 sweeps over the thresholds. Within a sweep each threshold
        is refined by 20 golden-section-style bracket-shrinking steps inside
        its fixed starting interval while the other thresholds are held at
        their current values.

        Args:
            X: iterable of continuous predictions.
            y: true labels, in the same order as ``X``.
        """
        coef = [1.5, 2.0, 2.5, 3.0]
        golden1 = 0.618
        golden2 = 1 - golden1
        ab_start = [(1, 2), (1.5, 2.5), (2, 3), (2.5, 3.5)]
        for _sweep in range(10):
            for idx, (a, b) in enumerate(ab_start):
                # Loss at both ends of the starting bracket.
                coef[idx] = a
                la = self._loss(coef, X, y, idx)
                coef[idx] = b
                lb = self._loss(coef, X, y, idx)
                for _step in range(20):
                    # Move the worse end of the bracket towards the better one.
                    if la > lb:
                        a = b - (b - a) * golden1
                        coef[idx] = a
                        la = self._loss(coef, X, y, idx)
                    else:
                        b = b - (b - a) * golden2
                        coef[idx] = b
                        lb = self._loss(coef, X, y, idx)
        self.coef_ = {'x': coef}

    def predict(self, X, coef=None):
        """Bin ``X`` with ``coef``; defaults to the fitted thresholds.

        Args:
            X: iterable of continuous predictions.
            coef: thresholds to apply. When omitted, the thresholds found by
                ``fit`` are used.

        Returns:
            A numpy array holding the bin index of every prediction.
        """
        if coef is None:
            coef = self.coefficients()
        return np.array([to_bins(pred, coef) for pred in X])

    def coefficients(self):
        """Return the fitted thresholds.

        Raises:
            RuntimeError: if ``fit`` has not stored any thresholds yet.
        """
        if not isinstance(self.coef_, dict):
            raise RuntimeError(
                "OptimizedRounder has no coefficients yet; call fit() first")
        return self.coef_['x']
    
def get_keras_data(df, description_embeds):
    """Assemble the dict of named model inputs for the rows in ``df``.

    Uses the module-level ``numerical``, ``img_cols`` and
    ``categorical_features`` column lists. The result maps "numerical",
    "description" and "img" plus one key per categorical column to the
    corresponding data.
    """
    inputs = {
        "numerical": df[numerical].values,
        "description": description_embeds,
        "img": df[img_cols]
    }
    # One entry per categorical column, keyed by the column name.
    inputs.update((column, df[column]) for column in categorical_features)
    return inputs

def rmse(y, y_pred):
    """Return the square root of the squared error averaged over the last axis.

    NOTE(review): the mean is taken over ``axis=-1`` only. With a one-unit
    model output that axis presumably has a single element, which would make
    this the per-sample absolute error rather than a batch-wide RMSE.
    Confirm this is intended.
    """
    squared_error = K.square(y - y_pred)
    mean_squared_error_last_axis = K.mean(squared_error, axis=-1)
    return K.sqrt(mean_squared_error_last_axis)

def get_model(max_features, embedding_dim, emb_n=5, dout=.4):
    """Build and compile the multi-input regression network.

    Reads the module-level ``categorical_features``, ``numerical``,
    ``img_cols`` and ``X_train``. A later cell in this file defines
    ``get_model`` again, which rebinds the name once that cell runs.

    Args:
        max_features: first dimension of the "description" input.
        embedding_dim: second dimension of the "description" input.
        emb_n: output size of every categorical ``Embedding``.
        dout: dropout rate of the categorical and numerical branches; the
            final hidden layer uses ``dout/2``.

    Returns:
        A ``Model`` compiled with the "adam" optimizer and the ``rmse`` loss.
    """
    # Categorical branch: one single-value input and one Embedding per
    # column, sized from the largest code found in X_train for that column.
    inp_cats = []
    embs = []
    for c in categorical_features:
        inp_cat = Input(shape=[1], name=c)
        inp_cats.append(inp_cat)
        embs.append((Embedding(X_train[c].max()+1, emb_n)(inp_cat)))
    cats = Flatten()(concatenate(embs))
    cats = Dense(8, activation="relu")(cats)
    cats = Dropout(dout)(cats)
    cats = BatchNormalization()(cats)
    
    # Numerical branch.
    inp_numerical =  Input(shape=(len(numerical),), name="numerical")
    nums = Dense(128, activation="relu")(inp_numerical)
    nums = Dropout(dout)(nums)
    nums = BatchNormalization()(nums)
    
    # Image-feature branch: only batch-normalised, no dense layer.
    inp_img =  Input(shape=(len(img_cols),), name="img")
    x_img = BatchNormalization()(inp_img)
    
    # Description branch: the input already holds word vectors, so there is
    # no Embedding layer here. A bidirectional LSTM feeds a bidirectional
    # GRU, and both sequence outputs are max- and average-pooled.
    inp_desc = Input(shape=(max_features, embedding_dim), name="description")
    emb_desc = SpatialDropout1D(0.3)(inp_desc)
    x1 = Bidirectional(CuDNNLSTM(128, return_sequences=True))(emb_desc)
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True))(x1)
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    avg_pool1 = GlobalAveragePooling1D()(x1)
    avg_pool2 = GlobalAveragePooling1D()(x2)
    conc = Concatenate()([max_pool1, max_pool2, avg_pool1, avg_pool2])
    conc = BatchNormalization()(conc)
    
    # Merge all four branches into the shared head.
    x = concatenate([conc, x_img, nums, cats])
    x = Dense(32, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(dout/2)(x)
    
    # Single linear unit: the model regresses one continuous value.
    out = Dense(1, activation="linear")(x)
    
    # Input order: categorical inputs first, then numerical, img, description.
    model = Model(inputs=inp_cats+[inp_numerical, inp_img, inp_desc], outputs=out)
    model.compile(optimizer="adam", loss=rmse)
    return model


def w2v_pymagnitude_tonn(train_text, path, max_features):
    """Turn every text into a fixed-size matrix of word vectors.

    Each text is split with ``text_to_word_sequence`` and every word is
    looked up in the Magnitude model at ``path``. A text keeps at most its
    first ``max_features`` vectors; shorter texts are padded with zero rows
    at the front so that the vectors sit at the end of the matrix.

    Args:
        train_text: iterable of strings.
        path: location of the Magnitude embedding file.
        max_features: number of rows in every output matrix.

    Returns:
        A tuple ``(vectors, embedding_dim)``: the per-text matrices stacked
        into one array (each matrix is ``max_features`` x ``embedding_dim``)
        and the vector size reported by the model.
    """
    train_corpus = [text_to_word_sequence(text) for text in train_text]
    model = Magnitude(path)
    embedding_dim = model.dim

    result = []
    for words in train_corpus:
        vectors = []
        for word in words:
            try:
                vector = model.query(word)
            except Exception:
                # Best effort: a word whose lookup fails is skipped.
                continue
            vectors.append(vector)

        vectors = vectors[:max_features]
        # Pad width follows the model's own dimension instead of assuming 300.
        padded = np.zeros((max_features, embedding_dim))
        if vectors:
            padded[max_features - len(vectors):] = vectors
        result.append(padded)

    return np.array(result), embedding_dim

[nltk_data] Downloading package punkt to /home/keras/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using TensorFlow backend.


In [2]:
# Whitelist of feature columns. The data-loading cell keeps a column of
# X_train as a "numerical" input only if it appears in this list and is
# neither in categorical_features nor in img_cols, so the categorical and
# img_* names listed here do not add anything to `numerical`.
use_cols = ['ratio_median_Age_groupby_Type_Breed1_Breed2',
 'ratio_median_Age_groupby_Type_Breed1',
 'BreedName_main_breed',
 'crop_y_max',
 'diff_var_Sterilized_groupby_RescuerID_State',
 'annots_score_sum_median',
 'Age_mul_Quantity',
 'img_9',
 'img_163',
 'img_224',
 'diff_mean_Fee_groupby_Type_Breed1_Breed2',
 'ratio_count_Quantity_groupby_RescuerID',
 'img_148',
 'median_Age_groupby_RescuerID_State',
 'diff_max_Quantity_groupby_State',
 'img_141',
 'var_Quantity_groupby_Type_Breed1_Breed2',
 'img_135',
 'gnvec155',
 'diff_var_Age_groupby_MaturitySize',
 'img_49',
 'ratio_sum_Sterilized_groupby_State',
 'mean_Sterilized_groupby_RescuerID_State',
 'img_132',
 'diff_mean_Age_groupby_Type_Breed1',
 'img_223',
 'img_172',
 'img_232',
 'mean_Fee_groupby_Type_Breed1',
 'diff_mean_Fee_groupby_Type_Breed1',
 'diff_var_Sterilized_groupby_RescuerID_Type',
 'img_150',
 'BreedName_second_breed',
 'crop_y_mean',
 'mean_Quantity_groupby_RescuerID_Type',
 'img_189',
 'img_157',
 'ratio_count_Age_groupby_RescuerID_Type',
 'img_56',
 'img_164',
 'img_46',
 'var_MaturitySize_groupby_RescuerID_State',
 'dog_cat_scores_sum_max',
 'num_images_per_pet',
 'glove_mag187',
 'gnvec258',
 'annots_score_sum_max',
 'mean_Quantity_groupby_Type_Breed1_Breed2',
 'annots_score_amax_max',
 'diff_var_Fee_groupby_Type_Breed1',
 'ratio_count_Age_groupby_RescuerID_State',
 'var_MaturitySize_groupby_RescuerID_Type',
 'img_233',
 'img_142',
 'crop_y_min',
 'img_249',
 'mean_Fee_groupby_Type_Breed1_Breed2',
 'ratio_sum_Age_groupby_RescuerID',
 'glove_mag177',
 'diff_mean_Fee_groupby_State',
 'img_35',
 'ratio_var_Fee_groupby_Type_Breed1',
 'dog_cat_scores_amin_max',
 'dog_cat_topics_mean_median',
 'ratio_max_Quantity_groupby_State',
 'img_178',
 'img_166',
 'img_250',
 'img_31',
 'img_191',
 'diff_mean_Age_groupby_Type_Breed1_Breed2',
 'var_Sterilized_groupby_RescuerID',
 'glove_mag276',
 'img_151',
 'glove_mag35',
 'Gender',
 'var_Quantity_groupby_RescuerID_Type',
 'State',
 'annots_top_desc_count_svd_4',
 'var_Fee_groupby_Type_Breed1_Breed2',
 'ratio_sum_Quantity_groupby_State',
 'img_8',
 'mean_Age_groupby_RescuerID',
 'ratio_sum_Age_groupby_State',
 'img_105',
 'img_28',
 'gnvec285',
 'sum_Age_groupby_RescuerID',
 'diff_median_Age_groupby_Type_Breed1_Breed2',
 'glove_mag132',
 'color_red_score_amax_var',
 'img_174',
 'ratio_mean_Fee_groupby_Type_Breed1_Breed2',
 'crop_x_var',
 'gnvec101',
 'var_Age_groupby_RescuerID',
 'img_186',
 'diff_var_Age_groupby_RescuerID_State',
 'fix_Breed1',
 'img_244',
 'gnvec34',
 'min_Quantity_groupby_RescuerID_State',
 'diff_sum_Age_groupby_RescuerID',
 'img_175',
 'img_237',
 'gnvec15',
 'glove_mag44',
 'annots_score_mean_max',
 'annots_score_mean_mean',
 'ratio_var_Fee_groupby_Type_Breed1_Breed2',
 'gnvec6',
 'img_133',
 'img_119',
 'diff_var_Fee_groupby_RescuerID_State',
 'img_243',
 'img_113',
 'img_116',
 'img_156',
 'gnvec282',
 'diff_var_Fee_groupby_Type_Breed1_Breed2',
 'img_242',
 'img_117',
 'img_173',
 'gnvec27',
 'img_4',
 'gnvec87',
 'glove_mag121',
 'gnvec18',
 'glove_mag188',
 'img_188',
 'img_63',
 'img_183',
 'img_48',
 'glove_mag159',
 'gnvec241',
 'img_36',
 'img_221',
 'gnvec31',
 'ratio_mean_Age_groupby_Type_Breed1',
 'img_126',
 'mean_Quantity_groupby_RescuerID',
 'annots_score_amax_median',
 'annots_top_desc_count_svd_3',
 'gnvec230',
 'gnvec175',
 'glove_mag16',
 'glove_mag6',
 'glove_mag57',
 'diff_var_Age_groupby_Type_Breed1',
 'annots_score_amin_mean']

In [3]:
# Columns that get their own named input and Embedding in get_model.
categorical_features = [
    'Breed1',
    'Breed2',
    'Color1',
    'Color2',
    'Color3',
    'Dewormed',
    'FurLength',
    'Gender',
    'Health',
    'MaturitySize',
    'State',
    'Sterilized',
    'Type',
    'Vaccinated',
    'Type_main_breed',
    'BreedName_main_breed',
    'Type_second_breed',
    'BreedName_second_breed',
]
# Number of word-vector rows kept per description.
max_features = 128

X_train = feather.read_dataframe('X_train9.feather')
n_train = len(X_train)
img_cols = ["img_{}".format(i) for i in range(256)]
# Numerical inputs: whitelisted columns that are neither categorical nor
# image features, in X_train column order.
numerical = [
    c for c in X_train.columns
    if c not in categorical_features and c not in img_cols and c in use_cols
]

# NOTE(review): the labels come from '../input/X_train.feather' while the
# features come from 'X_train9.feather'; confirm both share the same row order.
y = feather.read_dataframe('../input/X_train.feather')["AdoptionSpeed"].values

embedding = "../input/pymagnitude-data/glove.840B.300d.magnitude"
# train.csv is parsed once and serves both the grouping key and the text.
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
rescuer_id = train.loc[:, 'RescuerID'].iloc[:n_train]
train[['Description', 'Name']] = train[['Description', 'Name']].astype(str)
train["Description"] = [analyzer(text) for text in train["Description"]]
X_desc, embedding_dim = w2v_pymagnitude_tonn(train["Description"][:n_train], embedding, max_features)

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Integer-encode every categorical column with its own LabelEncoder.
for c in categorical_features:
    encoder = LabelEncoder()
    X_train[c] = encoder.fit_transform(X_train[c])

# Treat +/- infinity as missing values.
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# Rank-transform the numerical columns, then standardise the ranks.
X_train[numerical] = StandardScaler().fit_transform(
    X_train[numerical].rank()
)

# Whatever is still missing becomes 0.
X_train.fillna(0, inplace=True)

In [7]:
from keras.engine import InputSpec
class CyclicLR(Callback):
    """Callback that cycles the optimizer learning rate between two bounds.

    The rate moves linearly from ``base_lr`` up to ``max_lr`` and back over
    ``2 * step_size`` batches. The height of each cycle is multiplied by
    ``scale_fn``, evaluated on the cycle number (``scale_mode='cycle'``) or
    on the iteration count (any other ``scale_mode``).

    Args:
        base_lr: lower bound of the learning rate.
        max_lr: upper bound of the learning rate before scaling.
        step_size: number of batches in half a cycle.
        mode: 'triangular' (constant height), 'triangular2' (height halves
            every cycle) or 'exp_range' (height times ``gamma**iteration``).
            Ignored when ``scale_fn`` is given.
        gamma: base of the 'exp_range' scaling.
        scale_fn: custom one-argument scaling function; overrides ``mode``.
        scale_mode: what ``scale_fn`` receives; only used with ``scale_fn``.

    Raises:
        ValueError: if ``scale_fn`` is omitted and ``mode`` is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail here rather than with an AttributeError on the first
                # call to clr(), where scale_fn would be missing.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is not given, got {!r}".format(self.mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current ``clr_iterations``."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x runs 1 -> 0 -> 1 within a cycle, so (1 - x) is the triangle wave.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            scale = self.scale_fn(cycle)
        else:
            scale = self.scale_fn(self.clr_iterations)
        return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*scale

    def on_train_begin(self, logs=None):
        """Set the initial learning rate on the model's optimizer."""
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the finished batch in ``history`` and set the next rate.

        The first positional argument keeps its original name ``epoch``;
        it is not used by this method.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())
    
def get_model(max_features, embedding_dim, emb_n=5, dout=.4, weight_decay=0.1):
    """Build and compile the multi-input regression network.

    Reads the module-level ``categorical_features``, ``numerical``,
    ``img_cols`` and ``X_train``. This definition replaces the ``get_model``
    defined earlier in the file.

    Args:
        max_features: first dimension of the "description" input.
        embedding_dim: second dimension of the "description" input.
        emb_n: output size of every categorical ``Embedding``.
        dout: dropout rate of the categorical and numerical branches; the
            final hidden layer uses ``dout/2``.
        weight_decay: only referenced by the commented-out AdamW compile
            line, so it currently has no effect.

    Returns:
        A ``Model`` compiled with the "adam" optimizer and the ``rmse`` loss.
    """
    # Categorical branch: one single-value input and one Embedding per
    # column, sized from the largest code found in X_train for that column.
    inp_cats = []
    embs = []
    for c in categorical_features:
        inp_cat = Input(shape=[1], name=c)
        inp_cats.append(inp_cat)
        embs.append((Embedding(X_train[c].max()+1, emb_n)(inp_cat)))
    cats = Flatten()(concatenate(embs))
    cats = Dense(8, activation="relu")(cats)
    cats = Dropout(dout)(cats)
    cats = BatchNormalization()(cats)
    
    # Numerical branch.
    inp_numerical =  Input(shape=(len(numerical),), name="numerical")
    nums = Dense(256, activation="relu")(inp_numerical)
    nums = Dropout(dout)(nums)
    nums = BatchNormalization()(nums)
    
    # Image-feature branch: only batch-normalised, no dense layer.
    inp_img =  Input(shape=(len(img_cols),), name="img")
    x_img = BatchNormalization()(inp_img)
    
    # Description branch: the input already holds word vectors, so there is
    # no Embedding layer here. A bidirectional LSTM feeds a bidirectional GRU.
    inp_desc = Input(shape=(max_features, embedding_dim), name="description")
    emb_desc = SpatialDropout1D(0.3)(inp_desc)
    x1 = Bidirectional(CuDNNLSTM(128, return_sequences=True))(emb_desc)
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True))(x1)

    
    # Both sequence outputs are max- and average-pooled over time.
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    avg_pool1 = GlobalAveragePooling1D()(x1)
    avg_pool2 = GlobalAveragePooling1D()(x2)
    # Disabled attention variants, kept for reference.
    #attn = AttentionWeightedAverage()(x1)
    #att1 = Attention(max_features)(x1)
    #att2 = Attention(max_features)(x2)
    conc = Concatenate()([max_pool1, max_pool2, avg_pool1, avg_pool2])
    conc = BatchNormalization()(conc)
    
    # Merge all four branches into the shared head.
    x = concatenate([conc, x_img, nums, cats])
    x = Dense(128, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(dout/2)(x)
    
    # Single linear unit: the model regresses one continuous value.
    out = Dense(1, activation="linear")(x)
    
    # Input order: categorical inputs first, then numerical, img, description.
    model = Model(inputs=inp_cats+[inp_numerical, inp_img, inp_desc], outputs=out)
    # Disabled AdamW variant, the only place weight_decay would be used.
    #model.compile(optimizer=AdamW(weight_decay=weight_decay), loss=rmse)
    model.compile(optimizer="adam", loss=rmse)
    return model

In [None]:
# Grouped 5-fold cross-validation of the network, followed by threshold
# tuning on the out-of-fold predictions.
n_splits=5
# Folds are grouped by rescuer_id (passed as `groups` to cv.split below).
cv = GroupKFold(n_splits=n_splits)
# NOTE(review): avg_valid_kappa and coeffs are not used again in this cell;
# check whether another cell still needs them.
avg_valid_kappa = 0
batch_size=128
coeffs=None

# Disabled test-set inference.
#x_test = get_keras_data(test_df, desc_embs[len(train_df):])
#y_nn_test = np.zeros((len(test_df),))
# Out-of-fold predictions, filled fold by fold at the validation indices.
y_nn_oof = np.zeros((X_train.shape[0]))

for i, (train_idx, valid_idx) in enumerate(cv.split(range(len(X_train)), y=None, groups=rescuer_id)):
    x_train = get_keras_data(X_train.iloc[train_idx], X_desc[train_idx])
    x_valid = get_keras_data(X_train.iloc[valid_idx], X_desc[valid_idx])
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    # A fresh model per fold.
    model = get_model(max_features, embedding_dim)
    # step_size is derived from the full dataset length, not the fold's
    # training size.
    clr_tri = CyclicLR(base_lr=2e-3, max_lr=4e-2, step_size=len(X_train)//batch_size, mode="triangular2")
    # Every fold writes its best weights (lowest val_loss) to the same file.
    ckpt = ModelCheckpoint('model.hdf5', save_best_only=True,
                               monitor='val_loss', mode='min')
    history = model.fit(x_train, y_train, batch_size=batch_size, validation_data=(x_valid, y_valid), 
                        epochs=25, callbacks=[ckpt, clr_tri])
    # Restore the checkpointed weights before predicting.
    model.load_weights('model.hdf5')
    
    y_pred = model.predict(x_valid, batch_size=1000).reshape(-1,)
    y_nn_oof[valid_idx] = y_pred
    #y_nn_test += model.predict(x_test, batch_size=batch_size).reshape(-1,) / n_splits
    print("Fold{} rmse={}".format(i, np.sqrt(mean_squared_error(y_valid, y_pred))))

# Tune the bin thresholds on all out-of-fold predictions and score the
# resulting classes with get_score.
optR = OptimizedRounder()
optR.fit(y_nn_oof, y)
coefficients = optR.coefficients()
y_nn_oof_opt = optR.predict(y_nn_oof, coefficients)
score = get_score(y, y_nn_oof_opt)
# Prints the out-of-fold RMSE of the raw predictions, then the score.
print(np.sqrt(mean_squared_error(y, y_nn_oof)), score)

Train on 11994 samples, validate on 2999 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
 1152/11994 [=>............................] - ETA: 7s - loss: 0.6999

In [9]:
# Show the out-of-fold RMSE of the raw predictions and the `score` value
# computed in the training cell.
print(np.sqrt(mean_squared_error(y, y_nn_oof)), score)

1.0726994102895673 0.4340790356296971


In [None]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):
    """Keras Layer that implements an Attention mechanism for temporal data.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with
    return_sequences = True. The feature dimension is taken from the input
    shape in `build`; the number of steps must be passed as `step_dim`.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))

    # Arguments
        step_dim: number of time steps of the input.
        W_regularizer: regularizer for the attention weight vector.
        b_regularizer: regularizer for the per-step bias.
        W_constraint: constraint for the attention weight vector.
        b_constraint: constraint for the per-step bias.
        bias: whether to add a per-step bias before the tanh.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # The base initialiser runs first so that it cannot overwrite the
        # attributes assigned below (some Keras versions assign
        # supports_masking inside Layer.__init__).
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

    def build(self, input_shape):
        """Create one weight per feature and, optionally, one bias per step."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword so the call does not depend on the
        # legacy positional form of add_weight.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # One score per (sample, step): the dot product of every step's
        # feature vector with W.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get zero weight before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over the steps; epsilon guards against a zero sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))