In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['glove6b50dtxt', 'petfinder-adoption-prediction']


In [2]:
import re
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model,Sequential

from keras.layers import Dense, Input, Dropout, LSTM, GRU, Activation, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras import optimizers 
np.random.seed(6)

Using TensorFlow backend.


In [3]:
data_df = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
test_df = pd.read_csv("../input/petfinder-adoption-prediction/test/test.csv")

In [4]:
data_desc_df = data_df[['AdoptionSpeed', "Description"]]
data_desc_df = data_desc_df[data_desc_df['Description'] != ""]
data_descriptions = data_desc_df['Description'].apply(lambda x: re.sub(r"[^a-z0-9 ]+", "", str(x).lower()))



In [5]:
print (data_df.shape)
print (data_desc_df.shape)
print (len(data_descriptions))

(14993, 24)
(14993, 2)
14993


In [6]:
# Code from DLNg to read in the Glove weights:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words, words_to_index, index_to_words, word_to_vec_map


In [7]:
vocab, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../input/glove6b50dtxt/glove.6B.50d.txt')

In [8]:
"worm" in vocab

True

In [9]:
#test
word = "yielded"
index = 400000
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])
print(word_to_vec_map[word])
print(index_to_word[10])
word_to_vec_map[index_to_word[10]].shape[0]

the index of yielded in the vocabulary is 393810
the 400000th word in the vocabulary is ￥
[ 4.1846e-01  4.1030e-01  2.0033e-01  3.6384e-01  6.7866e-01  1.6747e-01
 -4.8366e-01  3.7455e-01 -2.3350e-01  1.0087e+00 -7.6460e-01 -4.9268e-01
  1.3099e-01 -2.2862e-01  4.0837e-01 -4.0880e-01  2.9919e-02 -8.7655e-01
 -2.2908e-01  2.0841e-01  2.1926e-01 -1.5913e-01  6.9237e-01 -1.3015e+00
  3.4532e-01  1.3344e-01 -3.4478e-02 -1.7596e-03  3.5787e-01  1.0180e-01
  1.8439e+00 -7.9155e-01  1.9466e-01  1.1955e-01  1.0515e+00 -6.4883e-02
 -2.1768e-01  8.3643e-01 -3.5975e-01 -2.8557e-01 -5.8929e-01 -7.0715e-01
  1.3815e-01  1.5321e-01 -8.9717e-02 -2.8388e-01  3.8445e-01  1.2361e+00
  2.4727e-02 -7.1862e-01]
##


50

In [10]:
#maxLen = len(max(data_descriptions, key=len).split())
maxLen = 15

In [11]:
def sentences_to_indices(X, vocab, word_to_index, max_len):
    """
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. We will cut the sentence short at max_len. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    X_indices = np.zeros ((m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words =X[i].lower().split()[:maxLen]
        j = 0
        # Loop over the words of sentence_words
        for w in sentence_words:
            if w in vocab:
                X_indices[i, j] = word_to_index [w]
                j = j + 1
    return X_indices

In [12]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros ((vocab_len, emb_dim))
   
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. 
    embedding_layer = Embedding (vocab_len, emb_dim, trainable = False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [13]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

Instructions for updating:
Colocations handled automatically by placer.
weights[0][1][3] = -0.3403


In [14]:
def build_model (input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input (shape=input_shape, dtype = "int32")
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    X = GRU(128, return_sequences=True) (embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout (0.5) (X)
    X = BatchNormalization()(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    X = GRU(128) (X)
    # Add dropout with a probability of 0.5
    X = Dropout (0.5) (X)
    X = BatchNormalization()(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
    X = Dense (5, activation = "softmax" ) (X)
    # Add a softmax activation
    X = Activation ("softmax") (X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model (inputs = sentence_indices, outputs = X)
       
    return model

In [15]:
gru_model = build_model ((maxLen,), word_to_vec_map, word_to_index)
gru_model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 15)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 15, 50)            20000050  
_________________________________________________________________
gru_1 (GRU)                  (None, 15, 128)           68736     
_________________________________________________________________
dropout_1 (Dropout)          (None, 15, 128)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 15, 128)           512       
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               98688     
___________________________

In [16]:
def kappa_loss(y_pred, y_true, y_pow=2, eps=1e-10, bsize=256, N=5, name='kappa'):
    """A continuous differentiable approximation of discrete kappa loss.
        Args:
            y_pred: 2D tensor or array, [batch_size, num_classes]
            y_true: 2D tensor or array,[batch_size, num_classes]
            y_pow: int,  e.g. y_pow=2
            N: typically num_classes of the model
                        eps: a float, prevents divide by zero
            name: Optional scope/name for op_scope.
        Returns:
            A tensor with the kappa loss."""
    with tf.name_scope(name):
        y_true = tf.to_float(y_true)
        repeat_op = tf.to_float(tf.tile(tf.reshape(tf.range(0, N), [N, 1]), [1, N]))
        repeat_op_sq = tf.square((repeat_op - tf.transpose(repeat_op)))
        weights = repeat_op_sq / tf.to_float((N - 1) ** 2)
    
        pred_ = y_pred ** y_pow
        try:
            pred_norm = pred_ / (eps + tf.reshape(tf.reduce_sum(pred_, 1), [-1, 1]))
        except Exception:
            pred_norm = pred_ / (eps + tf.reshape(tf.reduce_sum(pred_, 1), [bsize, 1]))
    
        hist_rater_a = tf.reduce_sum(pred_norm, 0)
        hist_rater_b = tf.reduce_sum(y_true, 0)
    
        conf_mat = tf.matmul(tf.transpose(pred_norm), y_true)
    
        nom = tf.reduce_sum(weights * conf_mat)
        denom = tf.reduce_sum(weights * tf.matmul(
            tf.reshape(hist_rater_a, [N, 1]), tf.reshape(hist_rater_b, [1, N])) /
                              tf.to_float(bsize))
    
        return nom / (denom + eps)


In [17]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def Cmatrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = Cmatrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)


In [18]:
data_description_indices = sentences_to_indices(data_descriptions, vocab, word_to_index, maxLen)

In [19]:
#train valid split
np.random.seed(seed=6)
mask = np.random.randn(len(data_desc_df)) < 0.9
train_desc_df = data_desc_df[mask]
valid_desc_df = data_desc_df[~mask]

In [20]:
train_indices = data_description_indices[mask]
valid_indices = data_description_indices[~mask]

In [21]:
train_descriptions = data_descriptions[mask]
train_labels = train_desc_df['AdoptionSpeed']

valid_descriptions = data_descriptions[~mask]
valid_labels = valid_desc_df['AdoptionSpeed']


In [22]:
# Shorten train for faster learning - only on tune-up stage !!
#train_indices = train_indices[:3000]
#train_labels = train_labels[:3000]

In [23]:
# To one-hot representation
train_labels_oh = to_categorical(train_labels)
valid_labels_oh = to_categorical(valid_labels)


In [24]:
'''tr1 = train_indices[:1000]
tr1_labels_oh = train_labels_oh[:1000]'''

'tr1 = train_indices[:1000]\ntr1_labels_oh = train_labels_oh[:1000]'

In [25]:
#train_labels_oh[:5]

In [26]:
def model_evaluate (my_lr, my_epochs, my_batch_size):  
    mko_optimizer = optimizers.rmsprop(lr = my_lr) 
    #mko_optimizer = optimizers.Adam(lr = my_lr)
    #model.compile(loss='categorical_crossentropy', optimizer=mko_optimizer, metrics=['accuracy'])
    gru_model.compile(loss=kappa_loss, optimizer=mko_optimizer, metrics=['accuracy'])
    #model.fit(tr1, tr1_labels_oh, epochs = my_epochs, batch_size = my_batch_size, verbose = 0, shuffle=True)
    gru_model.fit(train_indices, train_labels_oh, epochs = my_epochs, batch_size = my_batch_size, verbose = 0, shuffle=True)
    #valid_pred = model.predict (valid_indices)
    v_df = pd.DataFrame(model.predict (valid_indices))
    v = v_df.values.argmax(axis=1)
    return quadratic_weighted_kappa(valid_labels, v)


In [27]:

'''for my_epochs in (3,5,10 ):
    for my_batch_size in (32,64):
        for my_lr in (0.0005, 0.0001):
            qwk = model_evaluate (my_lr, my_epochs, 64)
            print ("Epochs = ", str(my_epochs), " lr = ", str(my_lr), "batch_size = ", str(my_batch_size), " qwk = ", str(qwk))        '''

'for my_epochs in (3,5,10 ):\n    for my_batch_size in (32,64):\n        for my_lr in (0.0005, 0.0001):\n            qwk = model_evaluate (my_lr, my_epochs, 64)\n            print ("Epochs = ", str(my_epochs), " lr = ", str(my_lr), "batch_size = ", str(my_batch_size), " qwk = ", str(qwk))        '

In [28]:
'''for my_epochs in (30,100 ):
    for my_batch_size in (16,32,64):
        for my_lr in (0.0005, 0.0001):
            qwk = model_evaluate (my_lr, my_epochs, 64)
            print ("Epochs = ", str(my_epochs), " lr = ", str(my_lr), "batch_size = ", str(my_batch_size), " qwk = ", str(qwk))        '''

'for my_epochs in (30,100 ):\n    for my_batch_size in (16,32,64):\n        for my_lr in (0.0005, 0.0001):\n            qwk = model_evaluate (my_lr, my_epochs, 64)\n            print ("Epochs = ", str(my_epochs), " lr = ", str(my_lr), "batch_size = ", str(my_batch_size), " qwk = ", str(qwk))        '

In [29]:
'''mko_optimizer = optimizers.rmsprop(lr = 0.0005)
#model.compile(loss='categorical_crossentropy', optimizer=mko_optimizer, metrics=['accuracy'])
model.compile(loss=kappa_loss, optimizer=mko_optimizer, metrics=['accuracy'])
model.fit(train_indices, train_labels_oh, epochs = 100, batch_size = 16, shuffle=True)'''

"mko_optimizer = optimizers.rmsprop(lr = 0.0005)\n#model.compile(loss='categorical_crossentropy', optimizer=mko_optimizer, metrics=['accuracy'])\nmodel.compile(loss=kappa_loss, optimizer=mko_optimizer, metrics=['accuracy'])\nmodel.fit(train_indices, train_labels_oh, epochs = 100, batch_size = 16, shuffle=True)"

In [30]:
'''v_df = pd.DataFrame(model.predict (valid_indices))
v = v_df.values.argmax(axis=1)
print (quadratic_weighted_kappa(valid_labels, v))'''

'v_df = pd.DataFrame(model.predict (valid_indices))\nv = v_df.values.argmax(axis=1)\nprint (quadratic_weighted_kappa(valid_labels, v))'

In [31]:
N_epochs_GRU = 100
Batch_size_GRU = 16
Lr_GRU = 0.0001

In [32]:
data_labels_oh = to_categorical (data_desc_df['AdoptionSpeed'])

In [33]:
#Create the model on the whole text corpus with the best fine-tuned parameters
mko_optimizer = optimizers.rmsprop(lr = Lr_GRU)
#model.compile(loss='categorical_crossentropy', optimizer=mko_optimizer, metrics=['accuracy'])
gru_model.compile(loss=kappa_loss, optimizer=mko_optimizer, metrics=['accuracy'])
gru_model.fit(data_description_indices, data_labels_oh, epochs = N_epochs_GRU, batch_size = Batch_size_GRU, shuffle=True)


Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

In [34]:
test_desc_df = test_df[["Description"]]
test_desc_df = test_desc_df[test_desc_df['Description'] != ""]
test_descriptions = test_desc_df['Description'].apply(lambda x: re.sub(r"[^a-z0-9 ]+", "", str(x).lower()))


In [35]:
test_description_indices = sentences_to_indices(test_descriptions, vocab, word_to_index, maxLen)


In [36]:
pred = gru_model.predict(test_description_indices)

In [37]:
pred[6:18]

array([[0.14884758, 0.14884758, 0.14884758, 0.14884758, 0.40460968],
       [0.14884758, 0.14884758, 0.14884758, 0.14884758, 0.40460968],
       [0.14884806, 0.14884819, 0.14884844, 0.14884937, 0.4046059 ],
       [0.40444732, 0.14889392, 0.14890654, 0.14888377, 0.14886841],
       [0.14884764, 0.14884767, 0.1488477 , 0.14884777, 0.4046092 ],
       [0.40427053, 0.14901376, 0.14891756, 0.14889446, 0.14890371],
       [0.14892071, 0.14895102, 0.14897238, 0.14910308, 0.40405288],
       [0.14884764, 0.14884768, 0.1488477 , 0.14884783, 0.40460908],
       [0.14893366, 0.14897655, 0.14899762, 0.14906839, 0.40402383],
       [0.1488476 , 0.1488476 , 0.1488476 , 0.1488476 , 0.40460962],
       [0.14884758, 0.14884758, 0.14884758, 0.14884758, 0.40460968],
       [0.14884764, 0.14884768, 0.14884776, 0.14884767, 0.40460923]],
      dtype=float32)

In [38]:
## Submission
sample_submission_df = pd.read_csv('../input/petfinder-adoption-prediction/test/sample_submission.csv')
print(sample_submission_df.shape)
sample_submission_df.head()

(3948, 2)


Unnamed: 0,PetID,AdoptionSpeed
0,378fcc4fc,0
1,73c10e136,0
2,72000c4c5,0
3,e147a4b9f,0
4,43fbba852,0


In [39]:
p = pd.DataFrame(pred).values.argmax(axis=1)
#submission = np.concatenate(sample_submission_df, pd.DataFrame(pred), axis=1 )
p[:10]

array([0, 4, 4, 4, 4, 4, 4, 4, 4, 0])

In [40]:
sample_submission_df['AdoptionSpeed'] = p
#submission_df.drop(columns=[0,1,2,3,4], inplace=True)
print(sample_submission_df.shape)
sample_submission_df.head()

(3948, 2)


Unnamed: 0,PetID,AdoptionSpeed
0,378fcc4fc,0
1,73c10e136,4
2,72000c4c5,4
3,e147a4b9f,4
4,43fbba852,4


In [41]:
sample_submission_df['AdoptionSpeed'] = sample_submission_df['AdoptionSpeed'].fillna(3)
sample_submission_df['AdoptionSpeed'] = sample_submission_df['AdoptionSpeed'].astype(np.int32)


In [42]:
sample_submission_df.to_csv('submission.csv',index=False)