In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import keras
from keras.preprocessing import image
from keras.layers import Input, Subtract, Lambda, BatchNormalization, Dense, Flatten, Conv2D, MaxPooling2D, GlobalMaxPooling2D, Dropout
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import Model, Sequential

from keras.applications.resnet50 import ResNet50

from sklearn.neighbors import KNeighborsClassifier

import cv2

import itertools

import bcolz

import pdb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path = 'D:/msc/kaggle_data/whales/'
train_path = path + 'train/'
test_path = path+'test/'
validation_path = path+'validation/'

In [3]:
labels = pd.read_csv(path+'train.csv')

# Compute loss

In [None]:
# Mean average precision at 5 (MAP@5); assumes the predictions within each row are all different.
def map_loss(y_true, preds):
    """Return the mean average precision over the first (up to) five predictions.

    Parameters
    ----------
    y_true : array-like, shape (n,)
        True label of every sample.
    preds : array-like, shape (n, k)
        Ranked predictions per sample, best first. Only the first five
        columns are scored; fewer than five columns are accepted.

    Returns
    -------
    float
        Mean over samples of 1/rank of the correct prediction (0 when the
        true label is not among the scored columns).
    """
    y_true = np.asarray(y_true)
    preds = np.asarray(preds)
    scores = np.zeros(len(y_true))
    # A hit in column `rank` (0-based) is worth 1/(rank+1).
    for rank in range(min(5, preds.shape[1])):
        scores += ((y_true == preds[:, rank]).astype(np.int64))/(rank+1)
    return scores.mean()

# Make the correct directory structure

In [None]:
folders = list(labels['Id'].unique())

In [None]:
%cd $train_path

In [None]:
for fd in folders:
    %mkdir $fd

In [None]:
for i in range(len(labels)):
    os.rename(labels['Image'][i], labels['Id'][i]+'/'+labels['Image'][i])

In [None]:
%cd $test_path

In [None]:
for fd in folders:
    %mkdir $fd

## Validation

In [None]:
%cd $path

In [None]:
%mkdir validation

In [None]:
%cd $train_path

In [None]:
from glob import glob
# Move a quarter (len(fls)//4) of the .jpg files of every whale folder out of
# the training tree. The permutation is unseeded, so the split differs per run.
for fd in folders:
    %cd $fd
    fls = glob('*.jpg')
    shuf = np.random.permutation(fls)
    # NOTE(review): no path separator is inserted between `fd` and the file name, so files
    # land directly in validation_path as "<folder><file>". The later cells of this section
    # split them back into folders with f[:9], which presumably relies on every folder name
    # being exactly 9 characters long -- confirm.
    for i in range(len(fls)//4): os.rename(shuf[i], validation_path+fd+shuf[i])
    %cd $train_path

In [None]:
%cd $validation_path

In [None]:
fls = glob('*.jpg')

In [None]:
for fd in folders:
    %mkdir $fd

In [None]:
for f in fls:
    os.rename(f, f[:9] + "/" + f[9:])

In [None]:
%pwd

In [None]:
os.rename('new_whale007c3603.jpg', '007c3603.jpg')

# Base model

In [None]:
labels['Id'].value_counts()[:5]

In [None]:
commons = np.array(['new_whale', 'w_1287fbc', 'w_98baff9', 'w_7554f44', 'w_1eafe46']).reshape(1,-1)

In [None]:
# One row of default predictions per submitted image: the row count must match `ids`
# (it is concatenated with ids.reshape(-1,1) below), not the number of training labels.
base_preds = np.repeat(commons, len(ids), axis=0)

In [None]:
base_subm = np.concatenate([ids.reshape(-1,1), base_preds],axis=1)

In [None]:
np.savetxt('base_submission.csv', base_subm, fmt = '%s, %s %s %s %s %s', header = 'Image,Id', comments = '')

## Just new whale

In [None]:
commons2 = np.array(['new_whale']).reshape(1,-1)

In [None]:
base2_preds = np.repeat(commons2, len(ids), axis=0)

In [None]:
base2_preds.shape

In [None]:
ids.shape

In [None]:
base2_subm = np.concatenate([ids.reshape(-1,1), base2_preds],axis=1)

In [None]:
np.savetxt('base2_submission.csv', base2_subm, fmt = '%s, %s', header = 'Image,Id', comments = '')

# Look at data

In [None]:
labels['Id'].value_counts()[:4].plot(kind='bar')

In [None]:
img=cv2.imread(train_path+labels['Image'][17], cv2.IMREAD_GRAYSCALE)

In [None]:
train_path

In [None]:
plt.imshow(img,cmap='gray')

In [None]:
labels['Id'].value_counts()[0]/len(labels)

# Augmented Data Generator

In [None]:
batch_size = 10

In [None]:
gen = image.ImageDataGenerator(rotation_range=10, width_shift_range=0.1, 
       height_shift_range=0.1, shear_range=0.15, zoom_range=0.1, horizontal_flip=True)

In [None]:
aug_img_iter = gen.flow_from_directory(train_path, target_size = (128,128), batch_size = batch_size, color_mode = 'grayscale')

# Create a simple CNN

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, BatchNormalization
from keras.optimizers import Adam

## Set up the architecture

In [None]:
num_ids = len(labels['Id'].unique())

In [None]:
model = Sequential()

model.add(BatchNormalization(axis=1,input_shape = (128,128,1)))

#L1
model.add(Conv2D(64, kernel_size = (3,3), activation= 'relu',padding='same'))
model.add(BatchNormalization(axis=1))
model.add(MaxPooling2D())
#model.add(Dropout(0.2))

#L2
model.add(Conv2D(128, kernel_size = (3,3), activation= 'relu',padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D())
#model.add(Dropout(0.2))

#L3
model.add(Conv2D(128, kernel_size = (3,3), activation= 'relu',padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D())
#model.add(Dropout(0.2))

#L4
model.add(Conv2D(64, kernel_size = (3,3), activation= 'relu',padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D())
#model.add(Dropout(0.2))

model.add(Flatten())

#L5
model.add(Dense(128, activation = 'relu'))
model.add(BatchNormalization())
#model.add(Dropout(0.2))

#L6
model.add(Dense(num_ids, activation = 'softmax'))


In [None]:
model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])

In [None]:
model.summary()

## Fit model

In [None]:
# Update the optimizer's learning-rate variable in place rather than rebinding the
# attribute to a plain Python float.
keras.backend.set_value(model.optimizer.lr, .01)

In [None]:
model.fit_generator(aug_img_iter, steps_per_epoch=aug_img_iter.n/batch_size, epochs=2)

## Add pseudo-labeling

In [None]:
class MixIterator(object):
    """Combine several batch iterators into one by concatenating their batches.

    Parameters
    ----------
    iters : list of iterators, or a single iterator
        Each iterator must expose an integer attribute `n`, a `reset()` method
        and yield `(x, y)` tuples of arrays from `next()`.

    Attributes
    ----------
    iters : list
        The wrapped iterators (a single iterator is stored as a one-item list).
    multi : bool
        True when a list was passed in.
    n : int
        Sum of the `n` attributes of the wrapped iterators.
    """
    def __init__(self, iters):
        self.multi = type(iters) is list
        # Wrap a lone iterator so reset()/__next__ can treat both cases alike.
        self.iters = iters if self.multi else [iters]
        self.n = sum([it.n for it in self.iters])

    def reset(self):
        """Reset every wrapped iterator."""
        for it in self.iters: it.reset()
    def __iter__(self):
        return self
    def __next__(self):
        """Draw one batch from each iterator and concatenate inputs and labels along axis 0."""
        nexts = [next(it) for it in self.iters]
        n0 = np.concatenate([n[0] for n in nexts])
        n1 = np.concatenate([n[1] for n in nexts])
        return (n0, n1)

In [None]:
pl_gen = image.ImageDataGenerator()
pl_iter = pl_gen.flow_from_directory(test_path, target_size = (128,128), batch_size = batch_size//3, color_mode = 'grayscale')

In [None]:
combined_iter = MixIterator([aug_img_iter, pl_iter])

In [None]:
model.fit_generator(combined_iter, steps_per_epoch=combined_iter.n/batch_size, epochs=2)

# Train embedding as in "Learning a Similarity Metric Discriminatively, with Application to Face Verification"

## Make generator which gives pairs of images

### Make dictionaries for (filename <-> idx file), (idx file -> whale id); known whales come after new ones 

In [4]:
dirs = sorted(os.listdir(train_path))

#First index of known whale: every file of the first folder (dirs[0]) comes before it
first_kw_idx = len(os.listdir(train_path + dirs[0]))

start_index = 0
fn2ind = dict()   # file name -> running image index
ind2dir = dict()  # image index -> folder (whale id) holding the image

for dr in dirs:
    # os.listdir returns entries in arbitrary order; sorting makes the image
    # indices reproducible from run to run.
    files = sorted(os.listdir(train_path + dr))
    fn2ind.update(dict(zip(files, range(start_index, start_index + len(files)))))
    ind2dir.update(dict(zip(range(start_index, start_index + len(files)), [dr]*len(files))))
    start_index+=len(files)

# image index -> file name (inverse of fn2ind)
ind2fn = {b : a for a,b in fn2ind.items()}

num_whales = len(ind2fn)

### Make three sets of pairs of indices; one same-whale pairs, one different whale pairs, one new-whale/some-whale pair

In [5]:
# Pairs (new-whale image, known-whale image): always impostor pairs.
nw_kw_pairs = itertools.product(range(first_kw_idx), range(first_kw_idx, num_whales))
nw_kw_list = list(nw_kw_pairs)

# Every ordered pair of known-whale images (includes each image paired with itself).
kw_kw_pairs = itertools.product(range(first_kw_idx, num_whales), range(first_kw_idx, num_whales))
kw_kw_list = list(kw_kw_pairs)

# Split by whether both images live in the same folder: genuine (same whale) vs impostor.
kw_kw_gen, kw_kw_imp = [], []
for s in kw_kw_list:
    same_whale = ind2dir[s[0]] == ind2dir[s[1]]
    if same_whale:
        kw_kw_gen.append(s)
    else:
        kw_kw_imp.append(s)

### Do the above for validation

In [6]:
val_dirs = sorted(os.listdir(validation_path))
# First index of a known whale: every file of the first folder comes before it.
val_first_kw_idx = len(os.listdir(validation_path + val_dirs[0]))
val_start_index = 0
val_fn2ind = dict()   # file name -> running validation image index
val_ind2dir = dict()  # validation image index -> folder (whale id)
# Walk the validation folders themselves (val_dirs), not the training `dirs`, so the
# indices stay consistent with val_first_kw_idx and no missing folder is listed.
for dr in val_dirs:
    # Sorted because os.listdir order is arbitrary; keeps indices reproducible.
    files = sorted(os.listdir(validation_path + dr))
    val_fn2ind.update(dict(zip(files, range(val_start_index, val_start_index + len(files)))))
    val_ind2dir.update(dict(zip(range(val_start_index, val_start_index + len(files)), [dr]*len(files))))
    val_start_index+=len(files)
# validation image index -> file name (inverse of val_fn2ind)
val_ind2fn = {b : a for a,b in val_fn2ind.items()}
val_num_whales = len(val_ind2fn)

In [7]:
# Pairs (new-whale image, known-whale image) in the validation set: always impostors.
val_nw_kw_pairs = itertools.product(range(val_first_kw_idx), range(val_first_kw_idx, val_num_whales))
val_nw_kw_list = list(val_nw_kw_pairs)
# Every ordered pair of known-whale validation images.
val_kw_kw_pairs = itertools.product(range(val_first_kw_idx, val_num_whales), range(val_first_kw_idx, val_num_whales))
val_kw_kw_list = list(val_kw_kw_pairs)
# Same folder -> genuine pair, different folders -> impostor pair.
val_kw_kw_gen, val_kw_kw_imp = [], []
for s in val_kw_kw_list:
    same_whale = val_ind2dir[s[0]] == val_ind2dir[s[1]]
    if same_whale:
        val_kw_kw_gen.append(s)
    else:
        val_kw_kw_imp.append(s)

### Make a generator which takes a filename array and list of index pairs and produces batches

In [8]:
from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
from tensorflow.python.keras._impl.keras.preprocessing.image import *
class PairsIterator(Iterator):
    """Iterator yielding batches of image pairs that all carry the same label.

    All images referenced by `ind2dir` are kept in memory in `self.images`
    (loaded from disk here unless a preloaded `images` array is supplied).
    Each batch looks up both images of every selected pair, passes them through
    `image_data_generator.random_transform` and `.standardize`, and returns
    `([batch_x0, batch_x1], batch_y)` where `batch_y` repeats `label`.

    `interpolation` is stored on the instance but not used by this class.
    """
    def __init__(self, 
                 image_data_generator, # data augmentation
                 pairs, #Set of pairs of indices
                 path, # Path to where class folders are located
                 ind2dir, # dictionary of directories corresponding to indices 
                 ind2fn, # dictionary of filenames corresponding to indices
                 label, # Label to attach: 0 for same entity, 1 for different entities 
                 batch_size = 16,
                 target_size=(256, 256),
                 shuffle = True,
                 seed=None,
                 data_format=None,
                 color_mode='grayscale',
                 interpolation='nearest',
                 images=None):
        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)
        self.pairs=pairs
        self.path=path
        self.ind2dir = ind2dir
        self.ind2fn = ind2fn
        self.label = label
        self.interpolation = interpolation
        if data_format is None:
            data_format = K.image_data_format()
        self.data_format = data_format
        # One channel for grayscale, three otherwise; channel axis position follows data_format.
        is_grayscale = (color_mode=='grayscale')
        channels = (1,) if is_grayscale else (3,)
        if self.data_format == 'channels_last':
            self.image_shape = self.target_size + channels
        else:
            self.image_shape = channels + self.target_size
        if(images is None):
            self.images = np.zeros( (len(self.ind2dir),) + self.image_shape)
            for ind in list(ind2dir.keys()):
                # Load with the channel count that image_shape was built for; a fixed
                # grayscale=False would produce 3-channel arrays that do not fit the
                # 1-channel buffer of the default color_mode='grayscale'.
                img = load_img(os.path.join(self.path, self.ind2dir[ind], self.ind2fn[ind]),
                             grayscale=is_grayscale,
                             target_size=self.target_size)
                x = img_to_array(img, data_format=self.data_format)
                self.images[ind,:,:,:] = x
        else:
            self.images = images
        super(PairsIterator, self).__init__(len(self.pairs), batch_size, shuffle,seed)
    def reset(self):
        # NOTE(review): which attribute the base Iterator reads on reset is not visible here -- confirm.
        self.current_index = 0
    def __iter__(self):
        return self
    def _get_batches_of_transformed_samples(self, index_array):
        """Build one batch from `index_array`.

        NOTE(review): index_array[0] is treated as the sequence of positions into
        `self.pairs`; this assumes the base class's index generator yields a tuple
        whose first element is that sequence -- confirm against the installed Keras.
        """
        batch_x0 = np.zeros((len(index_array[0]),) + self.image_shape, dtype=K.floatx())
        batch_x1 = np.zeros((len(index_array[0]),) + self.image_shape, dtype=K.floatx())
        # build batch of image data
        for i, j in enumerate(index_array[0]):
            ind0, ind1 = self.pairs[j]           
            batch_x0[i] = self.image_data_generator.standardize(self.image_data_generator.random_transform(self.images[ind0,:,:,:]))
            batch_x1[i] = self.image_data_generator.standardize(self.image_data_generator.random_transform(self.images[ind1,:,:,:]))
        # build batch of labels
        batch_y = np.repeat(self.label, len(batch_x1), axis=0)
        return [batch_x0, batch_x1], batch_y

    def __next__(self):
        with self.lock:
            index_array = next(self.index_generator)
        # The transformation of images is not under thread lock
        # so it can be done in parallel
        return self._get_batches_of_transformed_samples(index_array)

In [9]:
from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
from tensorflow.python.keras._impl.keras.preprocessing.image import *
class PairsIteratorFromArray(Iterator):
    """Iterator yielding batches of row pairs taken from a precomputed array.

    For every selected pair `(i, j)` in `pairs` the batch holds `arr[i]` and
    `arr[j]`; all pairs of an instance share the same `label`. Batches are
    returned as `([batch_x0, batch_x1], batch_y)`.

    `interpolation` is stored on the instance but not used by this class.
    """
    def __init__(self, 
                 pairs, #Set of pairs of indices
                 arr, # Array whose rows are looked up by the pair indices
                 label, # Label to attach: 0 for same entity, 1 for different entities 
                 batch_size = 16,
                 shuffle = True,
                 seed=None,
                 interpolation='nearest'
                ):
        self.pairs = pairs
        self.arr = arr
        self.label = label
        self.interpolation = interpolation
        super(PairsIteratorFromArray, self).__init__(len(self.pairs), batch_size, shuffle, seed)

    def reset(self):
        self.current_index = 0

    def __iter__(self):
        return self

    def _get_batches_of_transformed_samples(self, index_array):
        """Assemble the two input arrays and the label vector for one batch."""
        # NOTE(review): index_array[0] is assumed to be the sequence of positions
        # into self.pairs -- confirm against the installed Keras Iterator.
        pair_positions = index_array[0]
        batch_shape = (len(pair_positions),) + self.arr.shape[1:]
        left = np.zeros(batch_shape)
        right = np.zeros(batch_shape)
        for row, position in enumerate(pair_positions):
            first, second = self.pairs[position]
            left[row] = self.arr[first]
            right[row] = self.arr[second]
        # One copy of the shared label per pair in the batch.
        targets = np.repeat(self.label, len(right), axis=0)
        return [left, right], targets

    def __next__(self):
        # Only drawing the indices happens under the lock; the batch itself is
        # assembled outside of it.
        with self.lock:
            drawn = next(self.index_generator)
        return self._get_batches_of_transformed_samples(drawn)

In [10]:
class MixIterator(object):
    """Combine several pair-batch iterators by concatenating their batches.

    This cell redefines the `MixIterator` used for pseudo-labeling earlier in
    the notebook: here every wrapped iterator yields `([x0, x1], y)`, and the
    two inputs are concatenated separately.

    Parameters
    ----------
    iters : list of iterators, or a single iterator
        Each iterator must expose an integer attribute `n`, a `reset()` method
        and yield `([x0, x1], y)` from `next()`.

    Attributes
    ----------
    iters : list
        The wrapped iterators (a single iterator is stored as a one-item list).
    multi : bool
        True when a list was passed in.
    n : int
        Sum of the `n` attributes of the wrapped iterators.
    """
    def __init__(self, iters):
        self.multi = type(iters) is list
        # Wrap a lone iterator so reset()/__next__ can treat both cases alike.
        self.iters = iters if self.multi else [iters]
        self.n = sum([it.n for it in self.iters])

    def reset(self):
        """Reset every wrapped iterator."""
        for it in self.iters: it.reset()
    def __iter__(self):
        return self
    def __next__(self):
        """Draw one batch from each iterator; concatenate first inputs, second inputs and labels."""
        nexts = [next(it) for it in self.iters]
        n0 = [np.concatenate([n[0][0] for n in nexts]), np.concatenate([n[0][1] for n in nexts])]
        n1 = np.concatenate([n[1] for n in nexts])
        return n0, n1

## Make model

### Mix the final generator in appropriate proportions

In [11]:
target_size = (128,128)
# The pair iterators below are built with batch_size//2 and batch_size//4, so
# batch_size has to be a positive multiple of 4; with batch_size = 1 every one
# of them would be given a batch size of 0.
batch_size = 4
assert batch_size > 0 and batch_size % 4 == 0, "batch_size must be a positive multiple of 4"
data_format="channels_last"
embedding_size=128

In [12]:
#Make a numpy array with all the training images

if(data_format == "channels_first"):
    images = np.zeros( (len(ind2dir),) + (3,) + target_size)
else:
    images = np.zeros( (len(ind2dir),) + target_size + (3,))
for ind in ind2dir.keys():
    img = load_img(os.path.join(train_path, ind2dir[ind], ind2fn[ind]),
                 grayscale=False,
                 target_size=target_size)
    x = img_to_array(img, data_format=data_format)
    images[ind,:,:,:] = x

In [13]:
gen2 = ImageDataGenerator()
genuine_pairs_iterator = PairsIterator(gen2, kw_kw_gen, train_path, ind2dir, ind2fn, 0,target_size=target_size, batch_size = batch_size//2, color_mode="rgb", data_format=data_format, images=images)
k_impostor_pairs_iterator = PairsIterator(gen2, kw_kw_imp, train_path, ind2dir, ind2fn, 1,target_size=target_size, batch_size = batch_size//4, color_mode="rgb", data_format=data_format, images=images)
n_impostor_pairs_iterator = PairsIterator(gen2, nw_kw_list, train_path, ind2dir, ind2fn, 1,target_size=target_size, batch_size = batch_size//4, color_mode="rgb", data_format=data_format, images=images)

In [14]:
combined_iter = MixIterator([genuine_pairs_iterator, k_impostor_pairs_iterator, n_impostor_pairs_iterator])

#### Validation

In [15]:
#Make a numpy array with all the validation images

if(data_format == "channels_first"):
    val_images = np.zeros( (len(val_ind2dir),) + (3,) + target_size)
else:
    val_images = np.zeros( (len(val_ind2dir),) + target_size + (3,))
for ind in val_ind2dir.keys():
    img = load_img(os.path.join(validation_path, val_ind2dir[ind], val_ind2fn[ind]),
                 grayscale=False,
                 target_size=target_size)
    x = img_to_array(img, data_format=data_format)
    val_images[ind,:,:,:] = x

In [16]:
val_gen2 = ImageDataGenerator()

val_genuine_pairs_iterator = PairsIterator(val_gen2, val_kw_kw_gen, validation_path, val_ind2dir, val_ind2fn, 0,target_size=target_size, batch_size = batch_size//2, color_mode="rgb", data_format=data_format, images=val_images)
val_k_impostor_pairs_iterator = PairsIterator(val_gen2, val_kw_kw_imp, validation_path, val_ind2dir, val_ind2fn, 1,target_size=target_size, batch_size = batch_size//4, color_mode="rgb", data_format=data_format, images=val_images)
val_n_impostor_pairs_iterator = PairsIterator(val_gen2, val_nw_kw_list, validation_path, val_ind2dir, val_ind2fn, 1,target_size=target_size, batch_size = batch_size//4, color_mode="rgb", data_format=data_format, images=val_images)

In [17]:
val_combined_iter = MixIterator([val_genuine_pairs_iterator, val_k_impostor_pairs_iterator, val_n_impostor_pairs_iterator])

### Make the Embedding model

#### Basic

In [18]:
def get_emedding_model(input_shape = (128,128,1), embedding_size = 128):
    """Build the basic convolutional embedding network.

    Architecture: input batch-norm, four blocks of Conv2D(3x3, relu, same)
    -> BatchNormalization -> MaxPooling2D with 64/128/128/64 filters, then
    Flatten, Dense(128, relu), BatchNormalization and a final
    Dense(embedding_size, relu).

    Returns the (uncompiled) Sequential model.
    """
    # (filters, BatchNormalization kwargs) for blocks L1-L4. Only L1 passes axis=1.
    # NOTE(review): for channels_last inputs axis=1 is not the channel axis -- confirm this is intended.
    conv_blocks = [(64, {'axis': 1}), (128, {}), (128, {}), (64, {})]

    layer_stack = [BatchNormalization(axis=1, input_shape=input_shape)]
    for filters, bn_kwargs in conv_blocks:
        layer_stack.append(Conv2D(filters, kernel_size=(3,3), activation='relu', padding='same'))
        layer_stack.append(BatchNormalization(**bn_kwargs))
        layer_stack.append(MaxPooling2D())

    layer_stack.append(Flatten())

    #L5: fully connected layer
    layer_stack.append(Dense(128, activation='relu'))
    layer_stack.append(BatchNormalization())

    #L6: embedding output
    layer_stack.append(Dense(embedding_size, activation='relu'))

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)
    return(model)

In [None]:
model = get_emedding_model(input_shape = target_size+(1,))

#### VGG16-based

In [None]:
import importlib
import vgg16bn; importlib.reload(vgg16bn)
from vgg16bn import Vgg16BN

In [None]:
def get_vgg_emedding_model(input_shape = (128,128), embedding_size = embedding_size):
    """Build an embedding network on top of a frozen Vgg16BN model.

    All layers of `Vgg16BN(input_shape).model` are marked non-trainable, then a
    head of Flatten, two Dense(1024, relu) -> BatchNormalization -> Dropout(0.7)
    groups and a Dense(embedding_size, relu) output is appended to that model.

    The default `embedding_size` is the value of the module-level
    `embedding_size` at the time this function is defined.
    """
    vgg = Vgg16BN(input_shape).model
    for frozen in vgg.layers:
        frozen.trainable = False

    head = [Flatten()]
    for _ in range(2):
        head.extend([Dense(1024, activation='relu'), BatchNormalization(), Dropout(0.7)])
    head.append(Dense(embedding_size, activation='relu'))

    for layer in head:
        vgg.add(layer)
    return(vgg)

In [None]:
model = get_vgg_emedding_model(input_shape = target_size)

#### Resnet50 based

In [19]:
def get_resnet50_emedding_model(input_shape = (128,128), embedding_size = embedding_size):
    """Build an embedding network on top of a frozen ResNet50 (without its top).

    The ResNet50 output goes through GlobalMaxPooling2D, Dropout(0.5) and a
    linear Dense(embedding_size) layer.

    `input_shape` is accepted but not used by the body: ResNet50 is created
    without an explicit input shape.
    """
    backbone = ResNet50(include_top=False)
    for frozen in backbone.layers:
        frozen.trainable = False
    pooled = GlobalMaxPooling2D()(backbone.output)
    regularized = Dropout(0.5)(pooled)
    embedding = Dense(embedding_size)(regularized)
    return(Model(backbone.input, embedding))

In [20]:
model_resnet = get_resnet50_emedding_model(input_shape=target_size)

#### Split VGG16 model after convolution

In [None]:
layers = model.layers
last_conv_idx = [index for index,layer in enumerate(layers) 
                     if type(layer) is Conv2D][-1]

In [None]:
conv_layers = layers[:last_conv_idx+1]
conv_model = Sequential(conv_layers)
# Dense layers - also known as fully connected or 'FC' layers
fc_layers = layers[last_conv_idx+1:]

In [None]:
trn_gen = image.ImageDataGenerator(data_format="channels_first")
# shuffle=False: the rows of conv_features are later looked up by image index
# (pair iterators, conv_features[first_kw_idx:]), so the images must not be
# visited in random order.
# NOTE(review): this additionally assumes the iterator's file order matches the
# order used to build ind2dir/ind2fn -- confirm.
trn_iter = trn_gen.flow_from_directory(train_path, target_size = target_size, batch_size = batch_size, color_mode="rgb", shuffle=False)
conv_features = conv_model.predict_generator(trn_iter)

# Persist the precomputed features to disk.
c = bcolz.carray(conv_features, rootdir=path+"train_conv_layer_features.bc", mode='w')
c.flush()

In [None]:
val_gen = image.ImageDataGenerator(data_format="channels_first")
# shuffle=False: the rows of conv_features_val are later looked up by validation
# image index and compared against val_iter.classes, so the images must not be
# visited in random order.
val_iter = val_gen.flow_from_directory(validation_path, target_size = target_size, batch_size = batch_size, color_mode="rgb", shuffle=False)
conv_features_val = conv_model.predict_generator(val_iter)
# Persist the precomputed features to disk.
c = bcolz.carray(conv_features_val, rootdir=path+"val_conv_layer_features.bc", mode='w')
c.flush()

In [None]:
def get_fc_model():
    """Build the dense head that runs on precomputed convolutional features.

    The input shape is the output shape of the last layer in the module-level
    `conv_layers`. After construction, weights are copied layer by layer from
    the module-level `fc_layers` (paired in order via zip).
    """
    head_layers = [MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]), Flatten()]
    for _ in range(2):
        head_layers += [Dense(1024, activation='relu'), BatchNormalization(), Dropout(0.8)]
    head_layers.append(Dense(embedding_size, activation='relu'))
    model2 = Sequential(head_layers)

    # Copy the weights of the corresponding layers of the full model.
    for target, source in zip(model2.layers, fc_layers):
        target.set_weights(source.get_weights())
    return model2

In [None]:
fc_model = get_fc_model()

#### Mix new generators

In [None]:
genuine_pairs_iterator_pcc = PairsIteratorFromArray(kw_kw_gen, conv_features, 0, batch_size = batch_size//2)
k_impostor_pairs_iterator_pcc = PairsIteratorFromArray(kw_kw_imp, conv_features, 1, batch_size = batch_size//4)
n_impostor_pairs_iterator_pcc = PairsIteratorFromArray(nw_kw_list, conv_features, 1, batch_size = batch_size//4)

combined_iter_pcc = MixIterator([genuine_pairs_iterator_pcc, k_impostor_pairs_iterator_pcc, n_impostor_pairs_iterator_pcc])

In [None]:
val_genuine_pairs_iterator_pcc = PairsIteratorFromArray(val_kw_kw_gen, conv_features_val, 0, batch_size = batch_size//2)
val_k_impostor_pairs_iterator_pcc = PairsIteratorFromArray(val_kw_kw_imp, conv_features_val, 1, batch_size = batch_size//4)
val_n_impostor_pairs_iterator_pcc = PairsIteratorFromArray(val_nw_kw_list, conv_features_val, 1, batch_size = batch_size//4)

val_combined_iter_pcc = MixIterator([val_genuine_pairs_iterator_pcc, val_k_impostor_pairs_iterator_pcc, val_n_impostor_pairs_iterator_pcc])

### Make the Siamese model which outputs energy

In [21]:
def make_siamese_model(embedding_model):
    """Wrap `embedding_model` into a two-input Siamese model that outputs an energy.

    Both inputs take the input shape of the embedding model's first layer and
    are passed through the same `embedding_model`. The output is the L1
    distance between the two embeddings: sum over axis 1 of the absolute
    difference, with that axis kept.
    """
    branch_shape = embedding_model.layers[0].input_shape[1:]
    inputs = [Input(shape=branch_shape), Input(shape=branch_shape)]
    embeddings = [embedding_model(branch_input) for branch_input in inputs]
    difference = Subtract()(embeddings)
    # L1 norm of the embedding difference
    energy = Lambda(lambda x: K.sum(K.abs(x), axis=1, keepdims=True))(difference)
    return(Model(inputs = inputs, outputs=energy))

In [22]:
#s_model = make_siamese_model(model)

In [23]:
s_model_rn = make_siamese_model(model_resnet)

In [None]:
#s_model_pcc = make_siamese_model(fc_model)

### Define loss function for Siamese model & compile

In [24]:
# Constant Q of the loss below. NOTE(review): in the paper named in this section's heading Q is
# described as an upper bound of the energy -- confirm that 10 is adequate for this model.
Q = 10
def chopra_loss(y_true, y_pred):
    """Loss on the Siamese energy `y_pred` for labels 0 (same) / 1 (different).

    Pairs labelled 0 contribute 2/Q * E^2; pairs labelled 1 contribute
    2*Q * exp(-2.77/Q * E). The mean over all elements is returned.
    """
    genuine_term = (1.0-y_true)*2/Q*K.square(y_pred)
    impostor_term = y_true*2*Q*K.exp(-2.77/Q*y_pred)
    return(K.mean(genuine_term + impostor_term))

#Copied this out of an example to play with it.
def contrastive_loss(y, d):
    """ Contrastive loss from Hadsell-et-al.'06
        http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

        Uses the label convention of the pair iterators in this notebook:
        y = 0 for a pair of the same entity (distance d is pulled towards 0),
        y = 1 for different entities (d is pushed out to at least `margin`).
    """
    margin = 10
    return K.mean((1 - y) * K.square(d) + y * K.square(K.maximum(margin - d, 0)))

In [25]:
#s_model.compile(optimizer = 'Adam', loss = chopra_loss)

In [26]:
s_model_rn.compile(optimizer = 'Adam', loss = chopra_loss)

In [None]:
#s_model_pcc.compile(optimizer = 'Adam', loss = chopra_loss)

### Train Siamese model

In [None]:
from keras.callbacks import Callback
from lr_utils.keras_lr_finder import LRFinder
from lr_utils.keras_SGDR import SGDRScheduler
from lr_utils.keras_CLR import CyclicLR

In [None]:
lr_finder = LRFinder(min_lr=1e-3, max_lr=3e-1, steps_per_epoch=np.ceil(2*genuine_pairs_iterator.n//batch_size), epochs=1)

#### Full Model

In [None]:
# K.set_value(s_model.optimizer.lr,0.001)

In [None]:
#s_model.optimizer.lr=K.variable(1e-1)

In [None]:
#,callbacks=[ModelCheckpoint('ft_vgg_model_weights.hdf5', bsave_best_only=True)]
s_model.fit_generator(combined_iter, steps_per_epoch=np.ceil(2*genuine_pairs_iterator.n//batch_size),
                      validation_data=val_combined_iter, validation_steps=np.ceil(2*val_genuine_pairs_iterator.n//batch_size),
                      epochs=1,
                      callbacks=[lr_finder])

In [None]:
lr_finder.plot_loss()

In [None]:
clr_triangular = CyclicLR(mode='triangular', step_size=np.ceil(2*genuine_pairs_iterator.n//batch_size),base_lr=1e-3, max_lr=1e-2)

In [None]:
s_model.fit_generator(combined_iter, steps_per_epoch=np.ceil(2*genuine_pairs_iterator.n//batch_size),
                      validation_data=val_combined_iter, validation_steps=np.ceil(2*val_genuine_pairs_iterator.n//batch_size),
                      epochs=1,
                      callbacks=[clr_triangular, ModelCheckpoint(path+'ft_vgg_model_best_weights.hdf5', save_best_only=True)])

In [None]:
s_model.fit_generator(combined_iter, steps_per_epoch=np.ceil(2*genuine_pairs_iterator.n//batch_size),
                      validation_data=val_combined_iter, validation_steps=np.ceil(2*val_genuine_pairs_iterator.n//batch_size),
                      epochs=3,
                      callbacks=[clr_triangular, ModelCheckpoint(path+'ft_vgg_model_best_weights.hdf5', save_best_only=True)])

#### ResNet model

In [None]:
#K.set_value(s_model_rn.optimizer.lr,1e-3)

In [None]:
#s_model_rn.optimizer.lr=K.variable(1e-3)

In [None]:
s_model_rn.fit_generator(combined_iter, steps_per_epoch=np.ceil(2*genuine_pairs_iterator.n//batch_size), epochs=1)


Epoch 1/1


In [None]:
s_model_rn.fit_generator(combined_iter, steps_per_epoch=np.ceil(2*genuine_pairs_iterator.n//batch_size),
                      validation_data=val_combined_iter, validation_steps=np.ceil(2*val_genuine_pairs_iterator.n//batch_size),
                      epochs=1,
                      callbacks=[ModelCheckpoint(path+'ft_resnet_model_best_weights.hdf5', save_best_only=True)])

#### Convolution precomputed model

In [None]:
s_model_pcc.optimizer.lr=1e-1
s_model_pcc.fit_generator(combined_iter_pcc, steps_per_epoch=2*genuine_pairs_iterator_pcc.n//batch_size, epochs=3, validation_data=val_combined_iter_pcc, 
                      validation_steps=2*val_genuine_pairs_iterator_pcc.n//batch_size)

In [None]:
s_model_pcc.optimizer.lr=1e-3
s_model_pcc.fit_generator(combined_iter_pcc, steps_per_epoch=2*genuine_pairs_iterator_pcc.n//batch_size, epochs=3, validation_data=val_combined_iter_pcc, 
                      validation_steps=2*val_genuine_pairs_iterator_pcc.n//batch_size)

### Save weights

In [None]:
model.save_weights(path+'ft_vgg_model_weights.hdf5')

### Load weights

In [None]:
model.load_weights(path+'ft_vgg_model_weights.hdf5')

### How good is VGG16 model?

#### Make KNN for classifier for known whales

Something must be going wrong with KNN since it seems to be unhelpful despite val_loss being reasonably low and well behaved. Should I try to go through an example by hand? Draw plots? Instead of KNN, choose a representative of each whale?

In [None]:
def a(idx):
    """Load training image number `idx` as a channels_first array resized to `target_size`."""
    image_file = os.path.join(train_path, ind2dir[idx], ind2fn[idx])
    loaded = load_img(image_file, grayscale=False, target_size=target_size)
    return img_to_array(loaded, data_format="channels_first")
# All known-whale training images stacked along a new first axis.
arr = np.stack([a(idx) for idx in range(first_kw_idx,num_whales)])

In [None]:
X = model.predict(arr)

In [None]:
y = np.array([ind2dir[idx] for idx in range(first_kw_idx,num_whales)])

In [None]:
neigh = KNeighborsClassifier(n_neighbors=5, metric='manhattan')

In [None]:
neigh.fit(X,y)

#### Make means type classifier for known whales

In [None]:
def a(idx):
    """Load training image number `idx` as a channels_first array resized to `target_size`."""
    image_file = os.path.join(train_path, ind2dir[idx], ind2fn[idx])
    loaded = load_img(image_file, grayscale=False, target_size=target_size)
    return img_to_array(loaded, data_format="channels_first")
# All known-whale training images stacked along a new first axis, then embedded.
arr = np.stack([a(idx) for idx in range(first_kw_idx,num_whales)])
X = model.predict(arr)

In [None]:
# Mean embedding per known whale: rows of X are offset by first_kw_idx because
# X only holds the known-whale images.
dir2av_vec = {}
for dr in dirs[1:]:
    rows = [fn2ind[x]-first_kw_idx for x in os.listdir(train_path+dr)]
    dir2av_vec[dr] = np.mean(X[rows,:],axis=0)
def whale_dist(emb_vec):
    """Return the L1 distance from `emb_vec` to every known whale's mean embedding, in dirs[1:] order."""
    distances = []
    for dr in dirs[1:]:
        distances.append(np.sum(np.abs(dir2av_vec[dr] - emb_vec)))
    return(distances)

In [None]:
whale_dist(X[2,:])

In [None]:
(np.array(whale_dist(X[2,:])) < 2.09).sum()

#### Validation performance

In [None]:
val_gen = image.ImageDataGenerator(data_format="channels_first")
# shuffle=False: the predictions are compared row by row with val_iter.classes,
# which is only valid if the images are visited in the iterator's file order.
val_iter = val_gen.flow_from_directory(validation_path, target_size = target_size, batch_size = batch_size, color_mode="rgb", shuffle=False)
preds = model.predict_generator(val_iter)
sorted_names = np.sort(np.unique(y))

In [None]:
def get_preds(emb):
    """Return the five whale names the KNN classifier rates most probable for one embedding, best first."""
    probs = neigh.predict_proba(emb.reshape(1,-1)).reshape((-1,))
    idx = np.flip(np.argsort(probs)[-5:],axis=0)
    return(sorted_names[idx])
pred_names = np.apply_along_axis(get_preds,1,preds)
# val_iter.classes holds the class indices assigned by flow_from_directory, not
# image indices, so they are translated to folder names through
# val_iter.class_indices (ind2dir is keyed by image index and gives wrong names here).
class_idx2name = {idx: name for name, idx in val_iter.class_indices.items()}
getname = np.vectorize(lambda x: class_idx2name[x])
# Score: 1/(rank+1) for a correct name at position rank, averaged over all images.
rating = 0
for i in range(5):
    rating += 1/(i+1)*(pred_names[:,i] == getname(val_iter.classes)).astype(int)
rating.sum()/val_iter.n

In [None]:
0.0022123893805309734
0.002433628318584071

In [None]:
(val_iter.classes>0).sum()/val_iter.n

In [None]:
2*(1/1)/val_iter.n

##### Base

In [None]:
base_preds = np.repeat(np.array(['new_whale', 'w_1287fbc', 'w_98baff9', 'w_7554f44', 'w_1eafe46']).reshape(1,-1), val_iter.n, axis=0)
# True folder name of every validation image: val_iter.classes are class indices,
# translated through val_iter.class_indices.
class_idx2name = {idx: name for name, idx in val_iter.class_indices.items()}
true_names = np.vectorize(lambda x: class_idx2name[x])(val_iter.classes)
# Start from zero so the baseline score is not added on top of an earlier rating.
rating = 0
for i in range(5):
    rating += 1/(i+1)*(base_preds[:,i] == true_names).astype(int)
rating.sum()/val_iter.n

### How good is the VGG16_pcc model?

In [None]:
X = fc_model.predict(conv_features[first_kw_idx:])
y = np.array([ind2dir[idx] for idx in range(first_kw_idx,num_whales)])
neigh = KNeighborsClassifier(n_neighbors=5, metric='manhattan')
neigh.fit(X,y)

In [None]:
preds = fc_model.predict(conv_features_val)
sorted_names = np.sort(np.unique(y))
def get_preds(emb):
    """Return the five whale names the KNN classifier rates most probable for one embedding, best first."""
    probs = neigh.predict_proba(emb.reshape(1,-1)).reshape((-1,))
    idx = np.flip(np.argsort(probs)[-5:],axis=0)
    return(sorted_names[idx])
pred_names = np.apply_along_axis(get_preds,1,preds)
# val_iter.classes holds the class indices assigned by flow_from_directory, not
# image indices, so they are translated to folder names through
# val_iter.class_indices (ind2dir is keyed by image index and gives wrong names here).
class_idx2name = {idx: name for name, idx in val_iter.class_indices.items()}
getname = np.vectorize(lambda x: class_idx2name[x])
# Score: 1/(rank+1) for a correct name at position rank, averaged over all images.
rating = 0
for i in range(5):
    rating += 1/(i+1)*(pred_names[:,i] == getname(val_iter.classes)).astype(int)
rating.sum()/val_iter.n

### Figure out threshold for new_whale <font color="red"> Is that needed?</font>

In [None]:
batch = next(val_genuine_pairs_iterator)
s_model.predict(batch[0])

In [None]:
batch = next(val_k_impostor_pairs_iterator)
s_model.predict(batch[0])

In [None]:
batch = next(val_n_impostor_pairs_iterator)
s_model.predict(batch[0])

## Submit something

In [None]:
test_gen = image.ImageDataGenerator()
# shuffle=False: predictions made from this iterator are paired with
# test_iter.filenames when the submission is written, so images must be
# visited in file order.
test_iter = test_gen.flow_from_directory(test_path, target_size = target_size, batch_size = batch_size, color_mode = 'grayscale', shuffle=False)

### Get predictions

In [None]:
preds = model.predict_generator(test_iter)

In [None]:
# Class names in the column order of neigh.predict_proba.
sorted_names = np.sort(np.unique(y))
def get_preds(emb):
    """Return 'new_whale' followed by the four names the KNN classifier rates most probable, best first."""
    class_probs = neigh.predict_proba(emb.reshape(1,-1)).reshape((-1,))
    top4 = np.argsort(class_probs)[-4:][::-1]
    return(np.concatenate([np.array(['new_whale']), sorted_names[top4]]))

In [None]:
pred_names = np.apply_along_axis(get_preds,1,preds)

### Get ids

In [None]:
filenames = test_iter.filenames
# Strip the folder part with basename instead of a fixed-width slice, so the
# result does not depend on the length of the folder name.
ids = np.array([os.path.basename(f) for f in filenames])

In [None]:
ids

### Make file

In [None]:
subm = np.concatenate([ids.reshape(-1,1), pred_names],axis=1)

In [None]:
np.savetxt('submission_siam_3.csv', subm, fmt = '%s, %s %s %s %s %s', header = 'Image,Id', comments = '')

# Evaluate model on training data

In [None]:
eval_gen = image.ImageDataGenerator()
# shuffle=False: predictions from this iterator are scored against
# eval_iter.classes, which is only valid if images are visited in file order.
eval_iter = eval_gen.flow_from_directory(train_path, target_size = (128,128), batch_size = batch_size, color_mode = 'grayscale', shuffle=False)

## Use 'model'

In [None]:
eval_preds = model.predict_generator(eval_iter)
best_preds_idx = np.fliplr(np.argsort(eval_preds, axis=1)[:,-5:])

In [None]:
map_loss(eval_iter.classes, best_preds_idx)

## Base model

In [None]:
base_preds = np.repeat(np.array([eval_iter.class_indices[z] for z in ['new_whale', 'w_1287fbc', 'w_98baff9', 'w_7554f44', 'w_1eafe46']]).reshape(1,-1), eval_iter.n, axis=0)

In [None]:
map_loss(eval_iter.classes, base_preds)

### Is that what is supposed to be happening? The score is much higher on Kaggle test set...

In [None]:
labels['Id'].value_counts()[:5]

In [None]:
len(labels)

In [None]:
(810+34/2+27/3+26/4+23/5)/9850

In [None]:
import ml_metrics

# Submit predictions

## Generate predictions

In [None]:
import bcolz
def save_array(fname, arr):
    """Write `arr` to a bcolz carray rooted at directory `fname` (mode 'w') and flush it to disk."""
    stored = bcolz.carray(arr, rootdir=fname, mode='w')
    stored.flush()
def load_array(fname):
    """Open the bcolz array stored at `fname` and return its full contents."""
    stored = bcolz.open(fname)
    return stored[:]

In [None]:
test_gen = image.ImageDataGenerator()
# shuffle=False: predictions made from this iterator are paired with
# test_iter.filenames when the submission is written, so images must be
# visited in file order.
test_iter = test_gen.flow_from_directory(test_path, target_size = (128,128), batch_size = batch_size, color_mode = 'grayscale', shuffle=False)

In [None]:
preds = model.predict_generator(test_iter)

In [None]:
filenames = test_iter.filenames

In [None]:
save_array('preds.dat', preds)
save_array('filenames.dat', filenames)

## Make output file

In [None]:
# Strip the folder part with basename instead of a fixed-width slice, so the
# result does not depend on the length of the folder name.
ids = np.array([os.path.basename(f) for f in filenames])

In [None]:
cl_names = {ind : name for name,ind in aug_img_iter.class_indices.items()}

In [None]:
best_preds_idx = np.fliplr(np.argsort(preds, axis=1)[:,-5:])

In [None]:
f = lambda x: cl_names[x]
f = np.vectorize(f)

In [None]:
subm = np.concatenate([ids.reshape(-1,1), f(best_preds_idx)],axis=1)

In [None]:
np.savetxt('submission.csv', subm, fmt = '%s, %s %s %s %s %s', header = 'Image,Id', comments = '')