# Triplet networks and one-shot learning

## Import packages and mount data

In [None]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Conv2D, Lambda, Dense, Flatten, MaxPooling2D, Dropout, Concatenate, BatchNormalization, concatenate, ReLU, LeakyReLU
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [None]:
# Mount Google Drive (where the Omniglot pickles are stored) so the data can
# be read from the local "drive" directory inside Colab.
from google.colab import drive

# Create the mount point only when it is missing: a plain `mkdir drive` fails
# with "File exists" every time this cell is re-run.
os.makedirs("drive", exist_ok=True)
drive.mount('drive')

mkdir: cannot create directory ‘drive’: File exists
Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
# Folder on the mounted Google Drive that holds the pickled Omniglot splits.
PATH = os.path.join("drive", "My Drive", "omniglot")

# Each pickle unpacks into an image array and a dict keyed by alphabet name.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
train_file = os.path.join(PATH, "omniglot_train.p")
with open(train_file, "rb") as f:
    X_train, c_train = pickle.load(f)

test_file = os.path.join(PATH, "omniglot_test.p")
with open(test_file, "rb") as f:
    X_test, c_test = pickle.load(f)

# Report the array shapes and the alphabets contained in each split.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("")
print("training alphabets")
print(list(c_train.keys()))
print("test alphabets:")
print(list(c_test.keys()))

X_train shape: (964, 20, 105, 105)
X_test shape: (659, 20, 105, 105)

training alphabets
['Braille', 'Anglo-Saxon_Futhorc', 'Tifinagh', 'Grantha', 'Burmese_(Myanmar)', 'Mkhedruli_(Georgian)', 'Latin', 'Ojibwe_(Canadian_Aboriginal_Syllabics)', 'Balinese', 'Malay_(Jawi_-_Arabic)', 'Early_Aramaic', 'Korean', 'Japanese_(hiragana)', 'Armenian', 'Cyrillic', 'Hebrew', 'Syriac_(Estrangelo)', 'Japanese_(katakana)', 'Blackfoot_(Canadian_Aboriginal_Syllabics)', 'N_Ko', 'Alphabet_of_the_Magi', 'Inuktitut_(Canadian_Aboriginal_Syllabics)', 'Greek', 'Bengali', 'Tagalog', 'Futurama', 'Arcadian', 'Gujarati', 'Asomtavruli_(Georgian)', 'Sanskrit']
test alphabets:
['ULOG', 'Atemayar_Qelisayer', 'Ge_ez', 'Gurmukhi', 'Tengwar', 'Keble', 'Malayalam', 'Oriya', 'Kannada', 'Mongolian', 'Angelic', 'Atlantean', 'Syriac_(Serto)', 'Aurek-Besh', 'Avesta', 'Glagolitic', 'Sylheti', 'Tibetan', 'Manipuri', 'Old_Church_Slavonic_(Cyrillic)']


## Building the triplet network

We will define a triplet network for use with the Omniglot dataset. Each branch of the triplet network is a "convnet" model that transforms the input image into an embedding space.

In [None]:
# Define the convnet (one "leg" of the triplet network) that transforms a
# single image into its embedding.
input_shape = (105, 105, 1)

# The architecture is similar to that in the paper (Koch et al., "Siamese Neural Networks for One-shot Image Recognition"),
# but we include dropout and batch normalization to improve generalization and speed up training.
#
# The model is named through the public `name` constructor argument (instead
# of assigning the private `_name` attribute afterwards); later cells look it
# up with get_layer("leg").
convnet = Sequential(name="leg")

# Four convolutional blocks that differ only in their number of filters:
# Conv2D -> BatchNormalization -> ReLU -> MaxPooling2D -> Dropout.
for block_idx, n_filters in enumerate((64, 128, 128, 256)):
    if block_idx == 0:
        # Only the first layer declares the input shape.
        convnet.add(Conv2D(n_filters, (3, 3), input_shape=input_shape))
    else:
        convnet.add(Conv2D(n_filters, (3, 3)))
    convnet.add(BatchNormalization())
    convnet.add(ReLU())
    convnet.add(MaxPooling2D((2, 2)))
    convnet.add(Dropout(0.2))

convnet.add(Flatten())

# Linear activation on the embedding layer: the embedding is used directly
# for distance computations.
convnet.add(Dense(1024, activation="linear"))

convnet.summary()

Model: "leg"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_20 (Conv2D)           (None, 103, 103, 64)      640       
_________________________________________________________________
batch_normalization_20 (Batc (None, 103, 103, 64)      256       
_________________________________________________________________
re_lu_20 (ReLU)              (None, 103, 103, 64)      0         
_________________________________________________________________
max_pooling2d_20 (MaxPooling (None, 51, 51, 64)        0         
_________________________________________________________________
dropout_20 (Dropout)         (None, 51, 51, 64)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 49, 49, 128)       73856     
_________________________________________________________________
batch_normalization_21 (Batc (None, 49, 49, 128)       512     

In [None]:
# The anchor, positive and negative images arrive stacked along axis 1 of a
# single input tensor; each Lambda layer slices one of them back out.
generated = Input(shape=(3, 105, 105, 1), name='input')

anchor = Lambda(lambda x: x[:, 0])(generated)
pos = Lambda(lambda x: x[:, 1])(generated)
neg = Lambda(lambda x: x[:, 2])(generated)

# Run every image through the same convnet, so the three branches share
# their weights.
anchor_embedding, pos_embedding, neg_embedding = (
    convnet(branch) for branch in (anchor, pos, neg)
)

# Concatenate the three embeddings; the merged layer is the output of the
# triplet network and is split again inside the loss function.
merged_output = Concatenate(axis=-1, name='merged_layer')(
    [anchor_embedding, pos_embedding, neg_embedding]
)

triplet_net = Model(inputs=generated, outputs=merged_output)
triplet_net.summary()

Model: "model_142"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 3, 105, 105, 0                                            
__________________________________________________________________________________________________
lambda_12 (Lambda)              (None, 105, 105, 1)  0           input[0][0]                      
__________________________________________________________________________________________________
lambda_13 (Lambda)              (None, 105, 105, 1)  0           input[0][0]                      
__________________________________________________________________________________________________
lambda_14 (Lambda)              (None, 105, 105, 1)  0           input[0][0]                      
__________________________________________________________________________________________

## Defining the triplet loss

In [None]:
# Notice that the ground truth variable is not used for loss calculation.
# It is only part of the signature because Keras calls every loss function as
# loss(y_true, y_pred). The network structure already implies the ground
# truth: the anchor image belongs with the "positive" image.
import tensorflow as tf
def triplet_loss(ground_truth, network_output, margin=2000.0):
    """Triplet loss on the concatenated (anchor, positive, negative) embeddings.

    Args:
        ground_truth: Ignored; present only to satisfy the Keras loss signature.
        network_output: Tensor holding the anchor, positive and negative
            embeddings concatenated along the last axis (three equal parts).
        margin: Minimum amount by which the squared anchor-negative distance
            should exceed the squared anchor-positive distance. Defaults to
            2000, the value used throughout this notebook.

    Returns:
        Scalar tensor: mean over the batch of
        max(d(a, p) - d(a, n) + margin, 0), with d the squared L2 distance.
    """
    # Split along the same axis the embeddings were concatenated on.
    anchor, positive, negative = tf.split(network_output, num_or_size_splits=3, axis=-1)

    # This is an easy implementation, but also a very inefficient one because it uses offline triplet mining (https://omoindrot.github.io/triplet-loss)
    positive_distance = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
    negative_distance = tf.reduce_sum(tf.square(anchor - negative), axis=-1)

    # Triplets that already satisfy the margin contribute zero loss.
    per_triplet_loss = tf.maximum(positive_distance - negative_distance + margin, 0.0)
    return tf.reduce_mean(per_triplet_loss)

## Selecting triplets for training

#### Different selection methods

We have two different options for the triplet selection method, and we will compare the model performance under these two methods after building our model.

(1) Random triplet selection, which consists of the following steps:
* Pick one random class for the anchor
* Pick two different random pictures from this class as the anchor and positive images
* Pick another class for the negative, different from the anchor class
* Pick one random picture from the negative class.

(2) Hard triplet selection. To keep the implementation simple, for each picked (anchor, positive) pair we choose the hardest negative to form a hard triplet: after picking the anchor and positive images, we choose the image from the negative class whose embedding is nearest to the anchor's embedding, i.e., the one for which $-d(a, n)$ is maximal. The whole process consists of the following steps:
* Pick one random class for the anchor
* Pick two different random pictures from this class as the anchor and positive images
* Pick another class for the negative, different from the anchor class
* Pick the hardest picture from the negative class.

In [None]:
def _hardest_negative_index(embed_model, anchor_img, negative_imgs):
    """Return the index of the negative image whose embedding is closest (L2) to the anchor's."""
    n_negatives, w, h = negative_imgs.shape

    # Embed the anchor together with all candidate negatives in one predict
    # call, with an explicit trailing channel axis.
    images = np.concatenate([anchor_img[np.newaxis], negative_imgs]).reshape(n_negatives + 1, w, h, 1)
    embeddings = embed_model.predict(images)
    embeddings = embeddings.reshape(len(embeddings), -1)

    # Row 0 is the anchor; the remaining rows are the negatives.
    distances = np.linalg.norm(embeddings[1:] - embeddings[0], axis=1)
    return int(np.argmin(distances))


# Notice that the returned np.zeros(batch_size) only exists to satisfy the
# Keras (inputs, targets) contract; it corresponds to the unused ground_truth
# argument of triplet_loss.
# The hard_selection argument controls the selection method: False selects
# triplets at random, True selects hard triplets.
def get_batch(batch_size, X, hard_selection):
    """Endlessly yield batches of (anchor, positive, negative) image triplets.

    Args:
        batch_size: Number of triplets per batch.
        X: Image array of shape (n_classes, n_examples, w, h).
        hard_selection: If False, the negative image is drawn at random from
            the negative class. If True, the image of the negative class whose
            embedding is closest to the anchor's embedding is used; the
            embedding comes from the "leg" layer of the global `triplet_net`.

    Yields:
        A tuple (triplets, dummy_targets): `triplets` has shape
        (batch_size, 3, w, h, 1) and `dummy_targets` is an all-zero array of
        shape (batch_size,).

    Raises:
        ValueError: If X has fewer than two classes or fewer than two
            examples per class, since no triplet can be formed then.
    """
    n_classes, n_examples, w, h = X.shape
    if n_classes < 2 or n_examples < 2:
        raise ValueError(
            "Need at least 2 classes and 2 examples per class to build triplets, "
            "got X with shape {}".format(X.shape)
        )

    # The embedding model is only needed for hard selection. It shares its
    # weights with triplet_net, so it always reflects the current training state.
    subset_model = None
    if hard_selection:
        leg = triplet_net.get_layer("leg")
        subset_model = Model(inputs=leg.get_input_at(0), outputs=leg.get_output_at(0))

    while True:
        triplets = []

        for _ in range(batch_size):
            # Pick one random class for the anchor
            anchor_class = np.random.randint(0, n_classes)

            # Pick two different random pics from this class => idx_A and idx_P
            idx_A, idx_P = np.random.choice(n_examples, size=2, replace=False)

            # Pick another class for the negative, uniformly among the
            # remaining classes: draw from n_classes - 1 slots and skip over
            # the anchor class.
            negative_class = np.random.randint(0, n_classes - 1)
            if negative_class >= anchor_class:
                negative_class += 1

            if hard_selection:
                # Pick the hardest pic from the negative class => N
                idx_N = _hardest_negative_index(
                    subset_model, X[anchor_class, idx_A], X[negative_class]
                )
            else:
                # Pick a random pic from the negative class => N
                idx_N = np.random.randint(0, n_examples)

            triplets.append([
                X[anchor_class, idx_A].reshape(w, h, 1),
                X[anchor_class, idx_P].reshape(w, h, 1),
                X[negative_class, idx_N].reshape(w, h, 1),
            ])

        yield np.array(triplets), np.zeros(batch_size)

## One-shot learning with different selection methods

In [None]:
def make_oneshot_task(N, X, c, language=None):
    """Build one N-way one-shot task: N (test image, support image) pairs plus targets.

    Args:
        N: Number of classes in the support set.
        X: Image array of shape (n_classes, n_examples, w, h).
        c: Dict mapping a language name to a (low, high) pair of class indices;
            only consulted when `language` is given.
        language: If given, the N classes are drawn from range(low, high) of
            that language; otherwise from all classes.

    Returns:
        (pairs, targets): `pairs` is [test_image, support_set], each of shape
        (N, w, h, 1), where test_image repeats one image N times. `targets`
        has shape (N,) and is 1 at the support image that shares the test
        image's class and 0 elsewhere.

    Raises:
        ValueError: If N exceeds high - low for the requested language.
    """
    n_classes, n_examples, w, h = X.shape
    indices = np.random.randint(0, n_examples, size=(N,))

    # Decide which pool of classes the task is drawn from.
    if language is None:
        # No language specified: just pick a bunch of random letters.
        pool = range(n_classes)
    else:
        low, high = c[language]
        if N > high - low:
            raise ValueError("This language ({}) has less than {} letters".format(language, N))
        pool = range(low, high)
    categories = np.random.choice(pool, size=(N,), replace=False)

    # The first drawn class is the true one; take two distinct examples of it,
    # one as the test image and one for the support set.
    true_category = categories[0]
    ex1, ex2 = np.random.choice(n_examples, replace=False, size=(2,))
    test_image = np.repeat(X[true_category, ex1][np.newaxis], N, axis=0).reshape(N, w, h, 1)

    # Fancy indexing returns a copy, so overwriting slot 0 leaves X untouched.
    support_set = X[categories, indices]
    support_set[0] = X[true_category, ex2]
    support_set = support_set.reshape(N, w, h, 1)

    targets = np.zeros((N,))
    targets[0] = 1

    # Shuffle all three consistently so the match is not always at index 0.
    targets, test_image, support_set = shuffle(targets, test_image, support_set)
    return [test_image, support_set], targets

In [None]:
def test_oneshot(model, X, c, N=20, k=250, language=None, verbose=True):
    """Test average N-way one-shot accuracy of an embedding model over k one-shot tasks.

    Args:
        model: Object whose `predict` maps a batch of images to embeddings.
        X: Image array passed on to make_oneshot_task.
        c: Language dictionary passed on to make_oneshot_task.
        N: Number of classes per one-shot task.
        k: Number of one-shot tasks to evaluate; must be at least 1.
        language: Optional language passed on to make_oneshot_task.
        verbose: If True, print progress and the resulting accuracy.

    Returns:
        Percentage (0-100) of tasks in which the support image closest to the
        test image (in L2 distance between embeddings) was the correct one.

    Raises:
        ValueError: If k is smaller than 1 (the average would be undefined).
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))

    n_correct = 0

    if verbose:
        print("Evaluating model on {} random {}-way one-shot learning tasks ...".format(k, N))

    for _ in range(k):
        # Create a one-shot task
        inputs, targets = make_oneshot_task(N, X, c, language=language)

        # 1. For a given one-shot task, obtain embeddings for the test image as well as the support set.
        test_embeddings = model.predict(inputs[0])
        support_embeddings = model.predict(inputs[1])

        # L2 distance of every (test, support) pair, computed in one call.
        differences = (test_embeddings - support_embeddings).reshape(len(support_embeddings), -1)
        distances = np.linalg.norm(differences, axis=1)

        # 2. Pick the image from the support set that is closest (in L2-distance) to the test image as the one-shot prediction.
        if np.argmin(distances) == np.argmax(targets):
            n_correct += 1

    percent_correct = (100.0 * n_correct / k)

    if verbose:
        print("Got an average of {}% accuracy for {}-way one-shot learning".format(percent_correct, N))
    return percent_correct

## Evaluate one-shot learning with random triplet selection

In [None]:
def train(model, X_train, hard_selection=False, batch_size=64, steps_per_epoch=100, epochs=1):
    """Fit `model` on triplet batches generated from `X_train`.

    Args:
        model: Model to train; its `fit` method is called with the generator.
        X_train: Image array handed to get_batch.
        hard_selection: Forwarded to get_batch to choose the selection method.
        batch_size: Number of triplets per batch.
        steps_per_epoch: Number of batches drawn from the generator per epoch.
        epochs: Number of epochs to train for.
    """
    # get_batch is an endless generator, so steps_per_epoch bounds each epoch.
    triplet_batches = get_batch(batch_size, X_train, hard_selection)
    model.fit(triplet_batches, steps_per_epoch=steps_per_epoch, epochs=epochs)

## Evaluate one-shot learning with random and hard triplet selection

In [None]:
# Random triplet selection
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
triplet_net.compile(loss=triplet_loss, optimizer=Adam(learning_rate=0.0001))

# Embedding model used for evaluation. It shares its layers (and therefore
# its weights) with the "leg" of triplet_net, so it is built once here rather
# than in every loop.
leg = triplet_net.get_layer("leg")
subset_model = Model(inputs=leg.get_input_at(0), outputs=leg.get_output_at(0))

loops = 20
best_acc_random = 0
for i in range(loops):
    print("=== Training loop {} ===".format(i+1))
    # Train for one epoch, then measure the one-shot accuracy on the test set.
    train(triplet_net, X_train, hard_selection=False, batch_size=64, steps_per_epoch=100, epochs=1)
    test_acc = test_oneshot(subset_model, X_test, c_test)

    # Keep a checkpoint of the best model so far (ties overwrite it).
    if test_acc >= best_acc_random:
        print("********* New best one-shot accuracy, saving model ********")
        triplet_net.save(os.path.join(".", "triplet_net_with_random_selection.h5"))
        best_acc_random = test_acc

=== Training loop 1 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 44.4% accuracy for 20-way one-shot learning
********* New best one-shot accuracy, saving model ********
=== Training loop 2 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 55.2% accuracy for 20-way one-shot learning
********* New best one-shot accuracy, saving model ********
=== Training loop 3 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 54.4% accuracy for 20-way one-shot learning
=== Training loop 4 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 60.0% accuracy for 20-way one-shot learning
********* New best one-shot accuracy, saving model ********
=== Training loop 5 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 54.4% accuracy for 20-way one-shot learning
=== Training loop 6 ===
Evaluating model on 250 random 20-wa

In [None]:
# Hard triplet selection
# NOTE(review): compiling again creates a new optimizer but does not
# re-initialise triplet_net's weights. If the notebook is run top to bottom,
# this experiment therefore starts from the weights left by the previous
# training cell; rebuild the model first for an independent comparison.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
triplet_net.compile(loss=triplet_loss, optimizer=Adam(learning_rate=0.0001))

# Embedding model used for evaluation. It shares its layers (and therefore
# its weights) with the "leg" of triplet_net, so it is built once here rather
# than in every loop.
leg = triplet_net.get_layer("leg")
subset_model = Model(inputs=leg.get_input_at(0), outputs=leg.get_output_at(0))

loops = 20
best_acc_hard = 0
for i in range(loops):
    print("=== Training loop {} ===".format(i+1))
    # Train for one epoch, then measure the one-shot accuracy on the test set.
    train(triplet_net, X_train, hard_selection=True, batch_size=64, steps_per_epoch=100, epochs=1)
    test_acc = test_oneshot(subset_model, X_test, c_test)

    # Keep a checkpoint of the best model so far (ties overwrite it).
    if test_acc >= best_acc_hard:
        print("********* New best one-shot accuracy, saving model ********")
        triplet_net.save(os.path.join(".", "triplet_net_with_hard_selection.h5"))
        best_acc_hard = test_acc

=== Training loop 1 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 47.6% accuracy for 20-way one-shot learning
********* New best one-shot accuracy, saving model ********
=== Training loop 2 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 60.4% accuracy for 20-way one-shot learning
********* New best one-shot accuracy, saving model ********
=== Training loop 3 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 62.4% accuracy for 20-way one-shot learning
********* New best one-shot accuracy, saving model ********
=== Training loop 4 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 70.0% accuracy for 20-way one-shot learning
********* New best one-shot accuracy, saving model ********
=== Training loop 5 ===
Evaluating model on 250 random 20-way one-shot learning tasks ...
Got an average of 68.8% accuracy for 20-way one-shot learning


## Evaluation

In [None]:
# Summarise the best one-shot accuracy recorded by each of the two training loops.
print(f"Best accuracy using random triplets: {best_acc_random}\nBest accuracy using hard triplets: {best_acc_hard}")

Best accuracy using random triplets: 78.4
Best accuracy using hard triplets: 84.4


Before we evaluate our model and compare the performance of random triplets against hard triplets, we will first provide some motivation regarding our model architecture (specifically, the architecture of a single "leg" of the model) and the margin we used for comparing the random triplets against the hard triplets.


### Determining the model
Through experimenting, we observed that the architecture of one single "leg" of the model has a significant impact on the resulting accuracy. Initially, we started with a simple baseline model. We continued by extending this model and varying the number of filters in each convolutional layer. We tried varying the number of convolutional blocks, including batch normalization, max pooling, and dropout. We use max-pooling layers to reduce overfitting and to keep the number of parameters on the low side, which in turn decreases the training time (we explain later why this is important). We use dropout to reduce overfitting since it is a straightforward and effective way to do this. We also experimented with the optimizer and the batch size and settled on Adam with a batch size of 64 since this combination seemed to perform well in most cases. As our final layer, we use a linear activation function since we want to preserve the absolute distances between embeddings. Furthermore, we noticed that other activation functions (e.g., sigmoid) did not significantly speed up training.

While we designed our model architecture (the "legs"), we wanted to keep the model relatively simple; we argue that this assignment was not given to us to spend hours and hours on finding the best model for the "legs", but rather to get acquainted with the concepts and tinker with aspects such as the margin, the model, and the selection method to get insights on how these networks work. Furthermore, keeping our model simple ensured that we "only" have a couple of million parameters. This was convenient since it resulted in shorter training times, which was especially useful for hard triplet selection.

Ultimately, we found that our current model performed the "best". It is hard to argue what the "best" is in this environment since several processes are performed at "random" (e.g., picking the anchor and positive image class, picking the anchor and positive images, and picking the negative image class). For that reason, we tried to use the hard triplet selection method to verify whether a model performed better or not. We limited the number of loops to three and used this to decide what model performed best. Since we did not want to put all our trust in three loops, whose result can still be somewhat "random" because the negative image class is picked at "random", we also decided to run the model with the random triplets selection method for 10 loops. We are aware that this method is not perfect for comparing models. However, we observed that some models resulted in accuracies between 30% and 40% in most of the loops. In contrast, other models resulted in accuracies that were consistently at least 60% or higher after several loops.

### Determining the margin
It is interesting to tinker with the margin of our triplet loss and see how varying margins influence the results. From the lecture notes, we know that if the margin is low, we have relatively few triplets $(x_a, x_p, x_n)$ that are semi-hard, that is, $||f(x_p) - f(x_a)||^2 < ||f(x_n) - f(x_a)||^2 < ||f(x_p) - f(x_a)||^2 + \alpha$, compared to when the margin is higher. Such a reduction in the number of semi-hard triplets implies an increase in the number of easy negatives. From the lecture notes, we know that easy negatives are less informative and will not contribute any gradients or fewer gradients to our training. This means that the training is less effective, and our model is expected to not achieve as good results as it could.

On the other hand, what happens if the margin is "too large"? To make life easier, assume that the optimal value of the margin (the margin value that leads to the selection of the best triplets) equals 1500. If we now pick a margin that equals 2000, we allow more triplets to be semi-hard negatives than we would have if we had chosen the optimal margin of 1500. This means that we allow triplets that are easy negatives when we consider the optimal margin to be semi-hard negatives with our margin. Since we now use these new semi-hard triplets for training our model, we basically use triplets that are easy negatives in our optimal situation. By definition of our optimal margin, these new semi-hard triplets that are introduced by setting the margin to 2000 carry less information than the semi-hard triplets that exist when using the optimal margin. Hence, we expect our model to learn less from these new semi-hard triplets introduced by increasing the margin higher than the optimal margin. As a result, we expect our model to learn less and, on average, the accuracies to be lower. The more we increase our margin (i.e., the larger the difference between the optimal and our margin), the more "less informative" semi-hard triplets we use for training. Hence, as we increase the margin more and more, we expect the performance of the model to go down, since we use more and more "less informative" semi-hard triplets.

Therefore, we expect that there is a sweet spot for the optimal margin value. We played with small, medium, and large margin values and compared the obtained results. We determined what small, medium, and large margins are in our case by comparing the distances between the anchor and positive, and anchor and negative embeddings using the `tf.print()` statement. We tried quite a few margin values, and we, indeed, observed that for small margin values such as 0, 50, and 100, the accuracy was not as high as the maximum accuracies we observed, which were obtained with a margin of 2000. On the other side of the spectrum, we observed that for large margin values such as 5000 and 10000, the accuracy was also not as high as the maximum accuracies we observed for a margin of 2000. In the end, the best margin value we found was 2000, which resulted in an accuracy of 78.4% using the random triplet selection method and an accuracy of 84.4% using the hard triplet selection method. Note that these accuracies are the highest accuracies observed in 20 loops. We also observed that for the random triplet selection method, many of the accuracies obtained after 12 loops were higher than 70%. On the other hand, after 12 loops for the hard triplet selection method, we observed that most accuracies were higher than 79%.

It must also be noted that determining the margin value is difficult since the embedding changes as the training proceeds. In [3], Sun et al. also mention that determining a proper margin was one of their practical difficulties. Is there a smarter way to pick the margin? It seems like there is. Zhang et al. [2] propose a novel multi-stage training strategy that learns incremental triplet margin and improves triplet loss effectively. It would be interesting to explore this concept in the future.


### Comparing random triplets to hard triplets.
To reiterate, under the random triplet selection method, we achieved an accuracy of 78.4%. Under the hard triplet selection method, we achieved an accuracy of 84.4%. Comparing the highest accuracies obtained is not necessarily the best comparison since it is possible the random triplet selection results in an extremely high accuracy simply by luck (since picking the negative image class and the negative image in that class is done at random). Overall, however, we indeed observe that hard triplet selection achieves higher accuracies on a consistent basis (i.e., it does not perform better in 1 of the 20 loops, but it performs better in most loops), which makes sense since the model is given triplets that are, in most cases, more informative, theoretically speaking.

However, it must also be noted that the hard triplet selection required significantly longer to train, namely around 310 seconds. This is significantly longer than the 33 seconds for random triplet selection. This would be a serious disadvantage if we were to train our model for longer periods.

In the future, it would be interesting to explore the strategy proposed by Zhang et al. [2] to pick a margin. Furthermore, it would be exciting to implement online triplet mining and see how that compares to our setup.

[2] https://arxiv.org/pdf/1812.06576.pdf

[3] https://arxiv.org/pdf/1406.4773.pdf