In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics.pairwise import cosine_similarity

# Triplet Loss

For an image $x$, its encoding is denoted as $f(x)$, where $f$ is the function computed by the neural network.

<div style="text-align: center;">
    <img src="images/f_x.png" style="width:400px;height:200px;">
</div>

### Triplet Training:
- Training will use **triplets of images** $(A, P, N)$, where:
    - **A** is the "Anchor" image — a picture of a person.
    - **P** is the "Positive" image — a picture of the same person as the Anchor.
    - **N** is the "Negative" image — a picture of a different person than the Anchor.

- These triplets are selected from the training dataset. Let $(A^{(i)}, P^{(i)}, N^{(i)})$ denote the $i$-th training example.

- You aim to ensure that an image $A^{(i)}$ (Anchor) is **closer** to the Positive $P^{(i)}$ than to the Negative $N^{(i)}$, by at least a margin $\alpha$:

$$
|| f\left(A^{(i)}\right)-f\left(P^{(i)}\right)||_{2}^{2}+\alpha<|| f\left(A^{(i)}\right)-f\left(N^{(i)}\right)||_{2}^{2}
$$

### Triplet Loss:
- We define the following **triplet loss** function:

$$
\mathcal{J} = \sum^{m}_{i=1} \left[ \underbrace{\mid \mid f(A^{(i)}) - f(P^{(i)}) \mid \mid_2^2}_\text{(1)} - \underbrace{\mid \mid f(A^{(i)}) - f(N^{(i)}) \mid \mid_2^2}_\text{(2)} + \alpha \right]_+
\tag{3}
$$

> **Note**: The notation "$[z]_+$" denotes $\max(z, 0)$.

### Key Points:
- Term (1): Squared distance between the **anchor** (A) and the **positive** (P) for a given triplet; you want this to be small.
- Term (2): Squared distance between the **anchor** (A) and the **negative** (N) for a given triplet; you want this to be large
- **$\alpha$**: This is the margin—a manually chosen hyperparameter. Here, $\alpha = 0.2$.

> **Note**: In most implementations, the encoding vectors are rescaled to have an L2 norm equal to 1 (i.e., $\mid \mid f(img) \mid \mid_2 = 1$). You won’t need to handle this here.


In [2]:
def triplet_loss(Y_true, Y_pred, alpha=0.2):
    """
    Implementation of the triplet loss as defined by formula (3)
    
    Arguments:
    Y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
    Y_pred -- Python list containing three objects:
            anchor -- the encodings for the anchor images, of shape (None, 128)
            positive -- the encodings for the positive images, of shape (None, 128)
            negative -- the encodings for the negative images, of shape (None, 128)
    
    Returns:
    loss -- real number, value of the loss
    """
    
    anchor, positive, negative = Y_pred[0], Y_pred[1], Y_pred[2]

    pos_dist = tf.subtract(anchor, positive)  # (None, n_features)
    pos_dist = tf.square(pos_dist)  # (None, n_features)
    pos_dist = tf.reduce_sum(pos_dist, axis=-1)  # (None, )

    neg_dist = tf.subtract(anchor, negative)  # (None, n_features)
    neg_dist = tf.square(neg_dist)  # (None, n_features)
    neg_dist = tf.reduce_sum(neg_dist, axis=-1)  #  (None, )

    loss = tf.add(tf.subtract(pos_dist, neg_dist), alpha)  # (None, )
    loss = tf.maximum(loss, 0)  # (None, )
    loss = tf.reduce_sum(loss)
    return loss

# Pre-trained Model

For the face recognition task, the preferred model is **Inception ResNet v1**. However, since **Inception ResNet v1** is not accessible, I will be using **Inception ResNet v2** (just for the purpose of study). Below are the details of both models:

- `Inception ResNet v1`
    - **Input shape**: (160, 160, 3)
    - **Output**: 128 classes (representing face embeddings)
    - **Purpose**: Face recognition, typically used for extracting face features for identification and verification
    - **Dataset**: Commonly trained on specialized face datasets such as: `VGGFace2`, `MS-Celeb-1M`, `CASIA-WebFace`
    - ---
- `Inception ResNet v2`
    - **Input shape**: (299, 299, 3)
    - **Output**: 1000 classes (representing general object categories)
    - **Purpose**: Generic image classification, commonly used to classify objects like animals, vehicles, and other everyday objects
    - **Dataset**: Typically trained on the `ImageNet` dataset, which includes 1000 different object categories
    - ---
> In summary, if **Inception ResNet v1** becomes accessible, it should be your preferred choice for face recognition tasks, as it delivers superior performance in this domain.

In [3]:
inception_resnet_v2 = keras.applications.InceptionResNetV2()
inception_resnet_v2.summary()




# Face Embedding

In [4]:
def extract_features(image_path, 
                     image_size=(299, 299), 
                     model='inception_resnet_v2'):
    img = keras.preprocessing.image.load_img(image_path, target_size=image_size)
    
    img = np.array(img)
    img = np.around(img / 255.0, decimals=12)
    
    img = np.expand_dims(img, axis=0)

    if model == 'inception_resnet_v1':
        feature_vector = inception_resnet_v1.predict(img)
    elif model == 'inception_resnet_v2':
        feature_vector = inception_resnet_v2.predict(img)
        
    vector_length = np.linalg.norm(feature_vector, ord=2)
    
    return feature_vector / vector_length

In [5]:
base_path = "images/human_faces/"

names = ["danielle", "younes", "tian", "andrew", "kian", "dan", 
         "sebastiano", "bertrand", "kevin", "felix", "benoit", "arnaud"]

In [6]:
database = {
    name: extract_features(f"{base_path}{name}.jpg") for name in names
}

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 304ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 315ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 311ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 329ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step


In [7]:
for i, j in database.items():
    print(i, j.shape)

danielle (1, 1000)
younes (1, 1000)
tian (1, 1000)
andrew (1, 1000)
kian (1, 1000)
dan (1, 1000)
sebastiano (1, 1000)
bertrand (1, 1000)
kevin (1, 1000)
felix (1, 1000)
benoit (1, 1000)
arnaud (1, 1000)


# Face Verification

**Face Verification** "Is this the claimed person?" For example, at some airports, you can pass through customs by letting a system scan your passport and then verifying that you (the person carrying the passport) are the correct person. A mobile phone that unlocks using your face is also using face verification. This is a 1:1 matching problem.

In [8]:
def verify(image_path, identity, database, threshold=1):
    
    features_from_camera = extract_features(image_path)
    features_from_database = database[identity]

    dist = np.linalg.norm(features_from_camera - features_from_database, ord=2)
    
    if dist < threshold:  # 0.7 is more appropriate but I have to use 1 since I dont use Inception Resnet v1, just to test, and the result still sucks
        print(f"It's {identity}, welcome home!")
        open_door = True
    else:
        print(f"It's not {identity}, please go away.")
        open_door = False

    return dist, open_door

In [9]:
for i in range(6):
    print(verify(f"images/human_faces/camera_{i}.jpg", "younes", database))
    print('-'*100)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step
It's not younes, please go away.
(1.0055572, False)
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321ms/step
It's not younes, please go away.
(1.0951583, False)
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 340ms/step
It's not younes, please go away.
(1.0376465, False)
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step
It's not younes, please go away.
(1.102509, False)
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step
It's not younes, please 

In [10]:
for i in range(6):
    print(verify(f"images/human_faces/camera_{i}.jpg", "kian", database))
    print('-'*100)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 314ms/step
It's not kian, please go away.
(1.4089334, False)
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step
It's not kian, please go away.
(1.4082857, False)
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 324ms/step
It's not kian, please go away.
(1.4087815, False)
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 324ms/step
It's not kian, please go away.
(1.4008085, False)
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step
It's not kian, please go away.


# Face Recognition

**Face Recognition** "Who is this person?" For example, the video lecture showed a [face recognition video](https://www.youtube.com/watch?v=wr4rx0Spihs) of Baidu employees entering the office without needing to otherwise identify themselves. This is a 1:K matching problem.

Your face verification system is mostly working. But since Kian got his ID card stolen, when he came back to the office the next day he couldn't get in!

To solve this, you'd like to change your face verification system to a face recognition system. This way, no one has to carry an ID card anymore. An authorized person can just walk up to the building, and the door will unlock for them!

You'll implement a face recognition system that takes as input an image, and figures out if it is one of the authorized persons (and if so, who). Unlike the previous face verification system, you will no longer get a person's name as one of the inputs.


In [11]:
def identify(image_path, database, threshold):
    features_from_camera = extract_features(image_path)

    min_dist = 100
    for name, features in database.items():
        dist = np.linalg.norm(features - features_from_camera, ord=2)
        if dist < min_dist:
            min_dist = dist
            identity = name

    if min_dist > threshold:  # 0.7 is more appropriate but I have to use 1 since I dont use Inception Resnet v1, just to test, and the result still sucks
        print("Not in the database.")
    else:
        print(f"It's {identity}, welcome home!")

    return min_dist, identity

In [12]:
for i in range(6):
    print(identify(f"images/human_faces/camera_{i}.jpg", database))
    print('-'*100)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300ms/step
It's benoit, welcome home!
(0.27198675, 'benoit')
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 339ms/step
It's sebastiano, welcome home!
(0.22065023, 'sebastiano')
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step
It's arnaud, welcome home!
(0.15413554, 'arnaud')
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step
It's bertrand, welcome home!
(0.19792324, 'bertrand')
----------------------------------------------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 314ms/step
It's arnaud, welcom

# References
1. Florian Schroff, Dmitry Kalenichenko, James Philbin (2015). [FaceNet: A Unified Embedding for Face Recognition and Clustering](https://arxiv.org/pdf/1503.03832.pdf)

2. Yaniv Taigman, Ming Yang, Marc'Aurelio Ranzato, Lior Wolf (2014). [DeepFace: Closing the gap to human-level performance in face verification](https://research.fb.com/wp-content/uploads/2016/11/deepface-closing-the-gap-to-human-level-performance-in-face-verification.pdf)

3. This implementation also took a lot of inspiration from the official FaceNet github repository: https://github.com/davidsandberg/facenet

4. Further inspiration was found here: https://machinelearningmastery.com/how-to-develop-a-face-recognition-system-using-facenet-in-keras-and-an-svm-classifier/

5. And here: https://github.com/nyoki-mtl/keras-facenet/blob/master/notebook/tf_to_keras.ipynb