# Imports and Setup

In [1]:
# install facenet-pytorch package
!pip install facenet-pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting facenet-pytorch
  Downloading facenet_pytorch-2.5.3-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: facenet-pytorch
Successfully installed facenet-pytorch-2.5.3


In [2]:
# imports
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
import numpy as np
import pandas as pd
import os

# value passed as num_workers to the DataLoaders: 0 when os.name is 'nt', 4 otherwise
workers = 4 if os.name != 'nt' else 0

In [3]:
# mount Google Drive at /content/drive (requires the google.colab runtime)
# NOTE(review): force_remount=True presumably re-mounts even when Drive is already
# mounted — confirm against the google.colab documentation
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# change directory to the root of the mounted Google Drive
%cd /content/drive/MyDrive  
# change directory to the EE443 FinalProject folder; the relative paths used in the
# rest of the notebook ('val', 'val/val.txt', 'test', 'testing.txt') resolve from here
%cd ./EE443/FinalProject

/content/drive/MyDrive
/content/drive/.shortcut-targets-by-id/1QEy2Veehr65v4Rnyo-FHxdruyA8PgnGA/EE443/FinalProject


In [5]:
# select the compute device: the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Running on device: {}'.format(device))

Running on device: cuda:0


In [6]:
# initialize the MTCNN face detection module with default settings,
# spelled out one keyword per line and placed on the selected device
mtcnn = MTCNN(
    image_size=160,
    margin=0,
    min_face_size=20,
    thresholds=[0.6, 0.7, 0.7],
    factor=0.709,
    post_process=True,
    device=device,
)

In [7]:
# initialize the inception resnet with weights pretrained on the vggface2 dataset,
# move it to the selected device and put it in eval mode (it is only called for
# forward passes in this notebook)
resnet = InceptionResnetV1(pretrained='vggface2')
resnet = resnet.to(device)
resnet = resnet.eval()

  0%|          | 0.00/107M [00:00<?, ?B/s]

# Evaluate Performance on Validation Data

In [8]:
# collate function handed to the DataLoader
def collate_fn(x):
    """Return the first element of the batch list ``x`` unchanged (no stacking)."""
    first_sample = x[0]
    return first_sample

# choose validation data: images are read from the 'val' folder, relative to the
# current working directory
dataset = datasets.ImageFolder('val')
# no batch_size is passed and collate_fn returns x[0], so each iteration presumably
# yields a single (image, label) sample
# NOTE(review): later cells pair entries 2*i and 2*i+1, which relies on the order in
# which ImageFolder enumerates the files — confirm it matches the order in val/val.txt
loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=workers)

In [9]:
# detect and align the face in each image
# Every image must contribute exactly one aligned face: later cells compare entries
# 2*i and 2*i+1, so silently skipping an image would shift every following pair.
# Images without a detection are collected and reported as an error after the loop.
aligned = []
missed = []
for idx, (x, y) in enumerate(loader):
    x_aligned, prob = mtcnn(x, return_prob=True)
    if x_aligned is None:
        missed.append(idx)
        continue
    print('Face detected with probability: {:8f}'.format(prob))
    aligned.append(x_aligned)

if missed:
    raise RuntimeError(
        'No face detected in image(s) at dataset index {}; '
        'the pairwise comparison would be misaligned'.format(missed)
    )

Face detected with probability: 0.999960
Face detected with probability: 0.999977
Face detected with probability: 1.000000
Face detected with probability: 0.999979
Face detected with probability: 0.999996
Face detected with probability: 0.999384
Face detected with probability: 0.999963
Face detected with probability: 0.999880
Face detected with probability: 0.999999
Face detected with probability: 0.999953
Face detected with probability: 0.999880
Face detected with probability: 0.999987
Face detected with probability: 0.999971
Face detected with probability: 0.999699
Face detected with probability: 0.999819
Face detected with probability: 0.999991
Face detected with probability: 0.999979
Face detected with probability: 0.996858
Face detected with probability: 0.999885
Face detected with probability: 0.999971
Face detected with probability: 0.999854
Face detected with probability: 1.000000
Face detected with probability: 0.999240
Face detected with probability: 0.999881
Face detected wi

In [10]:
# pass the aligned faces through the resnet to get the FaceNet embeddings
# (note: this rebinds `aligned` from a list of face tensors to one stacked batch tensor)
aligned = torch.stack(aligned).to(device)
# inference only: disable autograd so no computation graph is kept for the whole batch
with torch.no_grad():
    embeddings = resnet(aligned).cpu()

In [11]:
# calculate the distance between the two images of each pair as the Euclidean (L2)
# norm of the difference of their embeddings; rows 2*i and 2*i+1 form pair i
if embeddings.shape[0] % 2 != 0:
    raise ValueError(
        'expected an even number of embeddings, got {}'.format(embeddings.shape[0])
    )
# vectorized over all pairs: the number of pairs follows the data instead of a
# hard-coded count, and the result is kept as a float64 numpy array
dists = (embeddings[0::2] - embeddings[1::2]).norm(p=2, dim=1).numpy().astype(np.float64)

In [12]:
# we notice almost all the images of the same person have a distance < 1,
# while images of different people have a distance > 1
# (this observation is the basis for the threshold of 1 used for the predictions below)
print(dists)

[0.79012734 0.63600713 0.69700629 0.57419777 0.51121759 0.86554986
 0.7226277  0.50012213 0.7678082  0.5448277  0.59487498 0.54517043
 0.54393071 0.54310936 0.59262335 0.47543904 0.50062203 0.45440468
 0.5488553  0.60421038 1.48392057 1.14231873 1.42739642 1.18204725
 1.43106306 1.61400449 0.57709128 1.18898976 1.21715522 1.17669261
 1.38067961 1.29715931 1.37809253 1.21578753 1.27305973 1.21488333
 1.19465709 1.16543436 1.21804476 1.00786293 1.1680088  1.08924377
 1.04165995 1.11382699 1.07124996 1.07234597 1.35489428 1.16439116
 1.34280479 1.15742719 0.45784411 0.58190948 0.76706356 0.84657317
 0.61378306 0.5873211  0.57680929 0.6854369  0.60287029 0.60402066
 0.63950592 0.67248708 1.13848352 0.46741477 0.44723389 0.87806499
 0.76923317 0.52662665 0.69749236 0.65280777 0.78832406 0.48510972
 0.54233164 0.52812558 0.63791198 1.52328444 1.4107101  1.4179045
 1.55062759 1.27548337 1.35484731 1.43746698 1.29628956 1.48562014
 1.55929732 1.52635098 1.21875894 1.31141055 1.51513159 1.47091

In [13]:
# predict the faces to be the same person if distance is < 1
preds = (dists < 1).astype(int)

# get ground truth labels from the val.txt file: the first line is skipped
# (presumably a header) and the label is the second ', '-separated field of each row;
# the with-block closes the file as soon as it has been read
with open('val/val.txt', "r") as label_file:
    label_lines = label_file.read().splitlines()[1:]
# ignore blank lines so a missing or extra trailing newline neither drops nor breaks a row
gt_labels = [line.split(', ')[1] for line in label_lines if line.strip()]
gt_labels = np.array(gt_labels, dtype = int)

# compare the predicted labels to the ground truth labels
print(gt_labels == preds)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True]


In [14]:
# calculate the fraction of pairs predicted correctly (a value in [0, 1]);
# taking the mean of the boolean match array avoids hard-coding the number of pairs
print(np.mean(gt_labels == preds))

0.98


# Generating Test Predictions

In [15]:
# collate function handed to the DataLoader
# (same definition as the collate_fn in the validation section)
def collate_fn(x):
    """Return the first element of the batch list ``x`` unchanged (no stacking)."""
    first_sample = x[0]
    return first_sample
    
# choose test data: images are read from the 'test' folder, relative to the
# current working directory; this rebinds `dataset` and `loader` from the validation run
dataset = datasets.ImageFolder('test')
# no batch_size is passed and collate_fn returns x[0], so each iteration presumably
# yields a single (image, label) sample
# NOTE(review): later cells pair entries 2*i and 2*i+1, which relies on the order in
# which ImageFolder enumerates the files — confirm it matches the expected pair ids
loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=workers)

In [16]:
# detect and align the face in each image
# Every image must contribute exactly one aligned face: later cells compare entries
# 2*i and 2*i+1, so silently skipping an image would shift every following pair.
# Images without a detection are collected and reported as an error after the loop.
aligned = []
missed = []
for idx, (x, y) in enumerate(loader):
    x_aligned, prob = mtcnn(x, return_prob=True)
    if x_aligned is None:
        missed.append(idx)
        continue
    print('Face detected with probability: {:8f}'.format(prob))
    aligned.append(x_aligned)

if missed:
    raise RuntimeError(
        'No face detected in image(s) at dataset index {}; '
        'the pairwise comparison would be misaligned'.format(missed)
    )

Face detected with probability: 0.999872
Face detected with probability: 0.999998
Face detected with probability: 0.999880
Face detected with probability: 0.999971
Face detected with probability: 1.000000
Face detected with probability: 0.999958
Face detected with probability: 0.999441
Face detected with probability: 0.999970
Face detected with probability: 0.999808
Face detected with probability: 0.999441
Face detected with probability: 0.999981
Face detected with probability: 0.999994
Face detected with probability: 0.998808
Face detected with probability: 0.999987
Face detected with probability: 0.999851
Face detected with probability: 0.999981
Face detected with probability: 1.000000
Face detected with probability: 0.999971
Face detected with probability: 1.000000
Face detected with probability: 0.999998
Face detected with probability: 0.999988
Face detected with probability: 0.999963
Face detected with probability: 1.000000
Face detected with probability: 0.999954
Face detected wi

In [17]:
# pass the aligned faces through the resnet to get the FaceNet embeddings
# (note: this rebinds `aligned` from a list of face tensors to one stacked batch tensor)
aligned = torch.stack(aligned).to(device)
# inference only: disable autograd so no computation graph is kept for the whole batch
with torch.no_grad():
    embeddings = resnet(aligned).cpu()

In [18]:
# calculate the distance between the two images of each pair as the Euclidean (L2)
# norm of the difference of their embeddings; rows 2*i and 2*i+1 form pair i
if embeddings.shape[0] % 2 != 0:
    raise ValueError(
        'expected an even number of embeddings, got {}'.format(embeddings.shape[0])
    )
# vectorized over all pairs: the number of pairs follows the data instead of a
# hard-coded count, and the result is kept as a float64 numpy array
dists = (embeddings[0::2] - embeddings[1::2]).norm(p=2, dim=1).numpy().astype(np.float64)

In [19]:
# show the pairwise embedding distances computed for the test pairs
print(dists)

[0.55457389 1.33937073 0.75492823 0.46458155 1.14128816 1.22111797
 0.92487389 0.71835679 1.46324587 1.54312849 1.30589306 1.51131403
 0.62057084 0.48127607 1.48006308 1.31388748 0.70081174 0.68515295
 1.55126536 1.56341124 0.80053246 0.70108575 1.35165989 0.39693546
 1.3551861  1.30024803 0.65000135 1.28141761 1.3888309  1.46833003
 1.50147951 1.47529566 1.50615156 1.39507055 0.70599973 0.82728398
 0.77297646 1.31199348 1.55231595 1.24618661 1.31329799 0.73118144
 0.40046084 1.33507752 1.27266645 0.65234393 1.42402971 1.52530968
 1.35693979 0.78036028 0.95639145 1.51770079 1.40628767 0.59773433
 1.20556617 1.1832422  0.58490819 0.79516321 0.68039733 1.54548228
 1.45559764 1.29632294 1.51602709 1.42896986 0.74144036 1.52114892
 1.5076499  1.49484074 0.57134444 0.62918085 1.56477547 0.65062499
 0.59039557 1.3236165  1.43723488 1.30871046 0.53728682 1.46880507
 1.4165225  1.30310917 1.35830569 1.4758122  0.77532077 0.8473925
 0.64656967 1.22914016 1.49078965 1.46112847 0.63909763 0.49784

In [20]:
# a pair is labelled 1 (same person) when its distance is below 1, otherwise 0
preds = np.less(dists, 1).astype(int)
print(preds)

[1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1
 0 0 0 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0
 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1
 0 1 1 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 0
 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 0 0 1
 0 1 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 1 0
 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0
 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0
 1 0 0 1 1 1 1 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0
 0 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 0 1 1 0 1
 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 1]


In [22]:
# write the predictions to a txt file: a header line followed by one
# "<id>, <label>" row per prediction
# the with-block guarantees the file is flushed and closed once all rows are written
with open('testing.txt', 'w') as f:
    f.write('id, label\n')
    # iterate over the predictions themselves rather than a hard-coded count
    for i, pred in enumerate(preds):
        f.write('{}, {}\n'.format(i, pred))