**NOTE: This notebook is written for the Google Colab platform, which provides free hardware acceleration. However it can also be run (possibly with minor modifications) as a standard Jupyter notebook, using a local GPU.** 



In [None]:
#@title -- Installation of Packages -- { display-mode: "form" }
import sys
import numpy as np
from packaging.version import parse as parse_version

# Pick the lapjv release based on the installed NumPy version.
# NOTE(review): presumably lapjv 1.3.12 is the last release that works with
# NumPy <= 1.20.0 -- confirm before changing the pin.
if parse_version(np.__version__) > parse_version('1.20.0'):
    !{sys.executable} -m pip install lapjv
else:
    !{sys.executable} -m pip install lapjv==1.3.12

# `{sys.executable} -m pip` installs into the interpreter that runs this kernel.
!{sys.executable} -m pip install umap-learn facenet-pytorch
!{sys.executable} -m pip install git+https://github.com/michalgregor/class_utils.git

# Install google-images-download (from the Joeclinton1 fork on GitHub)
!{sys.executable} -m pip install git+https://github.com/Joeclinton1/google-images-download.git

# download the ultralytics bing scraper
# from class_utils.download import download_file_maybe_extract
# download_file_maybe_extract(
#     "https://raw.githubusercontent.com/ultralytics/google-images-download/master/bing_scraper.py",
#     directory="."
# )

# install dependencies for ultralytics bing_scraper
# !{sys.executable} -m pip install tqdm selenium
# !apt update
# !apt install chromium-chromedriver
# !cp /usr/lib/chromium-browser/chromedriver /usr/bin
# sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')
# from selenium import webdriver
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--disable-dev-shm-usage')
# wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

In [None]:
#@title -- Import of Necessary Packages -- { display-mode: "form" }
from google_images_download.google_images_download import googleimagesdownload
from class_utils import make_montage, plot_bboxes
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from lapjv import lapjv
from umap import UMAP
from PIL import Image
import numpy as np
import glob
import os
import shutil

from facenet_pytorch import MTCNN, InceptionResnetV1, fixed_image_standardization
import torch

In [None]:
#@title -- Downloading Data -- { display-mode: "form" }
from class_utils.download import download_file_maybe_extract
# fetch the sample photo that the face-detection demo opens from the "data" directory
download_file_maybe_extract("https://www.dropbox.com/s/i5kxprnwuqg4s95/george_martin_example.jpg?dl=1", directory="data")

# also create a directory for storing any outputs
# (exist_ok=True makes this cell safe to re-run)
import os
os.makedirs("output", exist_ok=True)

In [None]:
#@title -- Auxiliary Functions -- { display-mode: "form" }

def download_images(keyword, limit, num_retries=5,
                    output_directory='downloads',
                    image_directory='.'):
    """Download images for one search keyword, retrying on empty results.

    Args:
        keyword: search phrase forwarded to the downloader as "keywords".
        limit: maximum number of images to request per attempt.
        num_retries: maximum number of download attempts.
        output_directory: forwarded as the downloader's "output_directory".
        image_directory: forwarded as the downloader's "image_directory".

    Returns:
        The list of image paths reported by the last attempt; an empty list
        if every attempt came back empty (or if num_retries is 0).
    """
    absolute_image_paths = []

    for _ in range(num_retries):
        response = googleimagesdownload()
        download = response.download(
            {"keywords": keyword, "limit": limit,
             "output_directory": output_directory,
             "image_directory": image_directory})

        # download[0] is a dict whose first value is the list of fetched
        # paths; fall back to an empty list if the dict itself is empty.
        absolute_image_paths = next(iter(download[0].values()), [])

        # stop as soon as an attempt actually produced some images
        if len(absolute_image_paths) > 0:
            break

    return absolute_image_paths

# def download_images(keyword, limit, num_retries=5,
#                     output_directory='downloads',
#                     image_directory='.', chromedriver=None):
#     if chromedriver is None:
#         chromedriver = '/usr/lib/chromium-browser/chromedriver'

#     command = ('python3 bing_scraper.py --search "{keyword:}" --limit 25 ' + 
#               '--download --chromedriver {chromedriver:} ' +
#               '--output_directory="{output_directory:}" ' +
#               '--image_directory="{image_directory:}" ' 
#     ).format(keyword=keyword, chromedriver=chromedriver,
#              output_directory=output_directory,
#              image_directory=image_directory)

#     !$command

def get_image_filenames(
    directory,
    image_exts = ['.jpg', '.jpeg', '.png', '.gif'],
    recursive = True
):
    """Collect the paths of image files found under a directory.

    Args:
        directory: directory to search.
        image_exts: file extensions (with the leading dot) to accept;
            matching is case-insensitive, so '.JPG' is accepted for '.jpg'.
        recursive: if True, search `directory` and all of its
            subdirectories; if False, only files directly in `directory`.

    Returns:
        List of matching file paths, in the order produced by glob.
    """
    allowed_exts = {ext.lower() for ext in image_exts}

    # "**" only spans directory levels when recursive=True; without it the
    # pattern would match files exactly one level down, so use "*" instead.
    pattern = os.path.join(directory, "**/*" if recursive else "*")

    return [
        fname
        for fname in glob.glob(pattern, recursive=recursive)
        if os.path.isfile(fname)
        and os.path.splitext(fname)[-1].lower() in allowed_exts
    ]

# def enforce_maxres(img, maxres):
#     width, height = img.width, img.height
    
#     if width > height:
#         if width > maxres:
#             width, height = maxres, maxres / width * height
#             img = img.resize((int(width), int(height)), resample=3)
#     else:
#         if height > maxres:
#             width, height = maxres / height * width, maxres
#             img = img.resize((int(width), int(height)), resample=3)
            
#     return img

def plot_clusters(face_extracted, clusts, labelIDs,
                  verbose=1, figsize=(10, 8),
                  show_title=True):
    """Draw one montage figure per cluster label and return the figures.

    For every label in `labelIDs`, up to 25 faces whose entry in `clusts`
    equals that label are sampled at random (without replacement) and
    arranged into a montage. Label -1 is titled "Unknown Faces".

    Args:
        face_extracted: indexable collection of face images.
        clusts: array of cluster labels, one per face.
        labelIDs: iterable of labels to plot.
        verbose: if truthy, print the label before plotting it.
        figsize: size passed to `plt.figure`.
        show_title: whether to put a title on each figure.

    Returns:
        List of the created matplotlib figures, in `labelIDs` order.
    """
    max_faces = 25
    figures = []

    for labelID in labelIDs:
        if verbose:
            print("Faces for face ID: {}".format(labelID))

        # indices of the faces in this cluster, then a random subset of them
        members = np.where(clusts == labelID)[0]
        n_shown = min(max_faces, len(members))
        chosen = np.random.choice(members, size=n_shown, replace=False)

        faces = np.asarray([face_extracted[j] for j in chosen])

        # fall back to a blank tile when the label has no faces at all
        montage = make_montage(faces, 5) if len(faces) else np.zeros((64, 64, 3))

        if labelID == -1:
            title = "Unknown Faces"
        else:
            title = "Face ID #{}".format(labelID)

        fig = plt.figure(figsize=figsize)
        plt.imshow(montage)
        plt.axis('off')
        if show_title:
            plt.title(title)

        figures.append(fig)

    return figures

def plot_faces(face_extracted, poses, w=0.08, h=0.08, ax=None):
    """Draw each face image centred on its 2-D position.

    Args:
        face_extracted: sequence of face images accepted by `imshow`.
        poses: array-like of (x, y) positions, indexed in step with
            `face_extracted`; it may contain more rows than there are faces.
        w: width of each drawn face, in data units.
        h: height of each drawn face, in data units.
        ax: axes to draw on; defaults to the current axes.
    """
    # honour an explicitly supplied axes instead of always using plt.gca()
    if ax is None:
        ax = plt.gca()

    poses = np.asarray(poses)

    for i, face in enumerate(face_extracted):
        pos = poses[i]
        ax.imshow(face, extent=[pos[0] - w/2, pos[0] + w/2,
                                pos[1] - h/2, pos[1] + h/2])

    # imshow resets the limits to the last image's extent, so widen them to
    # cover every position, with one face-size of margin on each side
    ax.set_xlim(np.min(poses[:, 0]) - w, np.max(poses[:, 0]) + w)
    ax.set_ylim(np.min(poses[:, 1]) - h, np.max(poses[:, 1]) + h)

## Human Face Clustering

The notebook shows a simple way to perform face clustering in Python.

---
### Task 1: Downloading the Images

Naturally, if we want to do face clustering, we will need some face images. We will therefore use the `googleimagesdownload` package to download a few images from the Google Images service – e.g. the photos of some famous celebrities.

**Add the names of 5 or 6 celebrities into the `keywords` list below. These will be used as keywords when searching for the photos.** 

---


In [None]:
keywords = [

    
    
    # XXXXXXXX
    
    
    
]

For every one of these keywords, we will now download a couple of images and store them in the `dataset` folder.



In [None]:
# make sure that the dataset directory is clean before we start
# (ignore_errors=True: a missing directory on the first run is not an error)
shutil.rmtree('dataset', ignore_errors=True)

# download 25 images for each keyword; everything is written under "dataset"
for k in keywords:
    download_images(keyword=k, limit=25, output_directory='dataset')

Should the image download fail for any reason, uncomment the cell below to download a precollected dataset.



In [None]:
# !wget -nc -O faceclust_dataset.zip https://www.dropbox.com/s/3mkdxof3r4rmmf2/faceclust_dataset.zip?dl=1
# !unzip -oq -d dataset faceclust_dataset.zip

### Extraction and Transformation of Faces

Given that the images we are using are downloaded directly from Google Images, they will contain not only human faces, but also entire figures and other objects. We will need to extract the faces somehow. 

Let us first select a device for PyTorch: either a GPU if it is available, or else a CPU. Then let's construct a pretrained network that is going to do the face extraction for us.



In [None]:
# use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Face detector / cropper.
# NOTE(review): per the facenet-pytorch docs, keep_all=True returns every
# detected face rather than only one, and post_process=False leaves the crops
# unnormalised -- confirm against the installed version.
mtcnn = MTCNN(
    min_face_size=64,
    device=device,
    keep_all=True,
    post_process=False
)

Now let's try out the network on a sample image. We will first open the image and run `mtcnn.detect` on it. What we get is a list containing a bounding box for each face and the list of their corresponding confidence scores (0 for no confidence, 1 for total confidence).

Finally we are also going to plot the bounding boxes onto the image to make sure that everything works correctly.



In [None]:
img = Image.open("data/george_martin_example.jpg")
# detect returns the bounding boxes together with one score per box
bboxes, probs = mtcnn.detect(img)

for i, (bbox, prob) in enumerate(zip(bboxes, probs)):
    print(f"Bounding box {i}: {bbox}; score: {prob}")

# draw the boxes over the photo and keep a copy of the figure in "output"
fig = plot_bboxes(img, bboxes)
fig.savefig("output/face_detection_cnn.jpg",
            bbox_inches="tight", pad_inches=0)

Our `mtcnn` object can also be used to extract the actual facial images from our overall image. Let's run `mtcnn.extract` and display the extracted faces in a grid.



In [None]:
# crop the detected faces; dividing by 255 presumably maps pixel values
# into [0, 1] for plotting -- assumes the crops are in [0, 255], TODO confirm
extracts = mtcnn.extract(img, list(bboxes), None) / 255.0
# permute moves axis 1 to the end (channels-last, presumably) before building the montage
face_montage = make_montage(extracts.permute(0, 2, 3, 1), 3)
plt.imshow(face_montage); plt.axis('off');

In a subsequent step, we will use a different deep neural network to transform our facial images into a new representation that will better express the similarities and differences between human faces than a raw-pixel representation could. The network has been pretrained as a classifier on a dataset with a large number of human faces (VGGFace2).

After pretraining the network, we strip away its top layer. As a result, the network transforms each input image into a 512-dimensional embedding vector. We are going to use these vectors to represent our facial images.



In [None]:
resnet = InceptionResnetV1(pretrained='vggface2', device=device).eval()

Next we retrieve a list of all image files present in the "dataset" folder. We are going to walk over them, extract the faces present in each of the images and store them in the `face_extracted` tensor. We store a numpy copy of the tensor into `face_extracted_img`: we will be using this to plot the faces. We then apply `fixed_image_standardization` to the `face_extracted` tensor to transform it into the format that the neural network expects.

Note that this cell is going to take a while to execute because some of the images will likely be quite high-resolution and we need to run each of them through the network.



In [None]:
img_paths = get_image_filenames('dataset')
face_extracted = []

# count from 1 so that the progress message ends at N/N
for i, img_path in enumerate(img_paths, start=1):
    print(f"Extracting from image {i}/{len(img_paths)}: '{img_path}'.")

    # Scraped files are sometimes truncated or not images at all; skip those
    # instead of aborting the whole run. The context manager releases the
    # file handle as soon as the pixels have been converted.
    try:
        with Image.open(img_path) as raw_img:
            img = raw_img.convert(mode="RGB")
    except OSError as err:
        print(f"Skipping unreadable image '{img_path}': {err}")
        continue

    with torch.no_grad():
        extracts = mtcnn(img)

    # mtcnn yields None when it has nothing to return for this image
    if extracts is not None:
        face_extracted.append(extracts)

# fail with a clear message rather than an opaque error from torch.vstack
if not face_extracted:
    raise RuntimeError(
        "No faces were extracted from the images in 'dataset'; "
        "check that the download step produced usable images."
    )

# stack the per-image batches of faces into one tensor
face_extracted = torch.vstack(face_extracted)
# channels-last copy scaled by 1/255, kept only for plotting
face_extracted_img = face_extracted.permute(0, 2, 3, 1).cpu().numpy() / 255.0
# standardize the pixel values for the embedding network and move to the device
face_extracted = fixed_image_standardization(face_extracted).to(device)

Having extracted all the faces, we will now be working with small images of a standardized size, so we are going to be able to batch them. This will make the following cell, where we run each image through the embedding network, run much faster than the previous one (especially when using a GPU).

Note that we are using `torch.no_grad` here: we will not be doing backpropagation so we do not need to build a computational graph in the forward pass – this saves us some more time.



In [None]:
batch_size = 64

# Run the faces through the embedding network one batch at a time.
# Slicing clamps at the end of the tensor, so the last batch may be shorter.
with torch.no_grad():
    face_embeddings = [
        resnet(face_extracted[start:start + batch_size])
        for start in range(0, len(face_extracted), batch_size)
    ]

# join the batches and move the result to a NumPy array on the CPU
face_embeddings = torch.vstack(face_embeddings).cpu().numpy()

Our dataset may contain faces of people whose names were not among our keywords (the original photos might have contained other persons). Some faces may be extracted incorrectly and there is a chance that some non-faces will be extracted by mistake as well. It will be interesting to see how the network extracting the representations will be able to cope with this.



---
### Task 2: Clustering

**Apply clustering to the `face_embeddings` array, e.g. using DBSCAN. Assign the resulting cluster IDs to the variable `clusts`. Note that you may need to tweak the `eps` hyperparameter to get good results.**

---


In [None]:


# apply the clustering


clusts =      # assign the cluster IDs to this variable




### Displaying the Results

Finally, let's visualize the faces belonging to the individual clusters. If some faces were not assigned to any cluster (label `-1`), they are shown first, under the title "Unknown Faces".



In [None]:
# distinct cluster labels; -1 is the "unknown faces" label and is not counted
labelIDs = np.unique(clusts)
numUniqueFaces = len(labelIDs[labelIDs > -1])
print(f"Number of unique faces: {numUniqueFaces}")
print(f"The photos were found using {len(keywords)} different keywords.")

In [None]:
# one montage figure per cluster label (verbose=0 suppresses the per-label print)
figs = plot_clusters(face_extracted_img, clusts, labelIDs, verbose=0)

# save each figure; the file index is the figure's position in `figs`,
# not the cluster label itself
for ifig, fig in enumerate(figs):
    fig.savefig("output/clust_{}.pdf".format(ifig),
                bbox_inches="tight", pad_inches=0)

---
### Task 3: Reducing Dimensionality using UMAP

**Use UMAP to reduce the dimensionality of the data in the `face_embeddings` array from 512 to 2 – so that it is possible to plot the data. It may be necessary to tweak arguments `min_dist` and `spread` to get a nice, readable figure (i.e. avoid excessive face overlap and such). Assign the results to an array named `embeds`.**

---


In [None]:



# apply dimensionality reduction




embeds =          # assign the reduced data to this variable





We normalize the reduced data into the range of [0, 1].



In [None]:
# Min-max scale each column, modifying `embeds` in place:
# shift so that every column starts at 0 ...
embeds -= embeds.min(axis=0)
# ... then divide by the column maximum so that every column ends at 1.
# NOTE(review): a column whose values are all equal would divide by zero here.
embeds /= embeds.max(axis=0)

We plot the faces on the embedding positions.



In [None]:
# draw every face at its normalised 2-D embedding position
plt.figure(figsize=(10, 8))
plot_faces(face_extracted_img, embeds)
plt.xlabel('$d_1$')
plt.ylabel('$d_2$')

# keep a copy of the figure alongside the other outputs
plt.savefig("output/faces_umap.pdf",
            bbox_inches="tight", pad_inches=0)

### Displaying the Faces on a Grid using the Jonker-Volgenant Algorithm

The visualization produced by UMAP displays the distances between face clusters and such. However, the images overlap to a considerable extent, which makes the figure less readable. We can therefore take an additional step and project all the images into a regular grid using the Jonker-Volgenant algorithm.



In [None]:
# side length of the smallest square grid with at least len(embeds) cells
sqrt_size = int(np.ceil(np.sqrt(len(embeds))))
size = sqrt_size * sqrt_size
# (size, 2) array of grid-cell coordinates, evenly spaced over the unit square
grid = np.dstack(np.meshgrid(np.linspace(0, 1, sqrt_size), np.linspace(0, 1, sqrt_size))).reshape(-1, 2)

# pad the points with all-zero rows up to `size`, so that the cost matrix
# below is square (size x size)
padded_embeds = np.zeros((size, embeds.shape[1]))
padded_embeds[:embeds.shape[0], :] = embeds

# cost_matrix[i, j] = squared Euclidean distance from grid cell i to point j
cost_matrix = cdist(grid, padded_embeds, "sqeuclidean").astype(np.float32)
# rescale so that the largest cost equals 100000
# NOTE(review): presumably for the numerical behaviour of the solver -- confirm
cost_matrix = cost_matrix * (100000 / cost_matrix.max())
# NOTE(review): col_as is used as "grid cell assigned to each point";
# confirm the return order against the installed lapjv version
row_as, col_as, _ = lapjv(cost_matrix)
# grid coordinates per (padded) point; the first len(embeds) rows are the faces
grid_jv = grid[col_as]

The new positions have been stored in array `grid_jv`: we will now use them for the plotting.



In [None]:
# draw every face at its assigned grid position; grid_jv also holds positions
# for the padding rows, but plot_faces only draws as many as there are faces
plt.figure(figsize=(12, 12))
plot_faces(face_extracted_img, grid_jv)
plt.axis('off')
plt.savefig("output/faces_grid.pdf",
            bbox_inches="tight", pad_inches=0)