In [1]:
import os
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import OrderedDict
from PIL import Image
from mtcnn.mtcnn import MTCNN
import matplotlib.patches as patches
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split
import shutil
from shutil import unpack_archive
from subprocess import check_output

2023-11-29 20:57:36.528646: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-29 20:57:36.619747: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
!nvidia-smi

Wed Nov 29 20:57:39 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:1A:00.0 Off |                  N/A |
| 31%   34C    P8               5W / 250W |      1MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:1B:00.0 Off |  

In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  4


## Exploratory Data Analysis

In [4]:
# Set Data Path
DATA_PATH = '../data/lfw-dataset/lfw-deepfunneled/lfw-deepfunneled'


### Given Meta Data Information

In [5]:
# Get meta data
lfw_allnames = pd.read_csv("../data/lfw-dataset/lfw_allnames.csv")
matchpairsDevTest = pd.read_csv("../data/lfw-dataset/matchpairsDevTest.csv")
matchpairsDevTrain = pd.read_csv("../data/lfw-dataset/matchpairsDevTrain.csv")
mismatchpairsDevTest = pd.read_csv("../data/lfw-dataset/mismatchpairsDevTest.csv")
mismatchpairsDevTrain = pd.read_csv("../data/lfw-dataset/mismatchpairsDevTrain.csv")


- pairs.csv: Contains randomly generated splits for 10-fold cross validation specifically for pairs. Use this for the image restricted configuration when forming training sets (refer to readme). There are 10 total sets; 5 sets contain 300 matched pairs, the other 5 sets contain 300 mismatched pairs.

- people.csv: Contains randomly generated splits for 10-fold cross validation specifically for individual faces. Use this for the unrestricted configuration when forming training sets (refer to readme). There are 10 total sets, each with a different amount of people; Set 1: 601. Set 2: 555. Set 3: 552. Set 4: 560. Set 5: 567. Set 6: 527. Set 7: 597. Set 8: 601. Set 9: 580. Set 10: 609.

- matchpairsDevTest.csv: Use this testing set if you decide to go with the pairs configuration. Contains 500 matched pairs of faces for testing set.

- matchpairsDevTrain.csv: Use this training set if you decide to go with the pairs configuration. Contains 1100 matched pairs of faces for training set.

- mismatchpairsDevTest.csv: Use this testing set if you decide to go with the pairs configuration. Contains 500 mismatched pairs of faces for testing set.

- mismatchpairsDevTrain.csv: Use this training set f you decide to go with the pairs configuration. Contains 1100 mismatched pairs of faces for training set.

- peopleDevTest.csv: Use this testing test if you decide to go with the people configuration. Contains 1711 people and 3708 images.

- peopleDevTrain.csv: Use this training set if you decide to go with the people configuration. Contains 4038 people and 9525 images.

In [6]:
pairs = pd.read_csv("../data/lfw-dataset/pairs.csv")
# tidy pairs data: 
pairs = pairs.rename(columns ={'name': 'name1', 'Unnamed: 3': 'name2'})
matched_pairs = pairs[pairs["name2"].isnull()].drop("name2",axis=1)
mismatched_pairs = pairs[pairs["name2"].notnull()]
people = pd.read_csv("../data/lfw-dataset/people.csv")
# remove null values
people = people[people.name.notnull()]
peopleDevTest = pd.read_csv("../data/lfw-dataset/peopleDevTest.csv")

- **lfwallnames.csv:** Contains all names of each face in the dataset along with number of images each face has.

- **lfwreadme.csv:** Comprehensive readme file found on the original database. If there is any information you are missing here or are looking for additional resources you will probably find it in this file. It explains how each .csv file comes into play when forming training and testing models, as well as column metadata information for figuring out what the .csv is talking about. The original website also gives recommendations on training/testing splits and comparison benchmarks.


In [7]:
print("Number of unique celbrities: ",len(lfw_allnames))

Number of unique celbrities:  5749


In [8]:
print("Celebrities with multiple images: ", sum(lfw_allnames.images > 1))

Celebrities with multiple images:  1680


In [9]:
print(" Most photographed celebrities: ", lfw_allnames.sort_values(by="images",ascending=False).head(10))

 Most photographed celebrities:                     name  images
1871      George_W_Bush     530
1047       Colin_Powell     236
5458         Tony_Blair     144
1404    Donald_Rumsfeld     121
1892  Gerhard_Schroeder     109
373        Ariel_Sharon      77
2175        Hugo_Chavez      71
2941  Junichiro_Koizumi      60
2468      Jean_Chretien      55
2682      John_Ashcroft      53


In [10]:
def load_image(name, number, data_path=DATA_PATH):
    """
    Load an image from the dataset.
    :param name: Name of the person.
    :param number: Image number for the person.
    :param data_path: Base directory of the LFW dataset.
    :return: The loaded image.
    """
    filename = f"{name}_{number:04d}.jpg"
    filepath = os.path.join(data_path, name, filename)
    image = cv2.imread(filepath)
    return image


In [11]:
def preprocess_image(image, target_size=(224, 224)):
    """
    Preprocess the image by resizing and normalizing.
    :param image: The image to preprocess.
    :param target_size: The target size of the image.
    :return: Preprocessed image.
    """
    image = cv2.resize(image, target_size)
    image = image / 255.0  # Normalize pixel values
    return image

In [12]:
def create_dataset(data_path, allnames, target_size=(224, 224)):
    """
    Create a dataset of images and labels.
    :param data_path: Base directory of the LFW dataset.
    :param allnames: DataFrame containing names and number of images.
    :param target_size: Target size for each image.
    :return: List of preprocessed images and labels.
    """
    images = []
    labels = []

    for index, row in allnames.iterrows():
        name = row['name']
        for i in range(1, row['images'] + 1):
            image = load_image(name, i, data_path)
            image = preprocess_image(image, target_size)
            images.append(image)
            labels.append(name)

    return np.array(images), np.array(labels)

# Create the dataset
images, labels = create_dataset(DATA_PATH, lfw_allnames)


In [13]:
images.shape

(13233, 224, 224, 3)

In [14]:
labels.shape

(13233,)

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

def build_model(input_shape=(224, 224, 3)):
    """
    Build a simple CNN model for classification.
    :param input_shape: Shape of the input images.
    :return: Compiled Keras model.
    """
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D(2, 2),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(2, 2),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D(2, 2),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(len(np.unique(labels)), activation='softmax')  # Assuming classification
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

model = build_model()


2023-11-29 20:58:15.323332: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-29 20:58:15.794205: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9803 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:1a:00.0, compute capability: 7.5
2023-11-29 20:58:15.794990: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 9803 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:1b:00.0, compute capability: 7.5
2023-11-29 20:58:15.795693: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Creat

In [16]:
from sklearn.model_selection import train_test_split


# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy}, Test loss: {test_loss}")


Epoch 1/10


2023-11-29 20:58:30.515589: W tensorflow/core/framework/op_kernel.cc:1807] OP_REQUIRES failed at cast_op.cc:121 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node 'Cast_1' defined at (most recent call last):
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/ipykernel/__main__.py", line 5, in <module>
      app.launch_new_instance()
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/traitlets/config/application.py", line 1077, in launch_instance
      app.start()
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 737, in start
      self.io_loop.start()
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
      handle._run()
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 524, in dispatch_queue
      await self.process_one()
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in process_one
      await dispatch(*args)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 418, in dispatch_shell
      await result
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 758, in execute_request
      reply_content = await reply_content
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 426, in do_execute
      res = shell.run_cell(
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3048, in run_cell
      result = self._run_cell(
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3103, in _run_cell
      result = runner(coro)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3308, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3490, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/scratch/local/16765705/ipykernel_2664338/4106594929.py", line 11, in <module>
      history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/engine/training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/engine/training.py", line 1028, in train_step
      return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/engine/training.py", line 1122, in compute_metrics
      self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 605, in update_state
      metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/utils/metrics_utils.py", line 77, in decorated
      update_op = update_state_fn(*args, **kwargs)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 140, in update_state_fn
      return ag_update_state(*args, **kwargs)
    File "/blue/egn4951/jmiclat/professionalme/conda/envs/profme/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 676, in update_state
      y_true = tf.cast(y_true, self._dtype)
Node: 'Cast_1'
2 root error(s) found.
  (0) UNIMPLEMENTED:  Cast string to float is not supported
	 [[{{node Cast_1}}]]
  (1) CANCELLED:  Function was cancelled before it was started
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1328]