# Information
The images were obtained from https://github.com/telecombcn-dl/2018-dlai-team4  
This is a subset of the original LFW dataset, called "optimized for accuracy"  
The subset is based only on celebrities with more than 20 images in LFW  
A preprocessing step is applied to extract only the face from the whole images  

# Mounting Google Drive with Colab (optional)
This cell also checks whether a GPU is attached to the runtime. A GPU will not speed up this notebook, and a GPU runtime is not recommended because Colab limits GPU usage.

In [None]:
from google.colab import drive
drive.mount('/content/drive') # mount drive

# Check to see if using a GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') < 0:
  print('Connected to a GPU.')
  print('This project does not utilize a GPU, feel free to use a standard CPU runtime to avoid any GPU runtime limits.')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing libraries

In [None]:
from os import listdir  # for loading all images in the dataset

from PIL import Image   # For loading images
import numpy as np      # Numpy arrays, more memory-efficient than Python lists, useful built-in array functions (min/max of multidimensional array, normalization)
import pandas as pd     # For loading CSVs
import pickle           # For exporting data
import natsort          # For sorting filenames
from tensorflow.keras.utils import to_categorical   # Converting array of integers into array of one-hot vectors

# Setting paths/directories/variables

In [None]:
# Root of the raw dataset: holds the CSVs plus one subdirectory per split
DATA_DIR = 'drive/My Drive/Colab Notebooks/LFW/'

# Per-split image directories (each path keeps its trailing slash because
# filenames are appended to it directly)
TEST_DIR = f'{DATA_DIR}Test/'
TRAIN_DIR = f'{DATA_DIR}Train/'
VAL_DIR = f'{DATA_DIR}Validation/'

# CSV listing every class
NAME_LIST_PATH = f'{DATA_DIR}name_list.csv'

# Where the processed dataset is written
PICKLED_DATA_PATH = 'drive/My Drive/Colab Notebooks/data.pickled'

# Size every image is resized to before being stored
IMG_SHAPE = (100, 100)

# Getting the number of classes
Used for converting from names to one-hot vectors

In [None]:
# Load the class list as a 2-D array with one row per class
name_list = pd.read_csv(NAME_LIST_PATH).values
print(name_list)

# One row per class, so the row count is the class count
NUM_CLASSES = name_list.shape[0]
print(f'Number of classes: {NUM_CLASSES}')

[[0 'Gloria_Macapagal_Arroyo']
 [1 'Jennifer_Capriati']
 [2 'Laura_Bush']
 [3 'Winona_Ryder']
 [4 'Tiger_Woods']
 [5 'Hugo_Chavez']
 [6 'John_Negroponte']
 [7 'George_W_Bush']
 [8 'Roh_Moo-hyun']
 [9 'Paul_Bremer']
 [10 'George_Robertson']
 [11 'Tom_Daschle']
 [12 'Ricardo_Lagos']
 [13 'Jennifer_Lopez']
 [14 'Jose_Maria_Aznar']
 [15 'Silvio_Berlusconi']
 [16 'Vicente_Fox']
 [17 'Jennifer_Aniston']
 [18 'Gerhard_Schroeder']
 [19 'David_Beckham']
 [20 'Kofi_Annan']
 [21 'Igor_Ivanov']
 [22 'Jiang_Zemin']
 [23 'Mahmoud_Abbas']
 [24 'Pete_Sampras']
 [25 'Guillermo_Coria']
 [26 'Donald_Rumsfeld']
 [27 'Megawati_Sukarnoputri']
 [28 'Jeremy_Greenstock']
 [29 'Junichiro_Koizumi']
 [30 'Jack_Straw']
 [31 'Rudolph_Giuliani']
 [32 'Jacques_Chirac']
 [33 'Saddam_Hussein']
 [34 'John_Ashcroft']
 [35 'Lindsay_Davenport']
 [36 'Naomi_Watts']
 [37 'Lleyton_Hewitt']
 [38 'Hamid_Karzai']
 [39 'Tom_Ridge']
 [40 'Recep_Tayyip_Erdogan']
 [41 'Tony_Blair']
 [42 'Hans_Blix']
 [43 'Jean_Chretien']
 [44 'Nesto

# Loading images
And normalizing them.

In [None]:
# Split order is fixed and relied on below: index 0 = test, 1 = train, 2 = validation
paths = [TEST_DIR, TRAIN_DIR, VAL_DIR]
imgs = []

for split_dir in paths:
  split_images = []
  # Natural sort so the files are visited in a stable, human-style numeric order
  for image_name in natsort.natsorted(listdir(split_dir)):
    # The context manager closes the file handle once the pixels have been copied out
    with Image.open(split_dir + image_name) as picture:
      resized = picture.resize(IMG_SHAPE)
      split_images.append(np.asarray(resized))
  # Collapse the per-image arrays of this split into a single numpy array
  imgs.append(np.asarray(split_images))

## Normalizing images

In [None]:
# Pixel range of the raw test images, before any scaling
max_imgs = imgs[0].max()  # Largest value anywhere in the multi-dimensional array
min_imgs = imgs[0].min()  # Smallest value anywhere in the multi-dimensional array

print(f'Test images')
print(f'Min: {min_imgs}')
print(f'Max: {max_imgs}')
print(f'Average: {np.average(imgs[0])}\n')

# Images are stored as arrays of pixel values in the range [0-255]:
#   a grayscale image is 2-D (height, width); a color image is 3-D (height, width, channels),
#   with 3 channels for RGB and 4 for RGBA.
# The simplest normalization is to divide every value by 255, giving floats in [0.0-1.0].
# Normalizing input can make training a neural network much faster, and it is especially
# helpful when a network takes inputs with different ranges (pixels [0-255], age [0-~120]).
#
# Each split is scaled on its own and `imgs` stays a plain list of arrays. The splits hold
# different numbers of images, so wrapping the whole list in one np.asarray() would build a
# ragged array, which NumPy warns about in older releases and rejects in newer ones.
#
# NOTE: this reassigns `imgs` from itself, so re-running this cell without re-running the
# loading cell first would scale the images a second time.
imgs = [split / 255.0 for split in imgs]

# Pixel range of the test images after scaling
max_imgs = imgs[0].max()
min_imgs = imgs[0].min()

print(f'Normalized test images:')
print(f'Min: {min_imgs}')
print(f'Max: {max_imgs}')
print(f'Average: {np.average(imgs[0])}')

Test images
Min: 0
Max: 255
Average: 121.01817683508104



  return array(a, dtype, copy=False, order=order)


Normalized test images:
Min: 0.0
Max: 1.0
Average: 0.4745810856277703


# Loading labels
And converting them to one-hot vectors

In [None]:
# Each split (test, train, validation) has a CSV holding one integer label per image.
# A separate CSV (expected at NAME_LIST_PATH) maps each integer to a person's name.
#
# One-hot encoding turns a categorical label (color, name, street, ...) into a Boolean vector.
# Example with two classes, red = 0 and blue = 1 (the numbering is arbitrary):
#   labels  [0, 1, 0, 0]
#   one-hot [[1, 0], [0, 1], [1, 0], [1, 0]]    each vector reads [is red, is blue]
# The encoded set is a Boolean matrix: row = image index, column = class index.

# Read the three label CSVs as unsigned-integer arrays
label_csvs = ('test_labels.csv', 'train_labels.csv', 'val_labels.csv')
test_labels, train_labels, val_labels = (
  pd.read_csv(f'{DATA_DIR}{csv_name}', header=None).values.astype(np.uint)
  for csv_name in label_csvs
)

# Encode every split against the same number of classes so the vectors line up
test_labels_onehot, train_labels_onehot, val_labels_onehot = (
  to_categorical(split_labels, num_classes=NUM_CLASSES)
  for split_labels in (test_labels, train_labels, val_labels)
)

# Integer labels: (number of training images x 1)
print(f'Training labels shape: {train_labels.shape}')
# One-hot labels: (number of training images x number of classes)
print(f'Training labels one-hot shape: {train_labels_onehot.shape}')

Training labels shape: (3043, 1)
Training labels one-hot shape: (3043, 62)


# Printing shapes

In [None]:
# (heading, images, one-hot labels) for each split; imgs is ordered test, train, validation
shape_report = (
  ('Testing', imgs[0], test_labels_onehot),
  ('Training', imgs[1], train_labels_onehot),
  ('Validation', imgs[2], val_labels_onehot),
)

# Print the image and label shapes of each split so they can be compared side by side
for split_title, split_images, split_targets in shape_report:
  print(f'{split_title} data shapes')
  print(f'X: {split_images.shape}')
  print(f'y: {split_targets.shape}\n')

Testing data shapes
X: (1049, 100, 100, 3)
y: (1049, 62)

Training data shapes
X: (3043, 100, 100, 3)
y: (3043, 62)

Validation data shapes
X: (1021, 100, 100, 3)
y: (1021, 62)



# Exporting data as a pickled object

In [None]:
# Tuple layout: (train X, train y, test X, test y, validation X, validation y).
# Anything that unpickles this file has to unpack the tuple in the same order.
dataset = (
  imgs[1], train_labels_onehot,
  imgs[0], test_labels_onehot,
  imgs[2], val_labels_onehot,
)

# Write the whole tuple as one pickled object in binary mode
with open(PICKLED_DATA_PATH, 'wb') as outfile:
  pickle.dump(dataset, outfile)