### Assigning race labels to faces from the RAF-DB dataset


In [None]:
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import requests
from PIL import Image
from io import BytesIO
import torch
from torchvision import models

In [None]:
# get raf-db dataset from Kaggle
!kaggle datasets download -d shuvoalok/raf-db-dataset
!unzip raf-db-dataset.zip -d /content/raf-db

Dataset URL: https://www.kaggle.com/datasets/shuvoalok/raf-db-dataset
License(s): other
raf-db-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  raf-db-dataset.zip
replace /content/raf-db/DATASET/test/1/test_0002_aligned.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
# Root folders of the two RAF-DB splits extracted above.
train_dir = '/content/raf-db/DATASET/train'
test_dir = '/content/raf-db/DATASET/test'

# One sub-folder per emotion class, indexed from 1-7.
# os.listdir returns entries in arbitrary order and also lists stray files,
# so keep only directories and sort them to make every run reproducible.
train_emotions = sorted(
    name for name in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, name))
)
test_emotions = sorted(
    name for name in os.listdir(test_dir)
    if os.path.isdir(os.path.join(test_dir, name))
)

def load_images_from_directory(directory, emotions, target_size=(64, 64)):
    """Load every image stored under ``directory/<emotion>/``.

    Args:
        directory: Root folder of one dataset split; it must contain one
            sub-folder for each entry of ``emotions``.
        emotions: Sub-folder names. Each name is converted with ``int()`` and
            used as the label of every image found in that folder.
        target_size: ``(height, width)`` each image is resized to while
            loading. Defaults to ``(64, 64)``.

    Returns:
        A tuple ``(images, labels, filenames)`` where ``images`` is an array
        of the pixel data divided by 255.0, ``labels`` is an array of the
        integer folder names, and ``filenames`` is a list of the bare file
        names, all three in the same order.
    """
    images = []
    labels = []
    filenames = []

    for emotion in emotions:
        emotion_folder = os.path.join(directory, emotion)
        # os.listdir makes no ordering guarantee; sorting keeps the row order
        # (and therefore anything indexed by position) stable across runs.
        for img_name in sorted(os.listdir(emotion_folder)):
            img_path = os.path.join(emotion_folder, img_name)
            img = tf.keras.preprocessing.image.load_img(img_path, target_size=target_size)
            img_array = tf.keras.preprocessing.image.img_to_array(img) / 255.0
            images.append(img_array)
            labels.append(int(emotion))
            filenames.append(img_name)

    return np.array(images), np.array(labels), filenames

# Load both RAF-DB splits: pixel arrays, emotion ids and image file names.
train_split = load_images_from_directory(train_dir, train_emotions)
test_split = load_images_from_directory(test_dir, test_emotions)

X_train, y_train, filenames_train = train_split
X_test, y_test, filenames_test = test_split

### Match RAF-DB face embeddings against FairFace reference faces to assign race labels to the RAF-DB dataset

In [None]:
# gdown is used below to download files from Google Drive by file id.
!pip install gdown

# Fetch the two ResNet-34 checkpoint files that are loaded later in the
# notebook (saved as res34_fair_align_multi_7 / _multi_4).
# NOTE(review): newer gdown releases may no longer accept the `--id` flag and
# expect the file id as a positional argument — confirm against the installed version.
!gdown --id 1kXdAsqT8YiNYIMm8p5vQUvNFwhBbT4vQ -O res34_fair_align_multi_7_20190809.pt
!gdown --id 113QMzQzkBDmYMs9LwzvD-jxEZdBQ5J4X -O res34_fair_align_multi_4_20190809.pt

Downloading...
From: https://drive.google.com/uc?id=1kXdAsqT8YiNYIMm8p5vQUvNFwhBbT4vQ
To: /content/res34_fair_align_multi_7_20190809.pt
100% 85.3M/85.3M [00:00<00:00, 112MB/s]
Downloading...
From: https://drive.google.com/uc?id=113QMzQzkBDmYMs9LwzvD-jxEZdBQ5J4X
To: /content/res34_fair_align_multi_4_20190809.pt
100% 85.3M/85.3M [00:00<00:00, 269MB/s]


In [None]:
# NOTE(review): presumably works around a sympy/torch version conflict in the
# runtime — confirm this upgrade is still required.
!pip install --upgrade sympy




In [None]:
# Example faces from the FairFace repository, one per race label; they serve
# as the reference images for the embedding comparison.
_FAIRFACE_FACES_ROOT = 'https://raw.githubusercontent.com/dchen236/FairFace/master/detected_faces'

detected_faces_urls = [
    f'{_FAIRFACE_FACES_ROOT}/race_{race_name}_face0.jpg'
    for race_name in ('Asian', 'Black', 'Latino', 'Middle_Eastern', 'White')
]

def _race_label_from_url(img_url):
    """Return the label embedded in a ``race_<Label>_face<N>.<ext>`` file name."""
    # Work on the file name only; splitting the whole URL on '_' would also
    # split on underscores that appear in the host or directory part.
    stem, _ext = os.path.splitext(os.path.basename(img_url))
    _prefix, sep, remainder = stem.partition('_')
    if not sep or not remainder:
        return 'Unknown'
    # Drop the trailing "_face<N>" so multi-word labels such as
    # "Middle_Eastern" survive intact without a per-label special case.
    label, sep, tail = remainder.rpartition('_')
    if sep and label and tail.startswith('face'):
        return label
    return remainder


def load_fairface_data(image_urls, image_size=(64, 64), timeout=30):
    """Download reference face images and derive a race label for each one.

    Args:
        image_urls: Iterable of image URLs whose file names follow the
            ``race_<Label>_face<N>.jpg`` pattern.
        image_size: ``(width, height)`` each image is resized to.
            Defaults to ``(64, 64)``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A tuple ``(images, labels)``: an array of RGB pixel data divided by
        255.0 and an array of label strings, in the order of ``image_urls``.
        A file name without an underscore yields the label ``'Unknown'``.

    Raises:
        requests.HTTPError: If a URL answers with an error status.
    """
    fairface_images = []
    race_labels = []

    for img_url in image_urls:
        response = requests.get(img_url, timeout=timeout)
        # Fail with a clear HTTP error instead of letting PIL choke on the
        # body of an error page.
        response.raise_for_status()

        img = Image.open(BytesIO(response.content)).convert("RGB")
        img = img.resize(image_size)
        fairface_images.append(np.array(img) / 255.0)
        race_labels.append(_race_label_from_url(img_url))

    return np.array(fairface_images), np.array(race_labels)

# Download the FairFace reference faces and integer-encode their labels.
X_fairface, y_fairface = load_fairface_data(detected_faces_urls)
label_encoder = LabelEncoder()
y_fairface_encoded = label_encoder.fit_transform(y_fairface)

# ResNet34 backbones for the two checkpoints (do not retrain).
model_7 = models.resnet34(pretrained=False, num_classes=7)
model_4 = models.resnet34(pretrained=False, num_classes=4)

# Checkpoint weights, mapped to the CPU so this cell also runs without a GPU.
pretrained_dict_7 = torch.load('res34_fair_align_multi_7_20190809.pt', map_location=torch.device('cpu'))
pretrained_dict_4 = torch.load('res34_fair_align_multi_4_20190809.pt', map_location=torch.device('cpu'))

# The saved classifier head is not used: only the backbone is needed to
# produce embeddings, so its weights are dropped before loading.
pretrained_dict_7.pop('fc.weight', None)
pretrained_dict_7.pop('fc.bias', None)
pretrained_dict_4.pop('fc.weight', None)
pretrained_dict_4.pop('fc.bias', None)

# Expose the pooled backbone features as the model output. Putting a freshly
# initialised Linear layer here instead would turn every "embedding" into a
# random, unseeded projection that changes from run to run.
model_7.fc = torch.nn.Identity()
model_4.fc = torch.nn.Identity()

# With the head removed on both sides the key sets must match exactly, so load
# strictly: a checkpoint that fails to populate the backbone raises here
# instead of silently leaving random weights in place.
model_7.load_state_dict(pretrained_dict_7, strict=True)
model_4.load_state_dict(pretrained_dict_4, strict=True)

# NOTE(review): images are fed to these models at 64x64 and scaled to [0, 1];
# confirm this matches the preprocessing the checkpoints were trained with.
model_7.eval()
model_4.eval()



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
# Sanity check: display the race labels parsed from the FairFace file names.
y_fairface

array(['Asian', 'Black', 'Latino', 'Middle_Eastern', 'White'],
      dtype='<U14')

In [None]:
def extract_fairface_embeddings(fairface_images, model):
    """Run each reference image through ``model`` and collect the outputs.

    Args:
        fairface_images: Iterable of channel-last images (height, width,
            channels); each one is fed to the model as a batch of one.
        model: A ``torch.nn.Module``. It is used as-is, so the caller is
            responsible for putting it in eval mode.

    Returns:
        ``np.ndarray`` stacking the per-image model outputs; each entry keeps
        its leading batch dimension of 1, giving shape ``(N, 1, ...)``.
    """
    # Feed inputs on whatever device the model currently lives on, so the
    # function keeps working after the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    fairface_embeddings = []
    with torch.no_grad():
        for img in fairface_images:
            # change shape to (batch, channel, height, width)
            img_tensor = torch.tensor(img).float().unsqueeze(0).permute(0, 3, 1, 2)
            embedding = model(img_tensor.to(model_device))
            # .cpu() is required before .numpy() when the model runs on CUDA.
            fairface_embeddings.append(embedding.cpu().numpy())
    return np.array(fairface_embeddings)

# Reference embeddings: one model_7 output per FairFace example face.
fairface_embeddings = extract_fairface_embeddings(X_fairface, model_7)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the network to that device and keep it in inference mode.
model_7 = model_7.to(device).eval()

def extract_raf_embeddings(raf_images):
    """Embed RAF-DB images with the module-level ``model_7`` on ``device``.

    Each channel-last image is sent through the model as a batch of one, with
    gradient tracking disabled. The per-image outputs are stacked row-wise
    into a single (N, D) array.
    """
    model_7.eval()

    def _embed(image):
        # (height, width, channel) -> (1, channel, height, width)
        batch = torch.tensor(image).float().unsqueeze(0).permute(0, 3, 1, 2)
        return model_7(batch.to(device)).cpu().numpy()

    with torch.no_grad():
        per_image_outputs = [_embed(image) for image in raf_images]

    return np.vstack(per_image_outputs)

# Embed the training split first, then the test split.
raf_embeddings_train, raf_embeddings_test = (
    extract_raf_embeddings(split_images) for split_images in (X_train, X_test)
)

In [None]:
def assign_race_label(raf_embedding, fairface_embeddings, fairface_labels):
    """Return the label of the reference embedding most similar to ``raf_embedding``.

    Both inputs are flattened to one row per embedding, compared by cosine
    similarity, and the entry of ``fairface_labels`` at the position of the
    highest score is returned.
    """
    query_row = raf_embedding.reshape(1, -1)
    reference_rows = fairface_embeddings.reshape(len(fairface_embeddings), -1)
    scores = cosine_similarity(query_row, reference_rows)
    return fairface_labels[scores.argmax()]

# Map each RAF-DB file name to the label of its most similar FairFace
# reference face, separately for the training and test splits.
raf_race_labels_train = {
    image_name: assign_race_label(image_embedding, fairface_embeddings, y_fairface)
    for image_name, image_embedding in zip(filenames_train, raf_embeddings_train)
}

raf_race_labels_test = {
    image_name: assign_race_label(image_embedding, fairface_embeddings, y_fairface)
    for image_name, image_embedding in zip(filenames_test, raf_embeddings_test)
}

In [None]:
# Persist the file-name -> race-label dictionaries for both RAF-DB splits.
# NOTE: np.save pickles non-array objects such as these dicts, so reading them
# back needs np.load(path, allow_pickle=True).item().
for output_path, label_map in (
    ("/content/raf_race_labels_train.npy", raf_race_labels_train),
    ("/content/raf_race_labels_test.npy", raf_race_labels_test),
):
    np.save(output_path, label_map)

# Print the assigned labels for inspection (no metric is computed here).
for heading, label_map in (
    ("Training Data Race Labels:", raf_race_labels_train),
    ("Test Data Race Labels:", raf_race_labels_test),
):
    print(heading, label_map)

Training Data Race Labels: {'train_03512_aligned.jpg': 'White', 'train_02584_aligned.jpg': 'White', 'train_07696_aligned.jpg': 'White', 'train_06477_aligned.jpg': 'Black', 'train_00054_aligned.jpg': 'White', 'train_04387_aligned.jpg': 'Asian', 'train_02227_aligned.jpg': 'White', 'train_04822_aligned.jpg': 'White', 'train_03149_aligned.jpg': 'Middle_Eastern', 'train_01592_aligned.jpg': 'White', 'train_08935_aligned.jpg': 'White', 'train_00792_aligned.jpg': 'Asian', 'train_02738_aligned.jpg': 'Middle_Eastern', 'train_05544_aligned.jpg': 'White', 'train_06812_aligned.jpg': 'White', 'train_05242_aligned.jpg': 'White', 'train_00571_aligned.jpg': 'Latino', 'train_01618_aligned.jpg': 'Asian', 'train_03959_aligned.jpg': 'Black', 'train_05156_aligned.jpg': 'White', 'train_03183_aligned.jpg': 'White', 'train_05352_aligned.jpg': 'White', 'train_00549_aligned.jpg': 'Asian', 'train_00055_aligned.jpg': 'Black', 'train_07535_aligned.jpg': 'White', 'train_05578_aligned.jpg': 'Latino', 'train_02412_ali