# Feature Engineering

In [1]:
import os
import shutil
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from PIL import Image
from torchvision import models, transforms

## Loading the Dataset

In [2]:
# "margin025" image archive and the folder it is unpacked into
zip_025 = '../data/fairface-img-margin025-trainval.zip'
extract_dir_025 = '../data/fairface_025'

# "margin125" image archive and the folder it is unpacked into
zip_125 = '../data/fairface-img-margin125-trainval.zip'
extract_dir_125 = '../data/fairface_125'

In [3]:
def extract_zip(zip_path, extract_to):
    """Unpack a zip archive into a directory, skipping work that is already done.

    The archive is first unpacked into a sibling "<name>.partial" folder, which
    is renamed to ``extract_to`` only after every member has been written. An
    interrupted run therefore never leaves a half-filled ``extract_to`` that a
    later run would mistake for a finished extraction.

    Args:
        zip_path: Location of the archive (str or Path).
        extract_to: Destination directory (str or Path). If it already exists,
            nothing is extracted.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    zip_path = Path(zip_path)
    extract_to = Path(extract_to)

    if extract_to.exists():
        print(f"{extract_to} already exists, skipping extraction.")
        return

    staging = extract_to.with_name(extract_to.name + '.partial')
    if staging.exists():
        # Left over from an extraction that was interrupted; start clean.
        shutil.rmtree(staging)

    print(f"Extracting {zip_path.name}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Created only once the archive opened successfully, so a missing or
        # corrupt zip does not leave an empty staging folder behind.
        staging.mkdir(parents=True, exist_ok=True)
        zip_ref.extractall(staging)
    # Publish the result in one step now that the extraction is complete.
    staging.rename(extract_to)
    print(f"Extracted to {extract_to}")

# Unpack each archive into its own folder (folders that already exist are skipped)
for archive_path, target_dir in ((zip_025, extract_dir_025), (zip_125, extract_dir_125)):
    extract_zip(archive_path, target_dir)

..\data\fairface_025 already exists, skipping extraction.
..\data\fairface_125 already exists, skipping extraction.


In [4]:
# Read the train and validation label tables, in that order
train_labels, val_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/fairface_label_train.csv',
        '../data/fairface_label_val.csv',
    )
)

## Preprocessing

In [5]:
# Stack the train and validation labels into one frame with a fresh 0..n-1 index
df = pd.concat((train_labels, val_labels), axis=0, ignore_index=True)
print(df.shape)
df.head()

(97698, 5)


Unnamed: 0,file,age,gender,race,service_test
0,train/1.jpg,50-59,Male,East Asian,True
1,train/2.jpg,30-39,Female,Indian,False
2,train/3.jpg,3-9,Female,Black,False
3,train/4.jpg,20-29,Female,Indian,True
4,train/5.jpg,20-29,Female,Indian,True


In [6]:
# Count missing entries in every column
print(df.isna().sum())

file            0
age             0
gender          0
race            0
service_test    0
dtype: int64


In [7]:
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() adds a stray "None" line to the output.
df.info()
# Distinct labels present in each categorical column
for col in ['age', 'gender', 'race']:
    print(f"{col}: {df[col].unique()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97698 entries, 0 to 97697
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   file          97698 non-null  object
 1   age           97698 non-null  object
 2   gender        97698 non-null  object
 3   race          97698 non-null  object
 4   service_test  97698 non-null  bool  
dtypes: bool(1), object(4)
memory usage: 3.1+ MB
None
age: ['50-59' '30-39' '3-9' '20-29' '40-49' '10-19' '60-69' '0-2'
 'more than 70']
gender: ['Male' 'Female']
race: ['East Asian' 'Indian' 'Black' 'White' 'Middle Eastern' 'Latino_Hispanic'
 'Southeast Asian']


## Feature Engineering

In [None]:
def brightness(img_path):
    """Return the mean grayscale pixel value of the image at ``img_path``.

    The image is converted to PIL mode 'L' (single-channel grayscale) and the
    mean is taken over all of its pixels.
    """
    # The context manager closes the file even if decoding fails part-way.
    with Image.open(img_path) as img:
        gray = img.convert('L')
    return np.array(gray).mean()

# Resolve each file against the configured extraction folder rather than repeating the literal path
df['brightness'] = df['file'].apply(lambda f: brightness(os.path.join(extract_dir_025, f)))

In [9]:
def contrast(img_path):
    """Return the standard deviation of grayscale pixel values for one image.

    The image at ``img_path`` is converted to PIL mode 'L' and the standard
    deviation (NumPy's default, population form) is taken over all pixels.
    """
    # The context manager closes the file even if decoding fails part-way.
    with Image.open(img_path) as img:
        gray = img.convert('L')
    return np.array(gray).std()

# Resolve each file against the configured extraction folder rather than repeating the literal path
df['contrast'] = df['file'].apply(lambda f: contrast(os.path.join(extract_dir_025, f)))

In [10]:
# Extract image sharpness from second-order intensity differences
def extract_sharpness(image):
    """Return a sharpness score for an image.

    The image is converted to grayscale, the second finite difference is taken
    along each axis with ``np.gradient``, and the score is the variance of
    ``|d2/drow2| + |d2/dcol2|`` over all pixels. The two terms are summed as
    absolute values, so this is a Laplacian-style measure rather than the
    variance of the signed Laplacian.

    Args:
        image: An opened PIL image (anything with a PIL-style ``convert``
            method whose result NumPy can turn into a 2-D array).

    Returns:
        The variance as a NumPy float.

    Raises:
        ValueError: From ``np.gradient`` if either image dimension is smaller
            than 2 pixels.
    """
    # Work in float64 explicitly so differences of 8-bit pixels can never wrap.
    gray = np.array(image.convert('L'), dtype=np.float64)
    # First derivatives along rows (axis 0) and columns (axis 1), computed once.
    d_rows, d_cols = np.gradient(gray)
    second_diff = np.abs(np.gradient(d_rows, axis=0)) + np.abs(np.gradient(d_cols, axis=1))
    return second_diff.var()

def _sharpness_from_file(img_path):
    """Open one image file, score it with extract_sharpness, and close the file."""
    with Image.open(img_path) as img:
        return extract_sharpness(img)

# Resolve each file against the configured extraction folder rather than repeating the literal path
df['sharpness'] = df['file'].apply(lambda f: _sharpness_from_file(os.path.join(extract_dir_025, f)))

In [11]:
# Extract saturation
def saturation(img_path):
    """Return the mean of the saturation channel of the image at ``img_path``.

    The image is converted to RGB and then to PIL's 'HSV' mode; channel index 1
    of the resulting array (the S channel) is averaged over all pixels.
    """
    # The context manager closes the file even if decoding fails part-way.
    with Image.open(img_path) as img:
        img_hsv = img.convert('RGB').convert('HSV')
    saturation_channel = np.array(img_hsv)[:, :, 1]
    return saturation_channel.mean()

# Resolve each file against the configured extraction folder rather than repeating the literal path
df['saturation'] = df['file'].apply(lambda f: saturation(os.path.join(extract_dir_025, f)))
df.head()

KeyboardInterrupt: 

In [None]:
# Extract hue variance
def hue_variance(img_path):
    """Return the variance of the hue channel of the image at ``img_path``.

    The image is converted to RGB and then to PIL's 'HSV' mode; channel index 0
    of the resulting array (the H channel) is passed to ``var()``.

    NOTE(review): hue is an angular quantity, so a plain variance treats values
    at the two ends of the range as far apart even though they are similar
    colours. Consider a circular statistic if this feature matters downstream.
    """
    # The context manager closes the file even if decoding fails part-way.
    with Image.open(img_path) as img:
        img_hsv = img.convert('RGB').convert('HSV')
    hue_channel = np.array(img_hsv)[:, :, 0]
    return hue_channel.var()

# Resolve each file against the configured extraction folder rather than repeating the literal path
df['hue_variance'] = df['file'].apply(lambda f: hue_variance(os.path.join(extract_dir_025, f)))
df.head()

Unnamed: 0,file,age,gender,race,service_test,brightness,contrast,sharpness,saturation,hue_variance
0,train/1.jpg,50-59,Male,East Asian,True,34.598334,14.607521,1.61345,133.303133,5861.253941
1,train/2.jpg,30-39,Female,Indian,False,123.987843,27.774537,2.137719,89.485591,649.939604
2,train/3.jpg,3-9,Female,Black,False,144.705138,43.936752,14.794022,77.438776,2981.514741
3,train/4.jpg,20-29,Female,Indian,True,85.136998,71.030456,9.986235,86.136181,5501.222295
4,train/5.jpg,20-29,Female,Indian,True,132.422413,45.385137,10.499576,125.124442,2881.653201


In [None]:
# Extract image embeddings for the face crops using a pre-trained ResNet-50.
# (torch / torchvision are imported in the notebook's import cell.)
# NOTE(review): `pretrained=True` is the legacy spelling; recent torchvision
# releases prefer the `weights=` argument. Kept as-is because the installed
# torchvision version is not known from this notebook.
resnet = models.resnet50(pretrained=True)
# Replace the final classification layer with a pass-through so the network
# returns its pooled feature vector rather than 1000 ImageNet class scores,
# which are not an embedding.
resnet.fc = torch.nn.Identity()
resnet.eval()  # inference mode: fixes batch-norm statistics

# Standard ImageNet preprocessing: resize, centre-crop to 224x224, scale to
# [0, 1] tensors, then normalise with the ImageNet channel means / stds.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def get_embedding(img_path):
    """Run one image through ``resnet`` and return the output as a flat array.

    The image at ``img_path`` is converted to RGB, passed through
    ``preprocess``, given a leading batch dimension of 1 and fed to ``resnet``
    with gradient tracking disabled.

    Returns:
        A 1-D NumPy array holding the model output for this image.
    """
    # The context manager closes the file even if decoding fails part-way.
    with Image.open(img_path) as img:
        rgb = img.convert('RGB')
    batch_t = preprocess(rgb).unsqueeze(0)  # shape (1, C, H, W): batch of one
    with torch.no_grad():
        embedding = resnet(batch_t)
    return embedding.numpy().flatten()

# Resolve each file against the configured extraction folder rather than repeating the literal path
df['embedding'] = df['file'].apply(lambda f: get_embedding(os.path.join(extract_dir_025, f)))

# Display the first few rows with new features
df.head()



Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\Nigel/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:07<00:00, 14.5MB/s]


KeyboardInterrupt: 