# **Download do Dataset**

In [2]:
!kaggle datasets download -d jessicali9530/lfw-dataset -p ../data/

Dataset URL: https://www.kaggle.com/datasets/jessicali9530/lfw-dataset
License(s): other
Downloading lfw-dataset.zip to ../data
... resuming from 8388608 bytes (109507047 bytes left) ...
100%|███████████████████████████████████████▊| 112M/112M [00:21<00:00, 4.40MB/s]
100%|████████████████████████████████████████| 112M/112M [00:21<00:00, 5.18MB/s]


In [3]:
!unzip -q ../data/lfw-dataset.zip -d ../data/

In [4]:
!rm ../data/lfw-dataset.zip
!rm ../data/*.csv

# **Detecção das faces**

In [5]:
from facenet_pytorch import MTCNN, extract_face
import torch
from PIL import Image
import os
from tqdm.notebook import tqdm as tqdm
import pandas as pd
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device name: {torch.cuda.get_device_name(0)}')

RAW_LFW_PATH = '../data/lfw-deepfunneled/lfw-deepfunneled/'
FACES_PATH = '../data/lfw-faces/'

Device name: NVIDIA GeForce RTX 3050 Laptop GPU


In [6]:
mtcnn = MTCNN(keep_all=True, device=device)

In [7]:
def extract_faces(raw_images_path: str, faces_images_path: str) -> pd.DataFrame:
    if not os.path.exists(faces_images_path):
        os.makedirs(faces_images_path)
        
    subfolders = os.listdir(raw_images_path)
    
    nobox = []
    df = []
    
    for id, folder in enumerate(tqdm(subfolders, desc='Extraindo rostos', unit='img')):
        imgs = os.listdir(f'{raw_images_path}/{folder}')
        
        for file in imgs:
            save_path = os.path.join(faces_images_path, file.split('/')[-1])
            save_path_df = os.path.join('./data/lfw-faces/', file.split('/')[-1])
            df.append((id, save_path_df))
            
            img = Image.open(os.path.join(raw_images_path + folder, file))
        
            boxes, _ = mtcnn.detect(img)
            
            if boxes is not None:
                extract_face(img, boxes[0], save_path=save_path)
            else:
                nobox.append(save_path_df)
                
    df = pd.DataFrame(df, columns=['id', 'path'])
    
    return nobox, df

In [8]:
nobox, df = extract_faces(RAW_LFW_PATH, FACES_PATH)

print(f'Número de imagens sem face detectada: {len(nobox)}')
print(f'Número de imagens com face detectada: {len(df)}')
print(f'Número de identidades: {df["id"].nunique()}')

# Removendo imagens sem face detectada do df
df = df[~df['path'].isin(nobox)]

# Salvando o csv
df.to_csv('../data/lfw_faces.csv', index=False)

Extraindo rostos:   0%|          | 0/5749 [00:00<?, ?img/s]

Número de imagens sem face detectada: 1
Número de imagens com face detectada: 13233
Número de identidades: 5749


In [9]:
df.head()

Unnamed: 0,id,path
0,0,./data/lfw-faces/Koichiro_Matsuura_0001.jpg
1,1,./data/lfw-faces/Mark_Hanson_0001.jpg
2,2,./data/lfw-faces/Gregorio_Honasan_0001.jpg
3,3,./data/lfw-faces/Shanna_Zolman_0001.jpg
4,4,./data/lfw-faces/Edward_Seaga_0001.jpg


# **Split de treino e teste**

In [10]:
!rm -rf ../data/lfw-deepfunneled/

In [11]:
ids_count = df['id'].value_counts()
valid_ids = ids_count[ids_count >= 5].index

shuffled_ids = valid_ids.to_numpy()
np.random.seed(42)
np.random.shuffle(shuffled_ids)

test_ids = shuffled_ids[:49]

# Criar os dataframes de treino e teste
test_df = df[df['id'].isin(test_ids)]
train_df = df[~df['id'].isin(test_ids)]

print(f"Identidades no conjunto de teste: {test_df['id'].nunique()}")
print(f"Imagens no conjunto de teste: {len(test_df)}\n")

print(f"Identidades no conjunto de treino: {train_df['id'].nunique()}")
print(f"Imagens no conjunto de treino: {len(train_df)}")

Identidades no conjunto de teste: 49
Imagens no conjunto de teste: 567

Identidades no conjunto de treino: 5700
Imagens no conjunto de treino: 12665


In [12]:
train_df.to_csv('../data/lfw_train.csv', index=False)
test_df.to_csv('../data/lfw_test.csv', index=False)