# **Extração das imagens do .rec**

In [2]:
import os
import numpy as np
np.bool = np.bool_
import mxnet as mx
import cv2
import pickle
from tqdm.notebook import tqdm
import pandas as pd

In [6]:
rec_path = './data/CASIA/faces_webface_112x112/'
output_dir = './data/CASIA/'

In [7]:
import os
import pandas as pd
import cv2
import mxnet as mx
from tqdm import tqdm

def load_mx_rec(df, rec_path, save_path, write_img=True):
    if write_img:
        if not os.path.isdir(save_path + "/casia-faces"):
            os.makedirs(save_path + "/casia-faces")

    imgrec = mx.recordio.MXIndexedRecordIO(
        os.path.join(rec_path, 'train.idx'),
        os.path.join(rec_path, 'train.rec'), 'r')
    img_info = imgrec.read_idx(0)
    header, _ = mx.recordio.unpack(img_info)
    max_idx = int(header.label[0])

    file_path = os.path.join(save_path, "casia-faces")

    if not os.path.isdir(file_path):
        os.makedirs(file_path)

    data_list = []  # Lista para armazenar dados antes de escrever no CSV

    for idx in tqdm(range(1, max_idx), desc="Extracting images"):
        img_info = imgrec.read_idx(idx)
        header, img = mx.recordio.unpack_img(img_info)
        label = int(header.label)
        img_path = f"{label}_{idx}.jpg"

        if write_img and img_path in df['path'].values:
            cv2.imwrite(os.path.join(file_path, img_path), img)

        data_list.append([img_path, label])

    # Criar DataFrame e salvar em CSV
    new_df = pd.DataFrame(data_list, columns=['path', 'id'])
    new_df.to_csv(os.path.join(save_path, "casia_faces.csv"), index=False)

In [8]:
load_mx_rec(None, rec_path, output_dir, write_img=False) # Não escrever imagens, apenas obter CSV

Extracting images: 100%|██████████| 490623/490623 [00:37<00:00, 13168.30it/s]


# **Limpeza**

In [12]:
# Carregar DataFrame
df = pd.read_csv(os.path.join(output_dir, 'casia_faces.csv'))

# Filtros e seleção de amostras
df_clean = df.groupby('id').filter(lambda x: len(x) >= 128)
df_clean = df_clean.groupby('id').apply(lambda x: x.sample(128)).reset_index(drop=True)

print(f'Total de imagens no df: {df_clean.shape[0]:,}')
print(f"Total de identidades no df: {df_clean['id'].nunique()}")

Total de imagens no df: 81,280
Total de identidades no df: 635


  df_clean = df_clean.groupby('id').apply(lambda x: x.sample(128)).reset_index(drop=True)


In [13]:
load_mx_rec(df_clean, rec_path, output_dir, write_img=True) # Escreve imagens, somente com amostras selecionadas

Extracting images: 100%|██████████| 490623/490623 [10:43<00:00, 762.95it/s] 


In [14]:
qtd = len(os.listdir(os.path.join(output_dir, 'casia-faces')))
print(f'Total de imagens na pasta: {qtd:,}')

Total de imagens na pasta: 81,280


# **Separar em treino e teste**

In [15]:
# Selecionar 35 identidades para teste
test_ids = np.random.choice(df_clean['id'].unique(), 35, replace=False)
test_df = df_clean[df_clean['id'].isin(test_ids)]

# train_df é o resto
train_df = df_clean[~df_clean['id'].isin(test_ids)]

In [16]:
print(f"train_df: {train_df.shape[0]:,} imagens | {train_df['id'].nunique()} identidades")
print(f"test_df: {test_df.shape[0]:,} imagens | {test_df['id'].nunique()} identidades")

train_df: 76,800 imagens | 600 identidades
test_df: 4,480 imagens | 35 identidades


In [17]:
train_df.to_csv('./data/CASIA/casia_train.csv', index=False)
test_df.to_csv('./data/CASIA/casia_test.csv', index=False)