# Extract faces from CelebA_Spoof

In [1]:
from glob import glob
import cv2
import os
from tqdm import tqdm
from pathlib import Path
import shutil
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Raw string: the Windows path contains backslashes that are not valid escape
# sequences (\K, \W, \*, ...), which newer Python versions warn about.
# Pattern: <Data>\train\<dir>\<dir>\<file>.jpg
train_images = glob(r'C:\KhoiNXM\Workspace\Learning\Master Thesis\Dataset\CelebA_Spoof\Data\train\*\*\*.jpg')
len(train_images)

494405

In [3]:
# Raw string: the Windows path contains backslashes that are not valid escape
# sequences (\K, \W, \*, ...), which newer Python versions warn about.
# Pattern: <Data>\test\<dir>\<dir>\<file>.png
test_images = glob(r'C:\KhoiNXM\Workspace\Learning\Master Thesis\Dataset\CelebA_Spoof\Data\test\*\*\*.png')
len(test_images)

67170

In [8]:
# Number of training images that make up 1% of the full list.
int(0.01 * len(train_images))

4944

In [9]:
def _read_bbox(bb_path):
    """Return the five floats ``x, y, w, h, score`` from the first line of *bb_path*."""
    with open(bb_path) as f:
        x, y, w, h, score = map(float, f.readline().split())
    return x, y, w, h, score


def _scale_and_clip_bbox(x, y, w, h, real_w, real_h, ref_size=224):
    """Convert a box expressed on a ``ref_size`` x ``ref_size`` grid to pixel corners.

    Returns ``(x1, y1, x2, y2)`` with every coordinate clipped to the image
    bounds ``[0, real_w]`` / ``[0, real_h]``.
    """
    # Resize bounding box according to image shape
    x1 = int(x * (real_w / ref_size))
    y1 = int(y * (real_h / ref_size))
    x2 = x1 + int(w * (real_w / ref_size))
    y2 = y1 + int(h * (real_h / ref_size))

    # Clip the bounding box to the image
    x1 = max(0, min(x1, real_w))
    y1 = max(0, min(y1, real_h))
    x2 = max(0, min(x2, real_w))
    y2 = max(0, min(y2, real_h))
    return x1, y1, x2, y2


# Crop the face from the first 1% of the training images and save each crop
# under the same relative path inside the project's data folder.
for image_path in tqdm(train_images[:int(len(train_images)*0.01)]):
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be decoded.
        print(f'Skipping unreadable image: {image_path}')
        continue
    real_h, real_w = image.shape[:2]

    # The box file sits next to the image as "<name>_BB.txt". Build it from the
    # path stem so that directory components are never rewritten.
    bb_path = os.path.splitext(image_path)[0] + '_BB.txt'
    x, y, w, h, score = _read_bbox(bb_path)

    x1, y1, x2, y2 = _scale_and_clip_bbox(x, y, w, h, real_w, real_h)
    face = image[y1: y2, x1: x2]
    if face.size == 0:
        # A degenerate box yields an empty crop that cannot be encoded.
        print(f'Skipping empty crop for {image_path}:', x1, y1, x2, y2)
        continue

    new_path = image_path.replace('C:\\KhoiNXM\\Workspace\\Learning\\Master Thesis\\Dataset\\', 
                                  'C:\\KhoiNXM\\Workspace\\Learning\\Master Thesis\\Dev\\face_recognition_system\\data\\')
    Path(os.path.dirname(new_path)).mkdir(parents=True, exist_ok=True)

    try:
        written = cv2.imwrite(new_path, face)
    except cv2.error:
        written = False
    if not written:
        # imwrite reports some failures through its return value only.
        print(f'Failed to write {new_path}:', x1, y1, x2, y2)

100%|██████████| 4944/4944 [00:38<00:00, 127.49it/s]


# Combine datasets

In [10]:
# Root of the cropped CelebA_Spoof training faces.
celeb_root = (
    'C:\\KhoiNXM\\Workspace\\Learning\\Master Thesis\\Dev'
    '\\face_recognition_system\\data\\CelebA_Spoof\\Data\\train'
)
# Root of the second face dataset (the "Detectedface" folder).
nua_root = (
    'C:\\KhoiNXM\\Workspace\\Learning\\Master Thesis'
    '\\Dataset\\Detectedface'
)

In [11]:
# Real ('live') and fake ('spoof') .jpg files, one folder level below celeb_root.
r_celeb, f_celeb = (
    glob(os.path.join(celeb_root, '*', kind, '*.jpg'))
    for kind in ('live', 'spoof')
)
len(r_celeb), len(f_celeb)

(1540, 3404)

In [12]:
# Real ('ClientFace') and fake ('ImposterFace') .jpg files under nua_root.
r_nua, f_nua = (
    glob(os.path.join(nua_root, kind, '*', '*.jpg'))
    for kind in ('ClientFace', 'ImposterFace')
)
len(r_nua), len(f_nua)

(5105, 7509)

In [13]:
# Output root for the combined dataset; keeps its trailing separator because
# later cells append sub-paths by plain string concatenation.
datasets_path = (
    'C:\\KhoiNXM\\Workspace\\Learning\\Master Thesis\\Dev'
    '\\face_recognition_system\\data\\DeePixBiS\\'
)

In [14]:
# Create <datasets_path><split>\<label> for every split and both labels.
for split in ('train', 'test', 'val'):
    for label in ('0', '1'):
        Path(datasets_path + split + '\\' + label).mkdir(parents=True, exist_ok=True)

In [15]:
# Merge both sources: real faces into r_data, fake faces into f_data.
r_data = [*r_celeb, *r_nua]
f_data = [*f_celeb, *f_nua]
len(r_data), len(f_data)

(6645, 10913)

In [16]:
# Label real samples 1 and fake samples 0.
X_r = r_data
y_r = [1] * len(X_r)
X_f = f_data
y_f = [0] * len(X_f)

In [17]:
# Full sample list and its parallel label list (real first, then fake).
X, y = X_r + X_f, y_r + y_f
len(X), len(y)

(17558, 17558)

In [18]:
# First hold out 10% of all samples as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1,
)
# ... then take 1/9 of the remaining 90% (about 10% of the total) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=1/9, random_state=1,
)

In [19]:
def _copy_split(paths, labels, split):
    """Copy every file in *paths* into the *split* folder of ``datasets_path``.

    Each file lands in ``<datasets_path>/<split>/<label>/<index><ext>``, where
    ``index`` is the file's position in *paths* and ``ext`` is its original
    suffix. Returns the number of files copied.
    """
    # Make sure the per-label destination folders exist before copying.
    for cls in set(labels):
        os.makedirs(os.path.join(datasets_path, split, str(cls)), exist_ok=True)

    copied = 0
    # Passing total lets tqdm show a percentage; zip() alone has no length.
    for idx, (path, cls) in enumerate(tqdm(zip(paths, labels), total=len(paths))):
        dest = os.path.join(datasets_path, split, str(cls), str(idx) + Path(path).suffix)
        shutil.copyfile(path, dest)
        copied = idx + 1
    return copied


# The counters keep their original meaning: files written per split.
train_idx = _copy_split(X_train, y_train, 'train')
val_idx = _copy_split(X_val, y_val, 'val')
test_idx = _copy_split(X_test, y_test, 'test')

14046it [00:16, 863.34it/s]
1756it [00:02, 783.15it/s]
1756it [00:02, 792.90it/s]


In [20]:
def _collect_local_rows(split):
    """Return ``[absolute_path, label]`` rows for one split of ``datasets_path``.

    Label folder ``0`` is listed first, then ``1``. File names are sorted
    because ``os.listdir`` returns entries in arbitrary order.
    """
    rows = []
    for label in (0, 1):
        folder = datasets_path + split + '\\' + str(label)
        rows.extend([folder + '\\' + file_name, label]
                    for file_name in sorted(os.listdir(folder)))
    return rows


train_list = _collect_local_rows('train')
val_list = _collect_local_rows('val')
test_list = _collect_local_rows('test')

train_df = pd.DataFrame(train_list, columns=['name', 'label'])
val_df = pd.DataFrame(val_list, columns=['name', 'label'])
test_df = pd.DataFrame(test_list, columns=['name', 'label'])

# to_csv does not create missing folders, so make sure the target exists.
csv_dir = 'C:\\KhoiNXM\\Workspace\\Learning\\Master Thesis\\Dev\\face_recognition_system\\data\\path_files\\'
Path(csv_dir).mkdir(parents=True, exist_ok=True)

train_df.to_csv(csv_dir + 'train_data_celeb_nuaa.csv', index=False)
val_df.to_csv(csv_dir + 'val_data_celeb_nuaa.csv', index=False)
test_df.to_csv(csv_dir + 'test_data_celeb_nuaa.csv', index=False)

In [21]:
# For Google Drive
# Files are enumerated from the local copy (datasets_path), but the CSV rows
# store the relative path rooted at datasets_path_drive.
datasets_path = 'C:\\KhoiNXM\\Workspace\\Learning\\Master Thesis\\Dev\\face_recognition_system\\data\\DeePixBiS\\'
datasets_path_drive = './data/DeePixBiS/'


def _collect_drive_rows(split):
    """Return ``[drive_relative_path, label]`` rows for one split.

    Label folder ``0`` is listed first, then ``1``. File names are sorted
    because ``os.listdir`` returns entries in arbitrary order.
    """
    rows = []
    for label in (0, 1):
        local_folder = f'{datasets_path}{split}/{label}'
        rows.extend([f'{datasets_path_drive}{split}/{label}/{file_name}', label]
                    for file_name in sorted(os.listdir(local_folder)))
    return rows


train_list = _collect_drive_rows('train')
val_list = _collect_drive_rows('val')
test_list = _collect_drive_rows('test')

train_df = pd.DataFrame(train_list, columns=['name', 'label'])
val_df = pd.DataFrame(val_list, columns=['name', 'label'])
test_df = pd.DataFrame(test_list, columns=['name', 'label'])

# to_csv does not create missing folders, so make sure the target exists.
csv_dir = 'C:\\KhoiNXM\\Workspace\\Learning\\Master Thesis\\Dev\\face_recognition_system\\data\\path_files\\'
Path(csv_dir).mkdir(parents=True, exist_ok=True)

train_df.to_csv(csv_dir + 'train_data_celeb_nuaa_drive.csv', index=False)
val_df.to_csv(csv_dir + 'val_data_celeb_nuaa_drive.csv', index=False)
test_df.to_csv(csv_dir + 'test_data_celeb_nuaa_drive.csv', index=False)