In [1]:
import cv2
import json
import numpy as np
import os
import onnx
import PIL
import shutil
import tensorflow as tf
import warnings

from datetime import datetime
from mtcnn import MTCNN
from onnx_tf.backend import prepare
from PIL import Image
from retinaface import RetinaFace
from tqdm import tqdm

# Root of the source dataset. The folder name suggests aligned MS1M face crops
# (NOTE(review): the original comment said "VGGFace2" - confirm which dataset this is).
# The sampling cell below reads its 'train', 'test' and 'dev' sub-folders.
orig_dir = "E:\\ms1m_align_112"

# Root of the balanced subset that the selected images are copied into.
new_dir = "E:/balanced_ms1m"

# Disabled: path of a text file meant to record the selected file paths.
# txt_path = "E:/ms1m_files.txt"

# Accumulator for selected file paths (not referenced by the cells visible here).
selected_files = []

# Create the destination root. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(new_dir, exist_ok=True)

# Disabled: MTCNN face detector.
# detector = MTCNN()

In [5]:
# Number of images sampled per identity for each split of the balanced subset.
train_num_images = 20
test_num_images = 5
val_num_images = 5

# Fixed seed + sorted directory listings make the selection reproducible, so
# re-running this cell copies the same files instead of piling a different
# random sample on top of the previous one.
SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SAMPLE_SEED)

# (sub-folder of orig_dir to read, sub-folder of new_dir to write, images to sample).
# Note that the source 'dev' split is stored as 'val' in the balanced subset.
split_plan = [
    ('train', 'train', train_num_images),
    ('test', 'test', test_num_images),
    ('dev', 'val', val_num_images),
]


def _list_entries(directory):
    """Return the sorted entry names of `directory`, or [] if it is not a directory."""
    if not os.path.isdir(directory):
        return []
    return sorted(os.listdir(directory))


for person_dir in tqdm(sorted(os.listdir(os.path.join(orig_dir, 'train')))):
    # An identity missing from any split (or a stray file in 'train') yields an
    # empty list here and is skipped by the size check below instead of raising.
    available = {
        src_split: _list_entries(os.path.join(orig_dir, src_split, person_dir))
        for src_split, _, _ in split_plan
    }

    # Keep only identities that can supply the full quota for every split.
    if any(len(available[src_split]) < wanted for src_split, _, wanted in split_plan):
        continue

    for src_split, dst_split, wanted in split_plan:
        dst_dir = os.path.join(new_dir, dst_split, person_dir)
        os.makedirs(dst_dir, exist_ok=True)
        # Sample without replacement so no image is picked twice.
        for image in sample_rng.choice(available[src_split], size=wanted, replace=False):
            src_path = os.path.join(orig_dir, src_split, person_dir, image)
            shutil.copy(src_path, os.path.join(dst_dir, image))

100%|██████████| 85742/85742 [6:05:42<00:00,  3.91it/s]   


In [4]:
# Disabled in the original notebook, kept for reference:
# all_ids = np.load('./id_files/glint_all_ids.npz')['res']
# warnings.filterwarnings('ignore')
# onnx_model = version_converter.convert_version(onnx_model, 11)

# Read the serialized ONNX graph from disk, then hand it to the onnx-tf backend
# so the model can be executed through TensorFlow via `tf_rep.run(...)`.
model_file = 'F:/test/onnx_tensorflow/model.onnx'
onnx_model = onnx.load(model_file)
tf_rep = prepare(onnx_model)

In [5]:
# Root of the balanced subset whose images are indexed below.
new_dir = "E:/balanced_ms1m"

# Maps identity folder name -> {'train': [...], 'test': [...], 'val': [...]},
# each list holding the full path of every file in that identity's split folder.
all_ids_dict = dict()

# The identities are taken from the 'train' split; each one is expected to have a
# matching folder under 'test' and 'val' as well (os.listdir raises otherwise).
# Listings are sorted so the key order - and therefore any positional slice of
# the keys taken later - does not depend on filesystem enumeration order.
# The loop variable is named `identity` so the builtin `id` is not shadowed
# for the rest of the kernel session.
for identity in tqdm(sorted(os.listdir(os.path.join(new_dir, 'train')))):
    all_ids_dict[identity] = {
        split: [
            os.path.join(new_dir, split, identity, file_name)
            for file_name in sorted(os.listdir(os.path.join(new_dir, split, identity)))
        ]
        for split in ('train', 'test', 'val')
    }

In [15]:
# Persist the identity -> file-path index as indented JSON at ./ms1m/all_id_files.json,
# creating the ./ms1m folder first if it is missing.
index_dir = os.path.join('.', 'ms1m')
os.makedirs(index_dir, exist_ok=True)
index_path = os.path.join(index_dir, 'all_id_files.json')
with open(index_path, 'w') as index_file:
    json.dump(all_ids_dict, index_file, indent=4)

In [8]:
# Reload the identity -> file-path index from ./ms1m/all_id_files.json.
# The context manager closes the file handle, which a bare json.load(open(...)) leaks.
with open(os.path.join('.', 'ms1m', 'all_id_files.json')) as fp:
    all_ids_dict = json.load(fp)

# Identity names in JSON order; the embedding cell slices this list by position.
keys = list(all_ids_dict.keys())
len(keys)

In [10]:
# Root of the balanced subset; embeddings are written to <dataset_path>/embeddings/<split>/.
dataset_path = 'E:/balanced_ms1m'

# Splits in the order their images are stacked into the per-identity batch.
embedding_splits = ('train', 'test', 'val')

# Position in `keys` at which processing starts.
# NOTE(review): 24498 presumably marks where an earlier, interrupted run stopped -
# confirm, and set to 0 to process every identity.
resume_from = 24498

for split in embedding_splits:
    os.makedirs(os.path.join(dataset_path, 'embeddings', split), exist_ok=True)


def _load_model_input(img_path):
    """Load one image and return it as a float32 array of shape (3, 112, 112).

    The image is forced to RGB (so grayscale/RGBA files still yield three
    channels), resized to 112x112 with nearest-neighbour sampling, scaled with
    (pixel - 127.5) / 128 and transposed from height-width-channel to
    channel-height-width order.
    """
    # The context manager releases the file handle once the pixels are copied out.
    with Image.open(img_path) as img:
        pixels = np.array(img.convert('RGB'))
    resized = tf.image.resize(pixels, (112, 112), method="nearest")
    normalized = (tf.cast(resized, tf.float32) - 127.5) / 128.
    return tf.transpose(normalized, perm=[2, 0, 1]).numpy()


for d in tqdm(keys[resume_from:]):
    split_files = {split: all_ids_dict[d][split] for split in embedding_splits}

    # One batch per identity: all train images, then test, then val.
    batch = np.stack([
        _load_model_input(img_path)
        for split in embedding_splits
        for img_path in split_files[split]
    ])
    # NOTE(review): `_0` is taken to be the model's (first) output holding one
    # embedding per batch row - confirm against the ONNX graph.
    id_emb = tf_rep.run(batch)._0

    # Slice the embeddings back into splits using the actual number of files per
    # split, so an identity without exactly 20/5/5 images is still saved correctly.
    start = 0
    for split in embedding_splits:
        stop = start + len(split_files[split])
        np.savez_compressed(
            os.path.join(dataset_path, 'embeddings', split, d + '.npz'),
            res=id_emb[start:stop],
        )
        start = stop

100%|██████████| 44639/44639 [77:16:49<00:00,  6.23s/it]   
