In [1]:
import os
import shutil
import csv
import pickle
import random
from PIL import Image
import numpy as np


In [2]:
# Source dataset that the conversion reads from
original_dataset_path = "dataset"

# MS1MV2 outputs: image root, label CSV and pair file
arcface_dataset_path = "MS1MV2/images"
arcface_csv = "MS1MV2/arcface_labels.csv"
arcface_bin = "MS1MV2/arcface_pairs.bin"

# MS1MV3 outputs: image root, label CSV and pair file
retinaface_dataset_path = "MS1MV3/aligned_images"
retinaface_csv = "MS1MV3/retinaface_labels.csv"
retinaface_bin = "MS1MV3/retinaface_pairs.bin"

# Create both image roots up front (no-op when they already exist)
for _image_root in (arcface_dataset_path, retinaface_dataset_path):
    os.makedirs(_image_root, exist_ok=True)

In [3]:
# Function to copy images into the target folder layout and write the label CSV and pair file
def convert_to_target_format(src_path, dest_path, csv_file, bin_file, seed=None):
    """Copy an identity-per-folder dataset and write its label CSV and pair file.

    Every sub-directory of ``src_path`` is treated as one identity. Its regular
    files are copied into ``dest_path/<identity name zero-padded to at least 5
    characters>``, and the identity name converted with ``int()`` is recorded
    as the label of each copied file.

    Args:
        src_path: Directory containing one sub-directory per identity.
        dest_path: Root of the copied dataset; identity folders are created
            beneath it as needed.
        csv_file: Output CSV with an ``image_path,label`` header followed by
            one row per copied image. Its parent directory is created if
            missing.
        bin_file: Output pickle holding a shuffled list of
            ``(image_path_a, image_path_b, same_label)`` tuples, one for every
            unordered pair of copied images. Its parent directory is created
            if missing.
        seed: Optional seed for the pair shuffle. ``None`` (the default) keeps
            using the module-level ``random`` state.

    Raises:
        ValueError: If an identity folder that contains files has a name that
            ``int()`` cannot parse.

    Note:
        All n*(n-1)/2 pairs are built in memory before being pickled, so the
        cost grows quadratically with the number of images.
    """
    label_map = []
    image_paths = []

    # os.listdir returns entries in arbitrary order; sorting keeps the CSV rows
    # and the pair enumeration reproducible across runs and file systems.
    for identity in sorted(os.listdir(src_path)):
        identity_path = os.path.join(src_path, identity)
        if not os.path.isdir(identity_path):
            continue

        # Create identity folder in target path with zero-padded identity name
        target_identity_folder = os.path.join(dest_path, identity.zfill(5))
        os.makedirs(target_identity_folder, exist_ok=True)

        for image_name in sorted(os.listdir(identity_path)):
            src_image_path = os.path.join(identity_path, image_name)
            # Nested directories (e.g. .ipynb_checkpoints) cannot be copied
            # with copy2 and are not images, so they are skipped.
            if not os.path.isfile(src_image_path):
                continue
            dest_image_path = os.path.join(target_identity_folder, image_name)
            shutil.copy2(src_image_path, dest_image_path)

            # Recorded path keeps the original "<folder>/<name>" spelling so
            # the CSV and pair contents stay unchanged for existing consumers.
            image_paths.append(f"{target_identity_folder}/{image_name}")
            label_map.append(int(identity))

    # Make sure the output locations exist; dirname is "" for a bare filename,
    # and os.makedirs("") would raise, hence the guard.
    for output_file in (csv_file, bin_file):
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Write CSV file with image paths and labels
    with open(csv_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["image_path", "label"])
        writer.writerows(zip(image_paths, label_map))

    # Generate every unordered image pair, flagged with whether labels match
    image_count = len(image_paths)
    pairs = [
        (image_paths[i], image_paths[j], label_map[i] == label_map[j])
        for i in range(image_count)
        for j in range(i + 1, image_count)
    ]

    # Randomly shuffle pairs to mix positive and negative samples
    if seed is None:
        random.shuffle(pairs)
    else:
        random.Random(seed).shuffle(pairs)

    # Save pairs to a binary file
    with open(bin_file, "wb") as f:
        pickle.dump(pairs, f)

    print(f"Dataset conversion complete for {dest_path}.")


In [4]:
# Run the conversion once per target layout, in this order:
#   1. MS1MV2 (MS1M-ArcFace)
#   2. MS1MV3 (MS1M-RetinaFace)
# Each run copies the images and generates that layout's CSV and bin files.
for _dest_root, _labels_csv, _pairs_bin in (
    (arcface_dataset_path, arcface_csv, arcface_bin),
    (retinaface_dataset_path, retinaface_csv, retinaface_bin),
):
    convert_to_target_format(original_dataset_path, _dest_root, _labels_csv, _pairs_bin)

Dataset conversion complete for MS1MV2/images.
Dataset conversion complete for MS1MV3/aligned_images.


In [8]:
import os
import shutil
import csv
import pickle
import random

# Source dataset that the conversion reads from
original_dataset_path = "dataset"

# MS1MV2 (ArcFace) outputs: image root, label CSV and the bin files to write
arcface_dataset_path = "datasets/faces_emore_112x112_folders"
arcface_csv = "datasets/faces_emore/arcface_labels.csv"
arcface_bin_files = [
    "datasets/faces_emore/lfw.bin",
    "datasets/faces_emore/cfp_fp.bin",
    "datasets/faces_emore/agedb_30.bin",
]

# MS1MV3 (RetinaFace) outputs: image root, label CSV and the bin files to write
retinaface_dataset_path = "datasets/ms1m-retinaface-t1_112x112_folders"
retinaface_csv = "datasets/ms1m-retinaface-t1/retinaface_labels.csv"
retinaface_bin_files = [
    "datasets/ms1m-retinaface-t1/lfw.bin",
    "datasets/ms1m-retinaface-t1/cfp_fp.bin",
    "datasets/ms1m-retinaface-t1/agedb_30.bin",
]

# Create both image roots up front (no-op when they already exist)
for _image_root in (arcface_dataset_path, retinaface_dataset_path):
    os.makedirs(_image_root, exist_ok=True)

# Function to process the dataset, create CSV and bin files
def convert_to_target_format(src_path, dest_path, csv_file, bin_files, seed=None):
    """Copy an identity-per-folder dataset and write its label CSV and pair files.

    Every sub-directory of ``src_path`` is treated as one identity. Its regular
    files are copied into ``dest_path/<identity name zero-padded to at least 5
    characters>``, and the identity name converted with ``int()`` is recorded
    as the label of each copied file.

    Args:
        src_path: Directory containing one sub-directory per identity.
        dest_path: Root of the copied dataset; identity folders are created
            beneath it as needed.
        csv_file: Output CSV with an ``image_path,label`` header followed by
            one row per copied image. Its parent directory is created if
            missing.
        bin_files: Iterable of output paths; the same pickled pair list is
            written to each one. A single path (``str`` or ``os.PathLike``) is
            also accepted and treated as a one-element list. Parent
            directories are created if missing.
        seed: Optional seed for the pair shuffle. ``None`` (the default) keeps
            using the module-level ``random`` state.

    Raises:
        ValueError: If an identity folder that contains files has a name that
            ``int()`` cannot parse.

    Note:
        All n*(n-1)/2 pairs are built in memory before being pickled, so the
        cost grows quadratically with the number of images.
        NOTE(review): every file in ``bin_files`` receives identical content (a
        pickled list of path tuples); confirm that downstream loaders of these
        ``.bin`` files expect this layout.
    """
    # A bare string would otherwise be iterated character by character below.
    if isinstance(bin_files, (str, os.PathLike)):
        bin_files = [bin_files]

    label_map = []
    image_paths = []

    # os.listdir returns entries in arbitrary order; sorting keeps the CSV rows
    # and the pair enumeration reproducible across runs and file systems.
    for identity in sorted(os.listdir(src_path)):
        identity_path = os.path.join(src_path, identity)
        if not os.path.isdir(identity_path):
            continue

        # Create identity folder in target path with zero-padded identity name
        target_identity_folder = os.path.join(dest_path, identity.zfill(5))
        os.makedirs(target_identity_folder, exist_ok=True)

        for image_name in sorted(os.listdir(identity_path)):
            src_image_path = os.path.join(identity_path, image_name)
            # Nested directories (e.g. .ipynb_checkpoints) cannot be copied
            # with copy2 and are not images, so they are skipped.
            if not os.path.isfile(src_image_path):
                continue
            dest_image_path = os.path.join(target_identity_folder, image_name)
            shutil.copy2(src_image_path, dest_image_path)

            # Recorded path keeps the original "<folder>/<name>" spelling so
            # the CSV and pair contents stay unchanged for existing consumers.
            image_paths.append(f"{target_identity_folder}/{image_name}")
            label_map.append(int(identity))

    # Ensure the directory for the CSV file exists. dirname is "" for a bare
    # filename, and os.makedirs("") raises FileNotFoundError, hence the guard.
    csv_directory = os.path.dirname(csv_file)
    if csv_directory:
        os.makedirs(csv_directory, exist_ok=True)

    # Write CSV with image paths and labels
    with open(csv_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["image_path", "label"])
        writer.writerows(zip(image_paths, label_map))

    # Generate every unordered image pair, flagged with whether labels match
    image_count = len(image_paths)
    pairs = [
        (image_paths[i], image_paths[j], label_map[i] == label_map[j])
        for i in range(image_count)
        for j in range(i + 1, image_count)
    ]

    # Randomly shuffle pairs for variety
    if seed is None:
        random.shuffle(pairs)
    else:
        random.Random(seed).shuffle(pairs)

    # Save pairs to each binary file specified
    for bin_file in bin_files:
        # Same empty-dirname guard as for the CSV file
        bin_directory = os.path.dirname(bin_file)
        if bin_directory:
            os.makedirs(bin_directory, exist_ok=True)

        with open(bin_file, "wb") as f:
            pickle.dump(pairs, f)

    print(f"Dataset conversion complete for {dest_path} with CSV and bin files.")

# Run the conversion once per target layout, in this order:
#   1. MS1MV3 (RetinaFace)
#   2. MS1MV2 (ArcFace)
# Each run copies the images and generates that layout's CSV and bin files.
for _dest_root, _labels_csv, _bin_targets in (
    (retinaface_dataset_path, retinaface_csv, retinaface_bin_files),
    (arcface_dataset_path, arcface_csv, arcface_bin_files),
):
    convert_to_target_format(original_dataset_path, _dest_root, _labels_csv, _bin_targets)

# Evaluation paths to use in models: each bin file's base name re-rooted
# under its dataset's evaluation directory
retinaface_eval_paths = []
for _bin_path in retinaface_bin_files:
    _file_name = os.path.basename(_bin_path)
    retinaface_eval_paths.append(os.path.join("datasets/ms1m-retinaface-t1", _file_name))

arcface_eval_paths = []
for _bin_path in arcface_bin_files:
    _file_name = os.path.basename(_bin_path)
    arcface_eval_paths.append(os.path.join("datasets/faces_emore", _file_name))

print("RetinaFace eval paths:", retinaface_eval_paths)
print("ArcFace eval paths:", arcface_eval_paths)


Dataset conversion complete for datasets/ms1m-retinaface-t1_112x112_folders with CSV and bin files.
Dataset conversion complete for datasets/faces_emore_112x112_folders with CSV and bin files.
RetinaFace eval paths: ['datasets/ms1m-retinaface-t1/lfw.bin', 'datasets/ms1m-retinaface-t1/cfp_fp.bin', 'datasets/ms1m-retinaface-t1/agedb_30.bin']
ArcFace eval paths: ['datasets/faces_emore/lfw.bin', 'datasets/faces_emore/cfp_fp.bin', 'datasets/faces_emore/agedb_30.bin']
