In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Load the full Identity file
# The file has no header row; each line is split on runs of whitespace into
# the two columns named below.
# `sep` is a regular expression, so it is written as a raw string: a plain
# '\s+' is an invalid escape sequence and raises a SyntaxWarning on Python 3.12+.
df = pd.read_csv('identity_CelebA.txt', sep=r'\s+', header=None, names=['image_path', 'id'])

In [35]:
# Count the images belonging to each identity, then keep only the identities
# that have at least 5 images.
id_counts = df['id'].value_counts()
valid_ids = id_counts.loc[id_counts.ge(5)].index
df_filtered = df.loc[df['id'].isin(valid_ids)]

# Report how many identities survive the filter and how many rows it removed.
print(f"Original Identities: {len(df['id'].unique())}")
print(f"Filtered Identities (>=5 images): {len(valid_ids)}")
print(f"Images Dropped: {len(df) - len(df_filtered)}")

Original Identities: 10177
Filtered Identities (>=5 images): 9343
Images Dropped: 2311


In [27]:
# Reserve 500 identities as "victims"; every other identity goes to the attacker pool.
unique_valid_ids = df_filtered['id'].unique()

# Draw from a seeded generator so the victim/attacker partition is identical on
# every Restart & Run All. The unseeded global np.random state selected a
# different set of identities on each run, which made the saved CSVs
# irreproducible.
rng = np.random.default_rng(42)
victim_ids = rng.choice(unique_valid_ids, size=500, replace=False)

# Create the two massive pools of data.
# The membership mask is computed once; the pools are its two complementary
# halves, so no identity appears in both.
is_victim = df_filtered['id'].isin(victim_ids)
victim_pool_df = df_filtered[is_victim]
attacker_pool_df = df_filtered[~is_victim]

print(f"\nTotal Victim Images: {len(victim_pool_df)}")
print(f"Total Attacker Images: {len(attacker_pool_df)}")

# 4. Inner-Split: Break Victim Pool into Train, Val, Test
# Both splits hold out 20% and use the same fixed seed, so the shared options
# are declared once.
holdout_opts = dict(test_size=0.2, random_state=42)

# Step A: hold out the TEST set (20% of the victim pool).
# Stratifying on 'id' keeps each identity's share of rows as close as possible
# to the same proportion on both sides of the split.
victim_dev, victim_test = train_test_split(
    victim_pool_df, stratify=victim_pool_df['id'], **holdout_opts
)

# Step B: divide the remaining "Dev" rows into TRAIN and VAL (80/20 of Dev),
# again stratified on identity.
victim_train, victim_val = train_test_split(
    victim_dev, stratify=victim_dev['id'], **holdout_opts
)

# 5. Save to CSVs
# Each frame is written without its index column; files are written in the
# same order as before (train, val, test, attacker).
for frame, csv_name in (
    (victim_train, "victim_train.csv"),
    (victim_val, "victim_val.csv"),
    (victim_test, "victim_test.csv"),
    (attacker_pool_df, "attacker_data.csv"),
):
    frame.to_csv(csv_name, index=False)

# Summary of the row count written to each file.
print("\n--- Splits Saved ---")
print(f"1. victim_train.csv: {len(victim_train)} images")
print(f"2. victim_val.csv:   {len(victim_val)} images")
print(f"3. victim_test.csv:  {len(victim_test)} images")
print(f"4. attacker_data.csv:{len(attacker_pool_df)} images")


Total Victim Images: 10815
Total Attacker Images: 189473

--- Splits Saved ---
1. victim_train.csv: 6921 images
2. victim_val.csv:   1731 images
3. victim_test.csv:  2163 images
4. attacker_data.csv:189473 images
