In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('./Standardized_full_data/REAL_train_and_val.csv')

In [4]:
# Remove .mp3 and .wav extensions (case-insensitive)
df['file_name'] = df['file_name'].str.replace(r'\.(mp3|wav)$', '', case=False, regex=True)

# Optional: save cleaned file
df.to_csv('./Standardized_full_data/REAL_train_and_val_cleaned.csv', index=False)

print("✅ Cleaned file_name column. Example values:")
print(df['file_name'].head())

✅ Cleaned file_name column. Example values:
0    44784e541ed8d06b757a9dfb02a21ab2
1    b1e1dcbc6c82eba5ff40d583220ed8c5
2    f87e9ee30d7a221b60bd038686c6ab42
3    dfc0f92fe4eb00a4fc817dc2f8ef7170
4    5699242804b7856dd2be333128378080
Name: file_name, dtype: object


In [5]:
df

Unnamed: 0,file_name,speaker,age,gender,accent,native_language,country,region,source,spoof_or_real,train_or_val
0,44784e541ed8d06b757a9dfb02a21ab2,,46 - 65,female,,urdu,"Asian, South Asian or Asian American","south asia (india, pakistan, bangladesh, sri l...",asr_fairness,real,train
1,b1e1dcbc6c82eba5ff40d583220ed8c5,,31 - 45,female,,hindi,"Asian, South Asian or Asian American","south asia (india, pakistan, bangladesh, sri l...",asr_fairness,real,train
2,f87e9ee30d7a221b60bd038686c6ab42,,31 - 45,female,,hindi,"Asian, South Asian or Asian American","south asia (india, pakistan, bangladesh, sri l...",asr_fairness,real,train
3,dfc0f92fe4eb00a4fc817dc2f8ef7170,,18 - 22,female,,marathi,"Asian, South Asian or Asian American","south asia (india, pakistan, bangladesh, sri l...",asr_fairness,real,train
4,5699242804b7856dd2be333128378080,,46 - 65,female,,urdu,"Asian, South Asian or Asian American","south asia (india, pakistan, bangladesh, sri l...",asr_fairness,real,train
...,...,...,...,...,...,...,...,...,...,...,...
3228,11214,Nelson Mandela,,male,south africa,,south africa,"south africa & southern africa (south africa, ...",in_the_wild,real,val
3229,25615,Nelson Mandela,,male,south africa,,south africa,"south africa & southern africa (south africa, ...",in_the_wild,real,val
3230,14423,Nelson Mandela,,male,south africa,,south africa,"south africa & southern africa (south africa, ...",in_the_wild,real,val
3231,7454,Nelson Mandela,,male,south africa,,south africa,"south africa & southern africa (south africa, ...",in_the_wild,real,val


In [6]:
output_path = './Standardized_full_data/REAL_train_and_val_cleaned.csv'
df.to_csv(output_path, index=False)

## Check missing files

In [22]:
import pandas as pd
import os

def check_missing_files(real_metadata_path, spoof_metadata_path, base_dir, split="train"):
    # Load and standardize both metadata files
    real_df = pd.read_csv(real_metadata_path)
    spoof_df = pd.read_csv(spoof_metadata_path)

    # Print lengths of dataframes
    print(f"Length of real_df: {len(real_df)}")
    print(f"Length of spoof_df: {len(spoof_df)}")

    # Normalize columns
    real_df = real_df.rename(columns={"file_name": "file_name"})
    spoof_df = spoof_df.rename(columns={"Filename": "file_name"})

    # Select relevant columns
    real_df = real_df[["file_name", "spoof_or_real", "train_or_val"]]
    spoof_df = spoof_df[["file_name", "spoof_or_real", "train_or_val"]]

    # Add .wav extension
    real_df["file_name"] += ".wav"
    spoof_df["file_name"] += ".wav"

    # Combine both metadata sets
    combined_df = pd.concat([real_df, spoof_df], ignore_index=True)
    print(len(combined_df))
    combined_df = combined_df[combined_df["train_or_val"] == split]

    # Count total entries
    total_metadata = len(combined_df)

    # Get existing files from directory
    existing_files = set(os.listdir(base_dir))
    print(f"🔍 Unique files in base_dir: {len(existing_files)}")
    exists_mask = combined_df["file_name"].isin(existing_files)

    num_existing = exists_mask.sum()
    num_missing = total_metadata - num_existing
    percent_missing = (num_missing / total_metadata) * 100

    print(f"📦 Total metadata entries: {total_metadata}")
    print(f"✅ Existing files in base_dir: {num_existing}")
    print(f"❌ Missing files: {num_missing}")
    print(f"📉 Percentage missing: {percent_missing:.2f}%")

    # (Optional) Return missing file names
    missing_files = combined_df[~exists_mask]["file_name"].tolist()
    return missing_files

In [23]:
missing_files = check_missing_files(
    real_metadata_path="./Standardized_full_data/REAL_train_and_val_cleaned.csv",
    spoof_metadata_path="./Standardized_full_data/Metadata TTS data_full_new.csv",
    base_dir="./Standardized_full_data/Training",  # or Val
    split="train"
)

Length of real_df: 3233
Length of spoof_df: 5538
8771
🔍 Unique files in base_dir: 6981
📦 Total metadata entries: 6656
✅ Existing files in base_dir: 6656
❌ Missing files: 0
📉 Percentage missing: 0.00%


In [11]:
missing_files = check_missing_files(
    real_metadata_path="./Standardized_full_data/REAL_train_and_val_cleaned.csv",
    spoof_metadata_path="./Standardized_full_data/Metadata TTS data_full_new.csv",
    base_dir="./Standardized_full_data/Val",  # or Val
    split="val"
)

Length of real_df: 3233
Length of spoof_df: 5538
🔍 Unique files in base_dir: 1791
📦 Total metadata entries: 1791
✅ Existing files in base_dir: 1791
❌ Missing files: 0
📉 Percentage missing: 0.00%


In [18]:
import pandas as pd

def check_unique_filenames(real_metadata_path, spoof_metadata_path):
    # Load CSVs
    real_df = pd.read_csv(real_metadata_path)
    spoof_df = pd.read_csv(spoof_metadata_path)

    # Normalize file name column
    real_df = real_df.rename(columns={"file_name": "file_name"})
    spoof_df = spoof_df.rename(columns={"Filename": "file_name"})

    # Add .wav extension
    real_df["file_name"] += ".wav"
    spoof_df["file_name"] += ".wav"

    # Get unique filenames
    real_files = set(real_df["file_name"].unique())
    spoof_files = set(spoof_df["file_name"].unique())

    print(f"🎙️ Unique REAL files: {len(real_files)}")
    print(f"🤖 Unique SPOOF files: {len(spoof_files)}")

    # Optional: Check overlap
    overlap = real_files.intersection(spoof_files)
    if overlap:
        print(f"⚠️ Overlapping file names: {len(overlap)}")
    else:
        print("✅ No overlapping file names between real and spoof.")

    return real_files, spoof_files

In [19]:
real_files, spoof_files = check_unique_filenames(
    real_metadata_path="./Standardized_full_data/REAL_train_and_val_cleaned.csv",
    spoof_metadata_path="./Standardized_full_data/Metadata TTS data_full.csv"
)

🎙️ Unique REAL files: 3233
🤖 Unique SPOOF files: 5537
✅ No overlapping file names between real and spoof.


In [17]:
extra_files

['.DS_Store']

In [24]:
import pandas as pd
import os

def find_unreferenced_wavs_in_split(real_metadata_path, spoof_metadata_path, base_dir, split="train"):
    # Load metadata
    real_df = pd.read_csv(real_metadata_path)
    spoof_df = pd.read_csv(spoof_metadata_path)

    # Standardize file name column and add .wav extension
    real_df = real_df.rename(columns={"file_name": "file_name"})
    spoof_df = spoof_df.rename(columns={"Filename": "file_name"})
    real_df["file_name"] += ".wav"
    spoof_df["file_name"] += ".wav"

    # Filter by split
    real_split = real_df[real_df["train_or_val"] == split]
    spoof_split = spoof_df[spoof_df["train_or_val"] == split]

    # Combine into one set of metadata file names
    referenced_files = set(pd.concat([real_split, spoof_split])["file_name"])

    # List all actual .wav files in base_dir
    base_files = set(f for f in os.listdir(base_dir) if f.endswith(".wav"))

    # Find unreferenced files
    extra_files = base_files - referenced_files

    print(f"📁 Total .wav files in base_dir: {len(base_files)}")
    print(f"📄 Referenced files in metadata for '{split}': {len(referenced_files)}")
    print(f"❗ Unreferenced files in folder (not in metadata): {len(extra_files)}")

    return list(extra_files)

In [48]:
extra_wavs = find_unreferenced_wavs_in_split(
    real_metadata_path="./Standardized_full_data/REAL_train_and_val_cleaned.csv",
    spoof_metadata_path="./Standardized_full_data/Metadata TTS data_full_new.csv",
    base_dir="./Standardized_full_data/Training",
    split="train"
)

📁 Total .wav files in base_dir: 6980
📄 Referenced files in metadata for 'train': 6980
❗ Unreferenced files in folder (not in metadata): 0


In [39]:
extra_wavs

['v_output_4964.wav',
 'v_output_5015.wav',
 'v_output_4889.wav',
 'v_output_4930.wav',
 'v_output_5090.wav',
 'v_output_5011.wav',
 'v_output_4842.wav',
 'v_output_4968.wav',
 'v_output_4883.wav',
 'v_output_4974.wav',
 'v_output_4954.wav',
 'v_output_5062.wav',
 'v_output_5029.wav',
 'v_output_5105.wav',
 'v_output_4845.wav',
 'v_output_4919.wav',
 'v_output_4898.wav',
 'v_output_4923.wav',
 'v_output_4993.wav',
 'v_output_5087.wav',
 'v_output_4942.wav',
 'v_output_5034.wav',
 'v_output_4951.wav',
 'v_output_4882.wav',
 'v_output_4859.wav',
 'v_output_4885.wav',
 'v_output_5097.wav',
 'v_output_5123.wav',
 'v_output_5124.wav',
 'v_output_4907.wav',
 'v_output_5093.wav',
 'v_output_4933.wav',
 'v_output_4847.wav',
 'v_output_5131.wav',
 'v_output_5054.wav',
 'v_output_5024.wav',
 'v_output_5028.wav',
 'v_output_5113.wav',
 'v_output_5042.wav',
 'v_output_5030.wav',
 'v_output_4826.wav',
 'v_output_4928.wav',
 'v_output_5031.wav',
 'v_output_4912.wav',
 'v_output_5035.wav',
 'v_output

In [40]:
import pandas as pd

df = pd.read_csv("./Standardized_full_data/Metadata TTS data_full_new.csv")

row = df[df["Filename"] == "v_output_4964"]
if row.empty:
    # See what matches roughly
    fuzzy_matches = df[df["Filename"].str.contains("v_output_4964", na=False)]
    for f in fuzzy_matches["Filename"]:
        print(f"🧪 Raw: {repr(f)}")
else:
    print("✅ Exact match found.")

✅ Exact match found.


In [42]:
import pandas as pd
import os

# --- Load and clean metadata ---
real_df = pd.read_csv("./Standardized_full_data/REAL_train_and_val_cleaned.csv")
spoof_df = pd.read_csv("./Standardized_full_data/Metadata TTS data_full_new.csv")

# Clean filenames and split column
real_df["file_name"] = real_df["file_name"].astype(str).str.strip().str.lower() + ".wav"
spoof_df["file_name"] = spoof_df["Filename"].astype(str).str.strip().str.lower() + ".wav"
real_df["train_or_val"] = real_df["train_or_val"].astype(str).str.strip().str.lower()
spoof_df["train_or_val"] = spoof_df["train_or_val"].astype(str).str.strip().str.lower()

# Filter for train split
split = "train"
real_split = real_df[real_df["train_or_val"] == split]
spoof_split = spoof_df[spoof_df["train_or_val"] == split]

# Referenced files from metadata
referenced_files = set(pd.concat([real_split, spoof_split])["file_name"])

# Files in base directory
base_dir = "./Standardized_full_data/Training"
base_files = set(f.lower().strip() for f in os.listdir(base_dir) if f.endswith(".wav"))

# Now check your file!
target = "v_output_4964.wav"

print("🧪 Is it in base_files?", target in base_files)
print("🧪 Is it in referenced_files?", target in referenced_files)

print("\n🎯 Base match candidates:")
for f in base_files:
    if "v_output_4964" in f:
        print("  📁", repr(f))

print("\n🎯 Referenced match candidates:")
for f in referenced_files:
    if "v_output_4964" in f:
        print("  📄", repr(f))

🧪 Is it in base_files? True
🧪 Is it in referenced_files? False

🎯 Base match candidates:
  📁 'v_output_4964.wav'

🎯 Referenced match candidates:


In [44]:
all_df = pd.concat([real_df, spoof_df])
matches = all_df[all_df["file_name"].str.contains("v_output_4964")]
display(matches)

Unnamed: 0,file_name,speaker,age,gender,accent,native_language,country,region,source,spoof_or_real,train_or_val,Filename,Sentence,Gender,Accent,Eleven_Labs_voice
4958,v_output_4964.wav,,,,,,,"south asia (india, pakistan, bangladesh, sri l...",,spoof,,v_output_4964,"The old, weathered book on the dusty shelf yea...",female,Hindi Indian,Diana


In [46]:
matches = spoof_df[spoof_df["file_name"].str.contains("v_output_4964")]
display(matches)

Unnamed: 0,Filename,Sentence,Gender,Accent,Eleven_Labs_voice,train_or_val,region,spoof_or_real,file_name
4958,v_output_4964,"The old, weathered book on the dusty shelf yea...",female,Hindi Indian,Diana,,"south asia (india, pakistan, bangladesh, sri l...",spoof,v_output_4964.wav


In [47]:
import pandas as pd

# Load metadata
real_df = pd.read_csv("./Standardized_full_data/REAL_train_and_val_cleaned.csv")
spoof_df = pd.read_csv("./Standardized_full_data/Metadata TTS data_full_new.csv")

# Check for NaNs in train_or_val column
real_missing = real_df["train_or_val"].isna().sum()
spoof_missing = spoof_df["train_or_val"].isna().sum()

print(f"🧪 Missing 'train_or_val' in REAL metadata: {real_missing}")
print(f"🧪 Missing 'train_or_val' in SPOOF metadata: {spoof_missing}")
print(f"🧮 Total missing entries: {real_missing + spoof_missing}")

🧪 Missing 'train_or_val' in REAL metadata: 0
🧪 Missing 'train_or_val' in SPOOF metadata: 324
🧮 Total missing entries: 324
