In [2]:
import pandas as pd

In [3]:
# Cut-off compared against `prediction_score` in preprocess_results: rows whose
# score is strictly below this value are marked as predicted spoof.
THRESHOLD = 0.5

In [4]:
def clean_accent_name(accent):
    """Normalise a raw accent/region label.

    Keeps only the text before the first "(", capitalises every
    whitespace-separated word (collapsing runs of whitespace to one space),
    and finally spells "&" out as "and".

    Args:
        accent: Raw label string.

    Returns:
        The cleaned label, e.g. "middle east & central asia (x)" becomes
        "Middle East and Central Asia".
    """
    before_paren = accent.partition("(")[0]
    # split() with no argument already discards leading/trailing whitespace.
    capitalised = " ".join(map(str.capitalize, before_paren.split()))
    # "&" is replaced after capitalising so the inserted "and" stays lower-case.
    return capitalised.replace("&", "and")

def clean_gender_name(gender):
    """Return the part of a gender label that precedes the first underscore.

    Args:
        gender: Raw label string, e.g. "female_feminine".

    Returns:
        The text before the first "_", or the whole string when it contains
        no underscore.
    """
    prefix, _sep, _rest = gender.partition("_")
    return prefix

def preprocess_results(results_df):
    """Clean the label columns and derive per-row prediction outcome columns.

    The input frame is left untouched: all changes are made on a copy, so the
    function can be applied to the same raw frame repeatedly with the same
    result.

    Args:
        results_df: DataFrame with "region", "gender" and "prediction_score"
            columns.

    Returns:
        A new DataFrame with "region" and "gender" cleaned, plus the columns
        "predicted_spoof" (1 when the score is below THRESHOLD, else 0),
        "actual_label" (constant 1), and the boolean flags "FP" and "TN".
    """
    # Work on a copy so the caller's DataFrame is not modified in place.
    processed = results_df.copy()
    processed["region"] = processed["region"].apply(clean_accent_name)
    processed["gender"] = processed["gender"].apply(clean_gender_name)
    # Scores strictly below the threshold count as a spoof prediction.
    processed["predicted_spoof"] = (processed["prediction_score"] < THRESHOLD).astype(int)
    # Every row receives the same ground-truth label.
    # NOTE(review): presumably all evaluated clips are genuine speech, which is
    # why a spoof prediction is counted as a false positive below -- confirm.
    processed["actual_label"] = 1
    processed["FP"] = (processed["predicted_spoof"] == 1) & (processed["actual_label"] == 1)
    processed["TN"] = (processed["predicted_spoof"] == 0) & (processed["actual_label"] == 1)
    return processed

def _compute_group_metrics(results_df, group_col):
    """Aggregate FP/TN counts and rates for each distinct value of `group_col`."""
    metrics = results_df.groupby(group_col).agg(
        total_samples=("actual_label", "count"),
        false_positives=("FP", "sum"),
        true_negatives=("TN", "sum"),
    )
    # Both rates use the group's total sample count as the denominator.
    metrics["FPR"] = metrics["false_positives"] / metrics["total_samples"]
    metrics["TNR"] = metrics["true_negatives"] / metrics["total_samples"]
    # Turn the group key back into an ordinary column.
    return metrics.reset_index()


def compute_region_metrics(results_df):
    """Summarise prediction outcomes per region.

    Args:
        results_df: DataFrame with "region", "actual_label", "FP" and "TN"
            columns.

    Returns:
        DataFrame with one row per region and the columns "region",
        "total_samples", "false_positives", "true_negatives", "FPR", "TNR".
    """
    return _compute_group_metrics(results_df, "region")


def compute_gender_metrics(results_df):
    """Summarise prediction outcomes per gender.

    Args:
        results_df: DataFrame with "gender", "actual_label", "FP" and "TN"
            columns.

    Returns:
        DataFrame with one row per gender and the columns "gender",
        "total_samples", "false_positives", "true_negatives", "FPR", "TNR".
    """
    return _compute_group_metrics(results_df, "gender")

## Baseline Original Results

In [4]:
# Read the baseline-original results file and preprocess it in one step.
tssd_original = pd.read_csv(
    "baseline-original-results/final-results-tssd.csv"
).pipe(preprocess_results)
tssd_original.head(2)

Unnamed: 0,file_name,region,gender,wav_filename,wav_path,prediction_score,predicted_spoof,actual_label,FP,TN
0,common_voice_en_17775118.mp3,British Isles,female,common_voice_en_17775118.wav,mozilla_evaluation_wav/common_voice_en_1777511...,0.000227,1,1,True,False
1,common_voice_en_131226.mp3,British Isles,female,common_voice_en_131226.wav,mozilla_evaluation_wav/common_voice_en_131226.wav,0.017499,1,1,True,False


In [5]:
# Per-region outcome counts and rates for the baseline-original results.
region_metrics_original = tssd_original.pipe(compute_region_metrics)
region_metrics_original

Unnamed: 0,region,total_samples,false_positives,true_negatives,FPR,TNR
0,British Isles,100,96,4,0.96,0.04
1,Multiple Accents,100,97,3,0.97,0.03
2,North America,100,91,9,0.91,0.09
3,Other,100,97,3,0.97,0.03
4,South Asia,100,91,9,0.91,0.09
5,Southeast Asia,100,93,7,0.93,0.07
6,Sub-saharan Africa,100,91,9,0.91,0.09


In [6]:
# Per-gender outcome counts and rates for the baseline-original results.
gender_metrics_original = tssd_original.pipe(compute_gender_metrics)
gender_metrics_original

Unnamed: 0,gender,total_samples,false_positives,true_negatives,FPR,TNR
0,female,350,320,30,0.914286,0.085714
1,male,350,336,14,0.96,0.04


## Baseline Updated Results

In [7]:
# Read the baseline-updated results file and preprocess it in one step.
tssd_updated = pd.read_csv(
    "baseline-updated-results/final-results-tssd.csv"
).pipe(preprocess_results)
tssd_updated.head(2)

Unnamed: 0,file_name,speaker,age,gender,accent,native_language,country,region,source,wav_filename,wav_path,prediction_score,predicted_spoof,actual_label,FP,TN
0,common_voice_en_12088.wav,907c23b7fa9bfd336418f697c03eca72009141f994024d...,twenties,female,"india and south asia (india, pakistan, sri lanka)",,,South Asia,common_voice,common_voice_en_12088.wav,../../datasets/evaluation-data/evaluation-set/...,0.004374,1,1,True,False
1,common_voice_en_17271107.wav,185a19520056bd31c0b7613af5d9eb45e5b9eb8061127f...,twenties,female,"india and south asia (india, pakistan, sri lanka)",,,South Asia,common_voice,common_voice_en_17271107.wav,../../datasets/evaluation-data/evaluation-set/...,0.007559,1,1,True,False


In [8]:
# Per-region outcome counts and rates for the baseline-updated results.
region_metrics_updated = tssd_updated.pipe(compute_region_metrics)
region_metrics_updated

Unnamed: 0,region,total_samples,false_positives,true_negatives,FPR,TNR
0,British Isles,100,98,2,0.98,0.02
1,Caribbean,100,88,12,0.88,0.12
2,East Asia,100,88,12,0.88,0.12
3,Middle East and Central Asia,92,77,15,0.836957,0.163043
4,North America,100,93,7,0.93,0.07
5,South Asia,100,90,10,0.9,0.1
6,Southeast Asia,100,69,31,0.69,0.31
7,Sub-saharan Africa,100,96,4,0.96,0.04
8,Western Europe,100,84,16,0.84,0.16


In [9]:
# Per-gender outcome counts and rates for the baseline-updated results.
gender_metrics_updated = tssd_updated.pipe(compute_gender_metrics)
gender_metrics_updated

Unnamed: 0,gender,total_samples,false_positives,true_negatives,FPR,TNR
0,female,446,356,90,0.798206,0.201794
1,male,446,427,19,0.957399,0.042601


## Comparison

In [10]:
# Put the per-region FPR of both baseline runs side by side. The outer join
# keeps regions that appear in only one run (the other side is NaN).
region_comparison = region_metrics_original[["region", "FPR"]].merge(
    region_metrics_updated[["region", "FPR"]],
    how="outer",
    on="region",
    suffixes=("_original", "_updated"),
)
# Leave out the "Multiple Accents" and "Other" rows.
region_comparison = region_comparison.loc[
    ~region_comparison["region"].isin(["Multiple Accents", "Other"])
]
region_comparison

Unnamed: 0,region,FPR_original,FPR_updated
0,British Isles,0.96,0.98
2,North America,0.91,0.93
4,South Asia,0.91,0.9
5,Southeast Asia,0.93,0.69
6,Sub-saharan Africa,0.91,0.96
7,Caribbean,,0.88
8,East Asia,,0.88
9,Middle East and Central Asia,,0.836957
10,Western Europe,,0.84


In [11]:
# Put the per-gender FPR of both baseline runs side by side.
gender_comparison = gender_metrics_original[["gender", "FPR"]].merge(
    gender_metrics_updated[["gender", "FPR"]],
    how="outer",
    on="gender",
    suffixes=("_original", "_updated"),
)
gender_comparison

Unnamed: 0,gender,FPR_original,FPR_updated
0,female,0.914286,0.798206
1,male,0.96,0.957399


## Local Fine-Tuned Results

In [8]:
# Read the fine-tuned results file and preprocess it in one step.
tssd_finetuned = pd.read_csv(
    "finetuned-results/final-results-tssd.csv"
).pipe(preprocess_results)
tssd_finetuned.head(2)

Unnamed: 0,file_name,speaker,age,gender,accent,native_language,country,region,source,wav_filename,wav_path,prediction_score,predicted_spoof,actual_label,FP,TN
0,common_voice_en_12088.wav,907c23b7fa9bfd336418f697c03eca72009141f994024d...,twenties,female,"india and south asia (india, pakistan, sri lanka)",,,South Asia,common_voice,common_voice_en_12088.wav,../../datasets/evaluation-data/evaluation-set/...,0.995266,0,1,False,True
1,common_voice_en_17271107.wav,185a19520056bd31c0b7613af5d9eb45e5b9eb8061127f...,twenties,female,"india and south asia (india, pakistan, sri lanka)",,,South Asia,common_voice,common_voice_en_17271107.wav,../../datasets/evaluation-data/evaluation-set/...,0.99807,0,1,False,True


In [9]:
# Stored under its own name so the baseline-updated region metrics, which the
# region comparison cell reads, are not overwritten by the fine-tuned results.
region_metrics_finetuned = compute_region_metrics(tssd_finetuned)
region_metrics_finetuned

Unnamed: 0,region,total_samples,false_positives,true_negatives,FPR,TNR
0,British Isles,100,19,81,0.19,0.81
1,Caribbean,100,10,90,0.1,0.9
2,East Asia,100,0,100,0.0,1.0
3,Middle East and Central Asia,92,1,91,0.01087,0.98913
4,North America,100,11,89,0.11,0.89
5,South Asia,100,20,80,0.2,0.8
6,Southeast Asia,100,13,87,0.13,0.87
7,Sub-saharan Africa,100,9,91,0.09,0.91
8,Western Europe,100,17,83,0.17,0.83


In [10]:
# Stored under its own name so the baseline-updated gender metrics, which the
# gender comparison cell reads, are not overwritten by the fine-tuned results.
gender_metrics_finetuned = compute_gender_metrics(tssd_finetuned)
gender_metrics_finetuned

Unnamed: 0,gender,total_samples,false_positives,true_negatives,FPR,TNR
0,female,446,34,412,0.076233,0.923767
1,male,446,66,380,0.147982,0.852018
