In [36]:
import pandas as pd

In [37]:
# Decision threshold on `spoof_score`: preprocess_results flags a row as spoof
# when its score is strictly greater than this value.
# NOTE(review): the values in the trailing comment look like earlier threshold
# choices — confirm, and record how each one was derived.
THRESHOLD = 0.5364887 # 0.87961036 # 0.5

In [38]:
def clean_accent_name(accent):
    """Normalise a raw accent label into a display name.

    Everything from the first "(" onward is discarded, each
    whitespace-separated word is passed through ``str.capitalize`` (first
    character upper-cased, the rest lower-cased), words are re-joined with
    single spaces, and every "&" is spelled out as "and".
    """
    base_label, _, _ = accent.partition("(")
    capitalised_words = (token.capitalize() for token in base_label.strip().split())
    return " ".join(capitalised_words).replace("&", "and")

def clean_gender_name(gender):
    """Return the part of a gender label that precedes its first underscore.

    Labels without an underscore are returned unchanged.
    """
    prefix, _sep, _rest = gender.partition("_")
    return prefix

def preprocess_results(results_df, threshold=None):
    """Add cleaned grouping columns and per-row spoof predictions/outcomes.

    The work is done on a copy, so the frame passed in is left untouched and
    re-running a cell that calls this function always starts from the same
    input.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain ``region``, ``gender`` and ``spoof_score`` columns.
    threshold : float, optional
        A row is flagged as spoof when ``spoof_score`` is strictly greater
        than this value. Defaults to the module-level ``THRESHOLD``, looked
        up at call time.

    Returns
    -------
    pandas.DataFrame
        A copy of ``results_df`` with ``region`` and ``gender`` cleaned and
        the columns ``predicted_spoof`` (0/1), ``actual_label``, ``FP`` and
        ``TN`` (booleans) added.
    """
    if threshold is None:
        threshold = THRESHOLD

    processed = results_df.copy()
    processed["region"] = processed["region"].apply(clean_accent_name)
    processed["gender"] = processed["gender"].apply(clean_gender_name)
    processed["predicted_spoof"] = (processed["spoof_score"] > threshold).astype(int)

    # Every row gets the same ground-truth label. The FP/TN definitions below
    # treat ``actual_label == 1`` as the class for which a spoof prediction is
    # an error, so each row ends up as exactly one of FP or TN.
    processed["actual_label"] = 1
    processed["FP"] = (processed["predicted_spoof"] == 1) & (processed["actual_label"] == 1)
    processed["TN"] = (processed["predicted_spoof"] == 0) & (processed["actual_label"] == 1)
    return processed

def _compute_group_metrics(results_df, group_col):
    """Aggregate FP/TN counts and rates for each distinct value of ``group_col``.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain ``group_col`` plus the ``actual_label``, ``FP`` and
        ``TN`` columns.
    group_col : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, sorted ascending by ``group_col``, with columns
        ``group_col``, ``total_samples``, ``false_positives``,
        ``true_negatives``, ``FPR`` and ``TNR``.
    """
    metrics = results_df.groupby(group_col).agg(
        total_samples=("actual_label", "count"),
        false_positives=("FP", "sum"),
        true_negatives=("TN", "sum"),
    )
    # Both rates are fractions of the group's sample count.
    metrics["FPR"] = metrics["false_positives"] / metrics["total_samples"]
    metrics["TNR"] = metrics["true_negatives"] / metrics["total_samples"]
    return metrics.reset_index().sort_values(by=group_col, ascending=True)

def compute_region_metrics(results_df):
    """Return per-region sample counts, FP/TN counts and FPR/TNR, sorted by region."""
    return _compute_group_metrics(results_df, "region")

def compute_gender_metrics(results_df):
    """Return per-gender sample counts, FP/TN counts and FPR/TNR, sorted by gender."""
    return _compute_group_metrics(results_df, "gender")

## Baseline Original Results

In [28]:
# tssd_original = pd.read_csv("baseline-original-results/final-results-tssd.csv")
# tssd_original = preprocess_results(tssd_original)
# tssd_original.head(2)

In [29]:
# region_metrics_original = compute_region_metrics(tssd_original)
# region_metrics_original

In [30]:
# gender_metrics_original = compute_gender_metrics(tssd_original)
# gender_metrics_original

## Baseline Updated Results

In [39]:
# Load the updated-baseline TSSD scores, run them through preprocess_results,
# and preview the first two rows.
tssd_updated = preprocess_results(
    pd.read_csv("baseline-updated-results/final-results-tssd.csv")
)
tssd_updated.head(2)

Unnamed: 0,file_name,speaker,age,gender,accent,native_language,country,region,source,wav_filename,wav_path,spoof_score,predicted_spoof,actual_label,FP,TN
0,common_voice_en_12088.wav,907c23b7fa9bfd336418f697c03eca72009141f994024d...,twenties,female,"india and south asia (india, pakistan, sri lanka)",,,South Asia,common_voice,common_voice_en_12088.wav,../../datasets/evaluation-data/evaluation-set/...,0.832678,1,1,True,False
1,common_voice_en_17271107.wav,185a19520056bd31c0b7613af5d9eb45e5b9eb8061127f...,twenties,female,"india and south asia (india, pakistan, sri lanka)",,,South Asia,common_voice,common_voice_en_17271107.wav,../../datasets/evaluation-data/evaluation-set/...,0.774137,1,1,True,False


In [40]:
# Per-region FP/TN counts and rates for the updated baseline.
region_metrics_updated = tssd_updated.pipe(compute_region_metrics)
region_metrics_updated

Unnamed: 0,region,total_samples,false_positives,true_negatives,FPR,TNR
0,British Isles,100,93,7,0.93,0.07
1,Caribbean,100,86,14,0.86,0.14
2,East Asia,100,87,13,0.87,0.13
3,Middle East and Central Asia,92,75,17,0.815217,0.184783
4,North America,100,89,11,0.89,0.11
5,South Asia,100,89,11,0.89,0.11
6,Southeast Asia,100,69,31,0.69,0.31
7,Sub-saharan Africa,100,94,6,0.94,0.06
8,Western Europe,100,84,16,0.84,0.16


In [41]:
# Per-gender FP/TN counts and rates for the updated baseline.
gender_metrics_updated = tssd_updated.pipe(compute_gender_metrics)
gender_metrics_updated

Unnamed: 0,gender,total_samples,false_positives,true_negatives,FPR,TNR
0,female,446,347,99,0.778027,0.221973
1,male,446,419,27,0.939462,0.060538


## Comparing Baselines

In [31]:
# region_comparison = pd.merge(
#     region_metrics_original[["region", "FPR"]],
#     region_metrics_updated[["region", "FPR"]],
#     on="region",
#     how="outer",
#     suffixes=("_original", "_updated")
# )
# # Drop rows where region is "multiple accents" or "other":
# region_comparison = region_comparison[~region_comparison["region"].isin(["Multiple Accents", "Other"])]
# region_comparison = region_comparison.sort_values(by="region", ascending=True)
# region_comparison

In [32]:
# gender_comparison = pd.merge(
#     gender_metrics_original[["gender", "FPR"]],
#     gender_metrics_updated[["gender", "FPR"]],
#     on="gender",
#     how="outer",
#     suffixes=("_original", "_updated")
# )
# gender_comparison = gender_comparison.sort_values(by="gender", ascending=True)
# gender_comparison

## Local Fine-Tuned Results

In [24]:
# tssd_finetuned = pd.read_csv("finetuned-results/final-results-tssd.csv")
# tssd_finetuned = preprocess_results(tssd_finetuned)
# tssd_finetuned.head(2)

In [25]:
# region_metrics_updated = compute_region_metrics(tssd_finetuned)
# region_metrics_updated

In [26]:
# gender_metrics_updated = compute_gender_metrics(tssd_finetuned)
# gender_metrics_updated

## Finetune-1 Results

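**Note:** this model was trained incorrectly, so the cells in this section are left commented out and its results should not be used.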

In [33]:
# tssd_finetune_1 = pd.read_csv("finetune-1/final-results-tssd.csv")
# tssd_finetune_1 = preprocess_results(tssd_finetune_1)
# tssd_finetune_1.head(2)

In [34]:
# region_metrics_updated = compute_region_metrics(tssd_finetune_1)
# region_metrics_updated

In [35]:
# gender_metrics_updated = compute_gender_metrics(tssd_finetune_1)
# gender_metrics_updated

## Finetune-2 Results

In [42]:
# Load the finetune-2 TSSD scores, run them through preprocess_results,
# and preview the first two rows.
tssd_finetune_2 = preprocess_results(
    pd.read_csv("finetune-2/final-results-tssd.csv")
)
tssd_finetune_2.head(2)

Unnamed: 0,file_name,speaker,age,gender,accent,native_language,country,region,source,wav_filename,wav_path,spoof_score,predicted_spoof,actual_label,FP,TN
0,common_voice_en_12088.wav,907c23b7fa9bfd336418f697c03eca72009141f994024d...,twenties,female,"india and south asia (india, pakistan, sri lanka)",,,South Asia,common_voice,common_voice_en_12088.wav,../../datasets/evaluation-data/evaluation-set/...,5.181502e-09,0,1,False,True
1,common_voice_en_17271107.wav,185a19520056bd31c0b7613af5d9eb45e5b9eb8061127f...,twenties,female,"india and south asia (india, pakistan, sri lanka)",,,South Asia,common_voice,common_voice_en_17271107.wav,../../datasets/evaluation-data/evaluation-set/...,2.798537e-07,0,1,False,True


In [43]:
# Per-region FP/TN counts and rates for finetune-2. Stored under its own name
# so the baseline metrics held in `region_metrics_updated` are not overwritten.
region_metrics_finetune_2 = compute_region_metrics(tssd_finetune_2)
region_metrics_finetune_2

Unnamed: 0,region,total_samples,false_positives,true_negatives,FPR,TNR
0,British Isles,100,3,97,0.03,0.97
1,Caribbean,100,2,98,0.02,0.98
2,East Asia,100,0,100,0.0,1.0
3,Middle East and Central Asia,92,1,91,0.01087,0.98913
4,North America,100,3,97,0.03,0.97
5,South Asia,100,3,97,0.03,0.97
6,Southeast Asia,100,2,98,0.02,0.98
7,Sub-saharan Africa,100,3,97,0.03,0.97
8,Western Europe,100,1,99,0.01,0.99


In [44]:
# Per-gender FP/TN counts and rates for finetune-2. Stored under its own name
# so the baseline metrics held in `gender_metrics_updated` are not overwritten.
gender_metrics_finetune_2 = compute_gender_metrics(tssd_finetune_2)
gender_metrics_finetune_2

Unnamed: 0,gender,total_samples,false_positives,true_negatives,FPR,TNR
0,female,446,7,439,0.015695,0.984305
1,male,446,11,435,0.024664,0.975336
