In [27]:
import pandas as pd
import os, errno
import time
import itertools

In [28]:
meta_data = pd.read_csv('~/vox1_meta.csv', sep='\t')

In [29]:
#print(meta_data["Nationality"].value_counts())
# print(meta_data["Gender"].value_counts())

In [30]:
# meta_data

In [31]:
# @brief Get speaker IDs where their metadata value for keys[i] has value values[i]
# @param alt_map Instead of comparing the key to value, compare map[key] to value
def get_ids(data, keys, values):
    for key_idx, value in enumerate(values):
        # set data to subset with value for given key
        data = data[data[keys[key_idx]] == value]
        
    return set(data['VoxCeleb1 ID'])

# @brief Return list of tuples of all combinations of keys in
#        the sensitive param provided as args
def get_intersection_combos(*sensitive_param_value_groups):
    return list(itertools.product(*sensitive_param_value_groups))

def map_col(df, key, values_map):
    return df.replace({key: values_map})

In [32]:
gender_values = ['m', 'f']
country_values = ['Australia', 'India', 'Norway', 'Ireland', 'Germany', 'New Zealand', 'Italy','Mexico',\
                  'Sweden', 'Spain', 'Russia', 'Switzerland', 'Chile', 'Philippines', 'Croatia', 'Denmark',\
                  'Netherlands', 'Poland', 'Portugal', 'China', 'France', 'Guyana', 'Singapore', 'Brazil',\
                  'Sri Lanka', 'South Africa', 'South Korea', 'Trinidad and Tobago', 'Pakistan', 'Austria',\
                  'Israel', 'Iran', 'Sudan', 'USA', 'UK', 'Canada']
native_lang_values = ['english', 'non-english']
native_lang_map = {'Australia': 'english', 'India': 'non-english', 'Norway': 'non-english', 'Ireland': 'english', 'Germany': 'non-english', 'New Zealand': 'english', 'Italy': 'non-english','Mexico': 'non-english',\
                  'Sweden': 'non-english', 'Spain': 'non-english', 'Russia': 'non-english', 'Switzerland': 'non-english', 'Chile': 'non-english', 'Philippines': 'non-english', 'Croatia': 'non-english', 'Denmark': 'non-english',\
                  'Netherlands': 'non-english', 'Poland': 'non-english', 'Portugal': 'non-english', 'China': 'non-english', 'France': 'non-english', 'Guyana': 'non-english', 'Singapore': 'non-english', 'Brazil': 'non-english',\
                  'Sri Lanka': 'non-english', 'South Africa': 'non-english', 'South Korea': 'non-english', 'Trinidad and Tobago': 'non-english', 'Pakistan': 'non-english', 'Austria': 'non-english',\
                  'Israel': 'non-english', 'Iran': 'non-english', 'Sudan': 'non-english', 'USA': 'english', 'UK': 'english', 'Canada': 'english'}

keys = ['Gender', 'Nationality']
intersections_ids = []
do_gender_nationality_intersection = False

if do_gender_nationality_intersection:
    # GENDER/NATIONALITY intersection
    # get all combinations of sensitive param values
    intersections_vals = get_intersection_combos(gender_values, country_values)
    
    # get all speaker IDs matching the combinations of sensitive param values
    for intersection in intersections_vals:
        intersections_ids.append(get_ids(meta_data, keys, intersection))
        print(f"Intersection {intersection}.count -> {len(intersections_ids[-1])}")
else:
    # GENDER/NATIVE-LANG intersection
    # replace nationality col with native-lang col
    filtered_meta_data = map_col(meta_data, 'Nationality', native_lang_map)
    
    # get all combinations of sensitive param values
    intersections_vals = get_intersection_combos(gender_values, native_lang_values)

    # get all speaker IDs matching the combinations of sensitive param values
    for intersection in intersections_vals:
        intersections_ids.append(get_ids(filtered_meta_data, keys, intersection))
        print(f"Intersection {intersection}.count -> {len(intersections_ids[-1])}")

Intersection ('m', 'english').count -> 631
Intersection ('m', 'non-english').count -> 59
Intersection ('f', 'english').count -> 500
Intersection ('f', 'non-english').count -> 61


In [33]:
# load all utternaces
utters = pd.read_csv('/home/jupyter/voxceleb-fairness/data/datasets/full/vox1_full_utterances.txt', header=None, sep=' ')
intersections_utters = []

# extract all utterance subsets
for idx, intersection_ids in enumerate(intersections_ids):
    intersection_utters = utters[[x in intersection_ids for x in utters[0]]]
    sample_len = min(len(intersection_utters), 5000)
    intersections_utters.append(intersection_utters.sample(sample_len).reset_index(drop=True))
    # print(intersections_vals[idx])
    # print(intersections_utters[-1])

In [34]:
# create list files from utterance subset files

def silentremove(filename):
    try:
        os.remove(filename)
    except OSError as e: # this would be "except OSError, e:" before Python 2.6
        if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
            raise # re-raise exception if a different error occurred
            
def make_pairs(utters_df, filename):
    silentremove(filename)
    start = time.time()
    current_iter_start = start
    for i in range(len(utters_df)):
        data = []
        for j in range(i + 1, len(utters_df)):
            num = 1 if utters_df[0][i] == utters_df[0][j] else 0
            data.append([num, utters_df[1][i], utters_df[1][j]])
        pd.DataFrame(data).to_csv(filename, mode='a', index=False, header=None, sep=' ')
        if i % 1000 == 0:
            current_iter_end = time.time()
            print('Wrote {} of {} utterances in {} seconds ({} seconds from start)'.format(i, len(utters_df), current_iter_end - current_iter_start, current_iter_end - start))
            current_iter_start = current_iter_end

def clean_intersection_name(name):
    name = name.replace(')','')
    name = name.replace('(','')
    name = name.replace('\'','')
    name = name.replace(' ','')
    name = name.replace(',','_')
    return name

In [None]:
# extract the full lists
list_fnames = []
for idx, intersection_utters in enumerate(intersections_utters):
    # get file name for list and make dirs
    name = clean_intersection_name(str(intersections_vals[idx]))
    list_fnames.append(f"/home/jupyter/voxceleb-fairness/data/lists/intersect/vox1_intersect_{name}.txt")
    os.makedirs(os.path.dirname(list_fnames[-1]), exist_ok=True)
    # write the list
    make_pairs(intersection_utters, list_fnames[-1])

Wrote 0 of 5000 utterances in 0.30498290061950684 seconds (0.30498290061950684 seconds from start)
Wrote 1000 of 5000 utterances in 260.7658450603485 seconds (261.070827960968 seconds from start)
Wrote 2000 of 5000 utterances in 199.21984457969666 seconds (460.2906725406647 seconds from start)


In [None]:
# create the balanced lists with equal positive and negative test pairs
def balance_pairs(unbalanced_pairs):
    return pd.concat([unbalanced_pairs[unbalanced_pairs[0] == 1].reset_index(drop=True),\
               unbalanced_pairs[unbalanced_pairs[0] == 0].sample(sum(unbalanced_pairs[0] == 1)).\
               reset_index(drop=True)]).sort_index().reset_index(drop=True)

for list_fname in list_fnames:
    try:
        print(f"Running {list_fname}")
        # set new file name
        new_fname = list_fname.replace('.txt', '_balanced.txt')
        # read in old data and etract balanced list
        pairs = pd.read_csv(list_fname, header=None, sep=' ')
        balanced_pairs = balance_pairs(pairs)
        # write the new list
        balanced_pairs.to_csv(new_fname, index=False, header=None, sep=' ')
    except ValueError:
        print(f">> File is empty... skipping")
    except FileNotFoundError:
        print(f">> File not found... skipping")