In [1]:
import pandas as pd
import os, errno
import time

In [2]:
# Load the VoxCeleb1 metadata table. The file is tab-separated and the path is
# relative, so it is resolved against the kernel's working directory.
meta = pd.read_csv('vox1_meta.csv', sep='\t')

In [3]:
# Number of metadata rows per nationality; value_counts orders the result from
# most to least frequent.
meta["Nationality"].value_counts()

USA                    799
UK                     215
Canada                  54
Australia               37
India                   26
Norway                  20
Ireland                 18
Germany                  9
New Zealand              8
Italy                    8
Mexico                   7
Sweden                   5
Russia                   4
Spain                    4
Switzerland              3
Chile                    3
Philippines              3
Netherlands              3
Croatia                  3
Denmark                  3
Poland                   2
China                    2
Portugal                 2
South Korea              1
Brazil                   1
Sri Lanka                1
Israel                   1
Trinidad and Tobago      1
Iran                     1
Singapore                1
Austria                  1
Sudan                    1
France                   1
South Africa             1
Guyana                   1
Pakistan                 1
Name: Nationality, dtype: int64

In [4]:
# Display the metadata frame to inspect its columns and a few sample rows.
meta

Unnamed: 0,VoxCeleb1 ID,VGGFace1 ID,Gender,Nationality,Set
0,id10001,A.J._Buckley,m,Ireland,dev
1,id10002,A.R._Rahman,m,India,dev
2,id10003,Aamir_Khan,m,India,dev
3,id10004,Aaron_Tveit,m,USA,dev
4,id10005,Aaron_Yoo,m,USA,dev
...,...,...,...,...,...
1246,id11247,Zachary_Levi,m,USA,dev
1247,id11248,Zachary_Quinto,m,USA,dev
1248,id11249,Zack_Snyder,m,USA,dev
1249,id11250,Zoe_Saldana,f,USA,dev


In [5]:
def GetIds(male=True):
    """Return the 'VoxCeleb1 ID' column of `meta` restricted to one gender.

    Parameters
    ----------
    male : bool, default True
        Truthy selects rows whose "Gender" is 'm'; falsy selects 'f'.

    Returns
    -------
    pandas.Series
        The matching IDs, keeping the row labels they have in `meta`.
    """
    gender_code = 'f'
    if male:
        gender_code = 'm'
    has_gender = meta["Gender"] == gender_code
    return meta.loc[has_gender, 'VoxCeleb1 ID']

In [6]:
# Hold the per-gender speaker IDs as sets: the utterance-filtering cells test
# membership against them once per utterance row.
male_ids = set(GetIds(male=True))
female_ids = set(GetIds(male=False))

In [7]:
# Speaker IDs for the seven most frequent nationalities in the metadata.
# Pull the two columns out once, then select the ID series with a boolean mask.
nationality = meta["Nationality"]
speaker_ids = meta['VoxCeleb1 ID']

USA_ids = speaker_ids[nationality == 'USA']
UK_ids = speaker_ids[nationality == 'UK']
Canada_ids = speaker_ids[nationality == 'Canada']
Australia_ids = speaker_ids[nationality == 'Australia']
India_ids = speaker_ids[nationality == 'India']
Norway_ids = speaker_ids[nationality == 'Norway']
Ireland_ids = speaker_ids[nationality == 'Ireland']

In [8]:
# Load the full utterance list. The file is space-separated with no header row,
# so the columns are labelled 0 and 1; the preview of the sampled rows further
# down shows a speaker ID in column 0 and a relative .wav path in column 1.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable data directory.
utters = pd.read_csv('/home/jupyter/voxceleb-fairness/data/datasets/full/vox1_full_utterances.txt', header=None, sep=' ')

In [9]:
# Draw a fixed-size random sample of the utterances spoken by male speakers.
# - `isin` performs the membership test against the ID set in one vectorised
#   call instead of a Python-level loop over every row.
# - `random_state` pins the draw so that Restart & Run All reproduces the same
#   sample (and therefore the same pair lists downstream).
# - `reset_index(drop=True)` renumbers the sampled rows 0..n-1.
male_utters = (
    utters[utters[0].isin(male_ids)]
    .sample(5000, random_state=0)
    .reset_index(drop=True)
)

In [10]:
# Preview the sampled male utterances (column 0: speaker ID, column 1: .wav path).
male_utters

Unnamed: 0,0,1
0,id11230,id11230/NzkV5Ct5L6A/00009.wav
1,id10845,id10845/i5mhrsoDndM/00001.wav
2,id11121,id11121/VEe3X5EH6oM/00007.wav
3,id10490,id10490/EGCzwG5t7OU/00002.wav
4,id10105,id10105/wrHyTrC37FQ/00158.wav
...,...,...
4995,id10412,id10412/UN6BeUs8td4/00008.wav
4996,id10018,id10018/BaesL7QJLFU/00027.wav
4997,id10920,id10920/kgqSlgRoBy4/00002.wav
4998,id10836,id10836/f6MLIpHjyn8/00015.wav


In [11]:
# Draw a fixed-size random sample of the utterances spoken by female speakers.
# - `isin` performs the membership test against the ID set in one vectorised
#   call instead of a Python-level loop over every row.
# - `random_state` pins the draw so that Restart & Run All reproduces the same
#   sample (and therefore the same pair lists downstream).
# - `reset_index(drop=True)` renumbers the sampled rows 0..n-1.
female_utters = (
    utters[utters[0].isin(female_ids)]
    .sample(5000, random_state=0)
    .reset_index(drop=True)
)

In [12]:
def silentremove(filename):
    """Delete `filename`, treating "file does not exist" as success.

    Any other OSError raised by the removal (for example a permission problem)
    is propagated to the caller unchanged.
    """
    try:
        os.remove(filename)
    except OSError as err:
        # ENOENT means there was nothing to delete, which is fine here.
        if err.errno == errno.ENOENT:
            return
        raise

In [13]:
def make_pairs(utters_df, filename):
    """Write every unordered pair of utterances in `utters_df` to `filename`.

    Each output line is ``<label> <path_a> <path_b>`` (space-separated, no
    header), where ``label`` is 1 when the two rows have equal values in
    column 0 and 0 otherwise. Pairs are emitted in row order: row i is paired
    with every later row j > i.

    Parameters
    ----------
    utters_df : pandas.DataFrame
        Column 0 is compared to decide the label; column 1 is written out as
        the utterance path. Rows are read by position, so the frame does not
        need a 0..n-1 index.
    filename : str
        Output path. An existing file is removed first; the pairs for each
        anchor row are then appended in turn. If `utters_df` has no rows the
        file is not created.

    Notes
    -----
    The number of pairs grows quadratically with the number of rows. Progress
    is printed for every 1000th anchor row.
    """
    silentremove(filename)

    # Extract both columns once as plain lists. Positional list access avoids
    # the label-based Series lookups, which fail (or pick the wrong rows) when
    # the index is not 0..n-1 and are slow inside the inner loop.
    speakers = utters_df[0].tolist()
    paths = utters_df[1].tolist()
    total = len(speakers)

    start = time.time()
    current_iter_start = start
    for i in range(total):
        anchor_speaker = speakers[i]
        anchor_path = paths[i]
        data = [
            [1 if anchor_speaker == speakers[j] else 0, anchor_path, paths[j]]
            for j in range(i + 1, total)
        ]
        # Append this anchor's pairs immediately so the full pair list never
        # has to be held in memory at once.
        pd.DataFrame(data).to_csv(filename, mode='a', index=False, header=None, sep=' ')
        if i % 1000 == 0:
            current_iter_end = time.time()
            print('Wrote {} of {} utterances in {} seconds ({} seconds from start)'.format(i, total, current_iter_end - current_iter_start, current_iter_end - start))
            current_iter_start = current_iter_end

In [14]:
# Write all pairs of the sampled male utterances to disk. This is a
# long-running cell: the logged run below took several minutes.
make_pairs(male_utters, '/home/jupyter/voxceleb-fairness/data/lists/vox1_male_all.txt')

Wrote 0 of 5000 utterances in 0.11998939514160156 seconds (0.11998939514160156 seconds from start)
Wrote 1000 of 5000 utterances in 105.78200435638428 seconds (105.90199375152588 seconds from start)
Wrote 2000 of 5000 utterances in 83.18765497207642 seconds (189.0896487236023 seconds from start)
Wrote 3000 of 5000 utterances in 58.98132395744324 seconds (248.07097268104553 seconds from start)
Wrote 4000 of 5000 utterances in 35.662431478500366 seconds (283.7334041595459 seconds from start)


In [15]:
# Read the pair list back in. With no header row the columns are labelled
# 0 (same-speaker label written by make_pairs), 1 and 2 (the two utterance paths).
male_pairs = pd.read_csv('/home/jupyter/voxceleb-fairness/data/lists/vox1_male_all.txt', header=None, sep=' ')

In [16]:
# Balance the male trial list: keep every same-speaker pair (label 1) and draw
# an equally sized random subset of the different-speaker pairs (label 0).
# `random_state` pins the draw so re-running the notebook gives the same list.
male_positive_pairs = male_pairs[male_pairs[0] == 1].reset_index(drop=True)
male_negative_pairs = (
    male_pairs[male_pairs[0] == 0]
    .sample(len(male_positive_pairs), random_state=0)
    .reset_index(drop=True)
)
# Both halves are numbered 0..n-1, so sorting the concatenation by index
# interleaves positives and negatives before the final renumbering.
balanced_male_pairs = (
    pd.concat([male_positive_pairs, male_negative_pairs])
    .sort_index()
    .reset_index(drop=True)
)

In [27]:
# Preview the balanced male pair list (column 0: label, columns 1-2: utterance paths).
balanced_male_pairs

Unnamed: 0,0,1,2
0,1,id11230/NzkV5Ct5L6A/00009.wav,id11230/QT0T8hWKh1Y/00007.wav
1,0,id11184/2fq0yy2gN9k/00034.wav,id10538/R8cb0F2NUcU/00001.wav
2,1,id11230/NzkV5Ct5L6A/00009.wav,id11230/HqqLQgVZktQ/00001.wav
3,0,id10143/69XxH60bd4Y/00003.wav,id10918/giRD3g15ZSg/00003.wav
4,1,id11230/NzkV5Ct5L6A/00009.wav,id11230/ADLO9eWGUIY/00011.wav
...,...,...,...
52783,1,id10635/3euqU5PKafw/00004.wav,id10635/phVMDrsjxrk/00015.wav
52784,0,id10719/3kRa0SluANU/00004.wav,id10991/46hVZI15pmE/00004.wav
52785,1,id10540/x056ioqqn8A/00003.wav,id10540/G2W41pvvZs0/00007.wav
52786,1,id10484/qBrXmKPpFYY/00007.wav,id10484/qBrXmKPpFYY/00028.wav


In [18]:
# Write all pairs of the sampled female utterances to disk. This is a
# long-running cell: the logged run below took several minutes.
make_pairs(female_utters, '/home/jupyter/voxceleb-fairness/data/lists/vox1_female_all.txt')

Wrote 0 of 5000 utterances in 0.1213064193725586 seconds (0.1213064193725586 seconds from start)
Wrote 1000 of 5000 utterances in 106.44076418876648 seconds (106.56207060813904 seconds from start)
Wrote 2000 of 5000 utterances in 84.26895356178284 seconds (190.83102416992188 seconds from start)
Wrote 3000 of 5000 utterances in 59.418402433395386 seconds (250.24942660331726 seconds from start)
Wrote 4000 of 5000 utterances in 36.00027275085449 seconds (286.24969935417175 seconds from start)


In [19]:
# Read the pair list back in. With no header row the columns are labelled
# 0 (same-speaker label written by make_pairs), 1 and 2 (the two utterance paths).
female_pairs = pd.read_csv('/home/jupyter/voxceleb-fairness/data/lists/vox1_female_all.txt', header=None, sep=' ')

In [20]:
# Balance the female trial list: keep every same-speaker pair (label 1) and
# draw an equally sized random subset of the different-speaker pairs (label 0).
# `random_state` pins the draw so re-running the notebook gives the same list.
female_positive_pairs = female_pairs[female_pairs[0] == 1].reset_index(drop=True)
female_negative_pairs = (
    female_pairs[female_pairs[0] == 0]
    .sample(len(female_positive_pairs), random_state=0)
    .reset_index(drop=True)
)
# Both halves are numbered 0..n-1, so sorting the concatenation by index
# interleaves positives and negatives before the final renumbering.
balanced_female_pairs = (
    pd.concat([female_positive_pairs, female_negative_pairs])
    .sort_index()
    .reset_index(drop=True)
)

In [21]:
# Preview the balanced female pair list (column 0: label, columns 1-2: utterance paths).
balanced_female_pairs

Unnamed: 0,0,1,2
0,1,id10795/z5_ZEt9c-IQ/00003.wav,id10795/InEZE75bHjU/00004.wav
1,0,id10682/iOpqNblDlJU/00001.wav,id11203/987UrtpOII8/00005.wav
2,1,id10795/z5_ZEt9c-IQ/00003.wav,id10795/InEZE75bHjU/00005.wav
3,0,id10909/M3rfGq1-lXg/00003.wav,id10439/bLufSCk9P2k/00043.wav
4,1,id10795/z5_ZEt9c-IQ/00003.wav,id10795/6WpEq7PXG0M/00002.wav
...,...,...,...
59627,1,id10387/4ry5D-jgSQU/00005.wav,id10387/7d4PiFleEVM/00001.wav
59628,0,id10123/Qw8-jKhzwEg/00006.wav,id10407/kHiKqkxnypg/00016.wav
59629,1,id10061/8GGP83uAxI4/00010.wav,id10061/SFMhpLiLgWo/00014.wav
59630,1,id10615/C9MSVzhtB8A/00005.wav,id10615/Bi7kCsbg5L0/00008.wav


In [22]:
# Save the balanced male pairs in the same space-separated, header-less,
# index-less layout that make_pairs writes.
balanced_male_pairs.to_csv('/home/jupyter/voxceleb-fairness/data/lists/vox1_male_all_balanced.txt', index=False, header=None, sep=' ')

In [23]:
# Save the balanced female pairs in the same space-separated, header-less,
# index-less layout that make_pairs writes.
balanced_female_pairs.to_csv('/home/jupyter/voxceleb-fairness/data/lists/vox1_female_all_balanced.txt', index=False, header=None, sep=' ')