In [None]:
import os
import pandas as pd

In [None]:
# Presumably the root directory of the VOiCES speaker-ID development audio.
# NOTE(review): none of the cells visible below read this variable; they spell
# out their own absolute paths instead.
data_dir = "/dataset/SV_sets/voices/Development_Data/Speaker_Recognition/sid_dev/"

### Dev Enroll List

In [None]:
# Build the enrollment half of the dev metadata table.
# dev-enroll.lst is read as two space-separated columns: an utterance id and a
# wav path.
dev_enroll_list = pd.read_csv(
    "/dataset/SV_sets/voices/original_data/sid_dev_lists_and_keys/dev-enroll.lst",
    names=["id", "wav"],
    delimiter=" ",
)

# `file` is the wav path without its first path component and without the
# ".wav" extension.  The extension is removed with an anchored regex:
# str.rstrip(".wav") strips any trailing run of the characters ".", "w", "a",
# "v", so it would also eat the end of a stem such as "...-va".
dev_enroll_list["file"] = (
    dev_enroll_list.wav
    .apply(lambda path: "/".join(path.split("/")[1:]))
    .str.replace(r"\.wav$", "", regex=True)
)

# The id is a "-"-separated record.  The first three fields are dropped and the
# remaining nine become metadata columns (the assignment below fails loudly if
# the field count is not nine).
info_df = dev_enroll_list.id.str.split("-", expand=True).iloc[:, 3:]
info_df.columns = ["room", "noise", "spk_id", "ch_id", "sg_id", "mc_id", "mc_t", "mc_l", "mc_deg"]

# Drop the raw wav path on a copy instead of mutating in place, then tag the rows.
dev_enroll_df = pd.concat([dev_enroll_list.drop(columns="wav"), info_df], axis=1)
dev_enroll_df["set"] = "enroll"

In [None]:
# Preview the first five rows of the enrollment table.
dev_enroll_df.head(n=5)

### Dev Test List 

In [None]:
# Build the test half of the dev metadata table.
# dev-test.lst is read as a single column stored as `id`; the same value is
# treated as a path when deriving `file`.
dev_test_list = pd.read_csv(
    "/dataset/SV_sets/voices/original_data/sid_dev_lists_and_keys/dev-test.lst",
    names=["id"],
)

# `file` is the path without its first path component and without the ".wav"
# extension.  An anchored regex is used because str.rstrip(".wav") strips any
# trailing run of the characters ".", "w", "a", "v", not the literal suffix.
dev_test_list["file"] = (
    dev_test_list.id
    .apply(lambda path: "/".join(path.split("/")[1:]))
    .str.replace(r"\.wav$", "", regex=True)
)

# Split the extension-less id on "-"; the first three fields are dropped and
# the remaining nine become metadata columns.  Removing the extension first
# keeps it out of the last field.
info_df = (
    dev_test_list.id
    .str.replace(r"\.wav$", "", regex=True)
    .str.split("-", expand=True)
    .iloc[:, 3:]
)
info_df.columns = ["room", "noise", "spk_id", "ch_id", "sg_id", "mc_id", "mc_t", "mc_l", "mc_deg"]

dev_test_df = pd.concat([dev_test_list, info_df], axis=1)
dev_test_df["set"] = "test"

In [None]:
# Preview the first five rows of the test table.
dev_test_df.head(n=5)

In [None]:
from IPython.display import Audio

In [None]:
Audio("/dataset/SV_sets/voices/sid_dev/sp0032/Lab41-SRI-VOiCES-rm2-babb-sp0032-ch021625-sg00

In [None]:
Audio("/dataset/SV_sets/voices/sid_dev/sp0032/Lab41-SRI-VOiCES-rm2-babb-sp0032-ch021631-sg00

### Total Dev List

In [None]:
# Stack the enrollment and test rows into one dev table.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.  Without it
# each half keeps its own labels, so labels repeat and label-based lookups or
# alignment on dev_df become ambiguous.  The CSV itself is unaffected because
# it is written without the index.
dev_df = pd.concat([dev_enroll_df, dev_test_df], sort=False, ignore_index=True)
dev_df.to_csv("/dataset/SV_sets/voices/voices_dev.csv", index=False)

In [None]:
# Write a Kaldi-style wav.scp: one "<id> <wav path>" line per dev utterance.
wavs = dev_df.file.apply(lambda x: "sid_dev/" + x + ".wav")
ids = dev_df.id
pd.DataFrame(ids).assign(wav=wavs).to_csv(
    "/dataset/SV_sets/voices/kaldi_files/wav.scp", index=False, sep=' ', header=None
)

# Map each speaker to the list of its utterance ids.  Selecting the "id"
# column before apply() avoids running groupby.apply over the whole frame.
spk2utt = dev_df.sort_values("spk_id").groupby("spk_id")["id"].apply(list)

# Write spk2utt: one "<spk_id> <utt_id> <utt_id> ..." line per speaker.
with open("/dataset/SV_sets/voices/kaldi_files/spk2utt", "w") as f:
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for spk, utt_ids in spk2utt.items():
        f.write(' '.join([spk] + utt_ids) + '\n')

### Dev Distribution

In [None]:
# Number of rows per (speaker, ch_id) pair.
dev_df.groupby("spk_id")["ch_id"].value_counts()

In [None]:
# Row count per room: enrollment room is "rm1", test room is "rm2".
dev_df["room"].value_counts()

In [None]:
# Row count per noise condition.
dev_df["noise"].value_counts()

In [None]:
# Row count per speaker.
dev_df["spk_id"].value_counts()

### VOiCES trials

In [None]:
# Load the dev trial key: one space-separated (enroll_id, test_id, label)
# triple per line.
trial_list = pd.read_csv(
    "/dataset/SV_sets/voices/original_data/sid_dev_lists_and_keys/dev-trial-keys.lst",
    sep=" ",
    names=["enroll_id", "test_id", "label"],
)
# Left disabled: test ids stay in their raw form so they can be looked up
# against dev_df.id as-is.
# trial_list.test_id = trial_list.test_id.apply(lambda x: x.split("/")[2].rstrip(".wav"))

In [None]:
# Row position of every utterance id in dev_df (a repeated id keeps its last
# position).
dev_ids = dev_df["id"].tolist()
id2idx = dict(zip(dev_ids, range(len(dev_ids))))

# Translate both sides of each trial into row positions; an id missing from
# dev_df raises KeyError.
enr_idx, test_idx = (
    trial_list[side].map(id2idx.__getitem__) for side in ("enroll_id", "test_id")
)
trial_list["enroll_idx"] = enr_idx
trial_list["test_idx"] = test_idx

# Recode the textual labels in place: impostor -> 0, target -> 1.  Any other
# value is left untouched.
for text_label, code in (("imp", 0), ("tgt", 1)):
    trial_list.loc[trial_list.label == text_label, "label"] = code

trial_list.to_csv("/dataset/SV_sets/voices/voices_dev_trial.csv", index=False)

In [None]:
# Number of trials per label value.
trial_list["label"].value_counts()

In [None]:
# Both numbers are typed in by hand.
# NOTE(review): presumably the two label counts from the dev trial list (the
# ratio is first / second, not first / total) — confirm against the data.
numerator, denominator = 20096, 3985792
print(f"target_ratio: {numerator / denominator}")

## Eval Set

무조건 id는 패턴을 맞춰주는게 좋다. speaker-id로 시작해야 sorting이 올바르게 된다.

In [None]:
# Load the eval enrollment list (id, file) and the eval test list (path only).
eval_enroll = pd.read_csv("/dataset/SV_sets/voices/eval_set/sid_eval_lists/eval-enroll.lst", delimiter=" ", names=["id", "file"])
eval_test = pd.read_csv("/dataset/SV_sets/voices/eval_set/sid_eval_lists/eval-test.lst", delimiter=" ", names=["id"])

# For test rows the listed path becomes `file`; the id is the second path
# component without the ".wav" extension.  An anchored regex is used because
# str.rstrip(".wav") strips any trailing run of the characters ".", "w", "a",
# "v", so a stem ending in one of those letters would be shortened.
eval_test["file"] = eval_test.id
eval_test["id"] = (
    eval_test.id
    .apply(lambda path: path.split("/")[1])
    .str.replace(r"\.wav$", "", regex=True)
)

# ignore_index=True yields the same 0..n-1 index the original obtained with a
# trailing reset_index(drop=True).
eval_df = pd.concat([eval_enroll, eval_test], ignore_index=True)

# Every row gets its own placeholder speaker label ("spk00000", "spk00001", ...)
# and the utterance id is prefixed with that label.
fake_spk_id = ["spk" + str(i).zfill(5) for i in range(len(eval_df))]
eval_df["spk_id"] = fake_spk_id
eval_df["uttr_id"] = eval_df.spk_id + "/" + eval_df.id
eval_df["file"] = eval_df.file.str.replace(r"\.wav$", "", regex=True)

# Unlike the dev CSVs, this file is written with the index column.
eval_df.to_csv("/dataset/SV_sets/voices/eval_set/voices_eval.csv")

In [None]:
# Display the combined eval table (enrollment rows followed by test rows).
eval_df

In [None]:
# Write the eval wav.scp: one "<uttr_id> <file>" line per row.
# NOTE(review): eval_df.file is written as-is.  The cell that builds eval_df
# strips ".wav" from it, whereas the dev wav.scp re-adds a "sid_dev/" prefix
# and ".wav" — confirm the consumer expects extension-less paths here.
wavs, ids = eval_df["file"], eval_df["uttr_id"]
scp_table = ids.to_frame().assign(wav=wavs)
scp_table.to_csv(
    "/dataset/SV_sets/voices/eval_set/kaldi_files/wav.scp",
    index=False,
    sep=' ',
    header=None,
)

In [None]:
# Map each speaker label to the list of its utterance ids.  Selecting the
# "uttr_id" column before apply() avoids running groupby.apply over the whole
# frame.
spk2utt = eval_df.sort_values("spk_id").groupby("spk_id")["uttr_id"].apply(list)

# Write spk2utt: one "<spk_id> <uttr_id> ..." line per speaker.
with open("/dataset/SV_sets/voices/eval_set/kaldi_files/spk2utt", "w") as f:
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for spk, utt_ids in spk2utt.items():
        f.write(' '.join([spk] + utt_ids) + '\n')

In [None]:
# Load the eval trial pairs; this list has no label column.
eval_trial = pd.read_csv("/dataset/SV_sets/voices/eval_set/sid_eval_lists/eval-trial.lst", delimiter=" ", names=["enroll_id", "test_id"])

# Reduce each test path to its second path component without the ".wav"
# extension.  An anchored regex is used because str.rstrip(".wav") strips any
# trailing run of the characters ".", "w", "a", "v", not the literal suffix.
eval_trial["test_id"] = (
    eval_trial.test_id
    .apply(lambda path: path.split("/")[1])
    .str.replace(r"\.wav$", "", regex=True)
)

# The dev-style id -> row-index lookup is left disabled for the eval trials.
# eval_ids = eval_df.id.tolist()
# id2idx = {v:i for i, v in enumerate(eval_ids)}
# enr_idx = eval_trial.enroll_id.apply(lambda x: id2idx[x])
# test_idx = eval_trial.test_id.apply(lambda x: id2idx[x])
# eval_trial['enroll_idx'] = enr_idx
# eval_trial['test_idx'] = test_idx

# Write the trials as a CSV with header, and as a headerless space-separated
# "<enroll_id> <test_id>" list.
eval_trial.to_csv("/dataset/SV_sets/voices/voices_eval_trial.csv", index=False)
eval_trial[["enroll_id", "test_id"]].to_csv("/dataset/SV_sets/voices/eval_set/kaldi_files/voices_eval_sv", sep=' ', header=None, index=False)

## Join to VoxCeleb12

In [None]:
# Load the two VoxCeleb1+2 tables (the "si" and "sv" CSVs).
voxc12_si, voxc12_sv = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/dataset/SV_sets/voxceleb12/dataframes/voxc12_si.csv",
        "/dataset/SV_sets/voxceleb12/dataframes/voxc12_sv.csv",
    )
)

In [None]:
# Stack both VoxCeleb tables and keep only the id / speaker / file columns.
wanted_columns = ['id', 'spk', 'file']
voxc12_dev = pd.concat([voxc12_si, voxc12_sv])[wanted_columns]

In [None]:
# Reload the dev table written earlier in this notebook.
voices_dev = pd.read_csv(
    filepath_or_buffer="/dataset/SV_sets/voices/voices_dev.csv",
)

In [None]:
# Keep only the id / speaker / file columns and name the speaker column "spk".
# NOTE(review): the original cell ended with `voices_dev.columns = ` and no
# right-hand side, which is a SyntaxError.  Renaming spk_id -> spk is assumed
# from the ['id', 'spk', 'file'] selection applied to voxc12_dev — confirm.
# Renaming before selecting keeps the cell safe to re-run: rename() is a no-op
# once "spk_id" is gone.
voices_dev = voices_dev.rename(columns={"spk_id": "spk"})[["id", "spk", "file"]]

In [None]:
# Show the column labels of voices_dev.
voices_dev.columns