In [4]:
import glob
import os
import pickle
import re
import sys

import numpy as np
import pandas as pd

## train-clean-360 

In [46]:
# Build the Kaldi-style index files (wav.scp, spk2utt, utt2spk) for train-clean-360.
libri_train360_dir = "/dataset/SV_sets/librispeech/LibriSpeech/train-clean-360/"

# The glob looks for .wav files exactly two directory levels below the subset root.
train360_wav_list = glob.glob(os.path.join(libri_train360_dir, '*', '*', '*.wav'))

train360_clean_df = pd.DataFrame(train360_wav_list, columns=['wav'])
# Utterance id: the file name minus its last 9 characters
# (presumably the "-norm.wav" suffix -- confirm against the files on disk).
train360_clean_df = train360_clean_df.assign(id=train360_clean_df.wav.apply(lambda x: x.split("/")[-1][:-9]))
# Speaker id: the part of the file name before the first "-".
train360_clean_df = train360_clean_df.assign(spk=train360_clean_df.wav.apply(lambda x: x.split("/")[-1].split("-")[0]))
train360_clean_df = train360_clean_df.set_index("id").sort_index()

# wav.scp: one "<utt-id> <wav-path>" line per utterance, in sorted utt-id order.
with open(os.path.join(libri_train360_dir, "wav.scp"), "w") as f:
    for utt_id, wav_path in zip(train360_clean_df.index, train360_clean_df.wav):
        f.write(utt_id+' '+wav_path+'\n')

# Group once instead of re-filtering the whole frame for every speaker.
# sort=False keeps speakers in order of first appearance, i.e. the same order
# that .spk.unique() produced.
spk2utt_dict = {
    spk_id: ' '.join(spk_rows.index.tolist())
    for spk_id, spk_rows in train360_clean_df.groupby("spk", sort=False)
}
spks = list(spk2utt_dict)

# spk2utt: one "<spk> <utt-id> <utt-id> ..." line per speaker.
with open(os.path.join(libri_train360_dir, "spk2utt"), "w") as f:
    for spk_id, utt_ids in spk2utt_dict.items():
        f.write(spk_id+" "+utt_ids+"\n")

# utt2spk: one "<utt-id> <spk>" line per utterance.
with open(os.path.join(libri_train360_dir, "utt2spk"), "w") as f:
    for utt_id, spk_id in zip(train360_clean_df.index, train360_clean_df.spk):
        f.write(utt_id+" "+spk_id+"\n")

## train-clean-100 

In [47]:
# Build the Kaldi-style index files (wav.scp, spk2utt, utt2spk) for train-clean-100.
libri_train100_dir = "/dataset/SV_sets/librispeech/LibriSpeech/train-clean-100/"

# The glob looks for .wav files exactly two directory levels below the subset root.
train100_wav_list = glob.glob(os.path.join(libri_train100_dir, '*', '*', '*.wav'))

train100_clean_df = pd.DataFrame(train100_wav_list, columns=['wav'])
# Utterance id: the file name minus its last 9 characters
# (presumably the "-norm.wav" suffix -- confirm against the files on disk).
train100_clean_df = train100_clean_df.assign(id=train100_clean_df.wav.apply(lambda x: x.split("/")[-1][:-9]))
# Speaker id: the part of the file name before the first "-".
train100_clean_df = train100_clean_df.assign(spk=train100_clean_df.wav.apply(lambda x: x.split("/")[-1].split("-")[0]))
train100_clean_df = train100_clean_df.set_index("id").sort_index()

# wav.scp: one "<utt-id> <wav-path>" line per utterance, in sorted utt-id order.
with open(os.path.join(libri_train100_dir, "wav.scp"), "w") as f:
    for utt_id, wav_path in zip(train100_clean_df.index, train100_clean_df.wav):
        f.write(utt_id+' '+wav_path+'\n')

# Group once instead of re-filtering the whole frame for every speaker.
# sort=False keeps speakers in order of first appearance, i.e. the same order
# that .spk.unique() produced.
spk2utt_dict = {
    spk_id: ' '.join(spk_rows.index.tolist())
    for spk_id, spk_rows in train100_clean_df.groupby("spk", sort=False)
}
spks = list(spk2utt_dict)

# spk2utt: one "<spk> <utt-id> <utt-id> ..." line per speaker.
with open(os.path.join(libri_train100_dir, "spk2utt"), "w") as f:
    for spk_id, utt_ids in spk2utt_dict.items():
        f.write(spk_id+" "+utt_ids+"\n")

# utt2spk: one "<utt-id> <spk>" line per utterance.
with open(os.path.join(libri_train100_dir, "utt2spk"), "w") as f:
    for utt_id, spk_id in zip(train100_clean_df.index, train100_clean_df.spk):
        f.write(utt_id+" "+spk_id+"\n")

## train-clean

In [84]:
# Merge the train-clean-100 and train-clean-360 tables into one "train-clean" set
# and write its Kaldi-style index files (wav.scp, spk2utt, utt2spk).
train_clean_dir = "/dataset/SV_sets/librispeech/LibriSpeech/train-clean/"

train_clean_df = pd.concat([train100_clean_df, train360_clean_df]).sort_index()

# wav.scp: one "<utt-id> <wav-path>" line per utterance, in sorted utt-id order.
with open(os.path.join(train_clean_dir, "wav.scp"), "w") as f:
    for utt_id, wav_path in zip(train_clean_df.index, train_clean_df.wav):
        f.write(utt_id+' '+wav_path+'\n')

# Group once instead of re-filtering the whole frame for every speaker.
# sort=False keeps speakers in order of first appearance, i.e. the same order
# that .spk.unique() produced.
spk2utt_dict = {
    spk_id: ' '.join(spk_rows.index.tolist())
    for spk_id, spk_rows in train_clean_df.groupby("spk", sort=False)
}
spks = list(spk2utt_dict)

# spk2utt: one "<spk> <utt-id> <utt-id> ..." line per speaker.
with open(os.path.join(train_clean_dir, "spk2utt"), "w") as f:
    for spk_id, utt_ids in spk2utt_dict.items():
        f.write(spk_id+" "+utt_ids+"\n")

# utt2spk: one "<utt-id> <spk>" line per utterance.
with open(os.path.join(train_clean_dir, "utt2spk"), "w") as f:
    for utt_id, spk_id in zip(train_clean_df.index, train_clean_df.spk):
        f.write(utt_id+" "+spk_id+"\n")

In [83]:
# Turn the merged table into a (file, spk) lookup and pickle it.
# The two columns are renamed positionally: the wav-path column becomes `file`.
train_clean_df.columns = ['file', 'spk']


def _train_clean_file_stem(wav_path):
    """Drop a trailing "-norm.wav" and point the path at the merged train-clean dir."""
    # str.rstrip("-norm.wav") strips any trailing run of those *characters*, not
    # the suffix, so it can eat into the name itself; remove the exact suffix instead.
    if wav_path.endswith("-norm.wav"):
        wav_path = wav_path[:-len("-norm.wav")]
    # Match the numeric subset tag explicitly instead of three arbitrary characters.
    return re.sub(r"train-clean-\d{3}", "train-clean", wav_path)


train_clean_df['file'] = train_clean_df.file.apply(_train_clean_file_stem)
train_clean_df.to_pickle("/dataset/SV_sets/librispeech/LibriSpeech/train-clean/train_df.pkl")

## dev-clean

In [42]:
# Build the Kaldi-style index files (wav.scp, spk2utt, utt2spk) for dev-clean.
# The root must be the subset directory itself: the glob below descends two
# levels, so rooting it at a single chapter directory (dev-clean/1272/128104/)
# matches no files and silently writes empty index files.
libri_dev_clean_dir = "/dataset/SV_sets/librispeech/LibriSpeech/dev-clean/"

dev_wav_list = glob.glob(os.path.join(libri_dev_clean_dir, '*', '*', '*.wav'))

dev_clean_df = pd.DataFrame(dev_wav_list, columns=['wav'])
# Utterance id: the file name minus its last 9 characters
# (presumably the "-norm.wav" suffix -- confirm against the files on disk).
dev_clean_df = dev_clean_df.assign(id=dev_clean_df.wav.apply(lambda x: x.split("/")[-1][:-9]))
# Speaker id: the part of the file name before the first "-".
dev_clean_df = dev_clean_df.assign(spk=dev_clean_df.wav.apply(lambda x: x.split("/")[-1].split("-")[0]))
dev_clean_df = dev_clean_df.set_index("id").sort_index()

# wav.scp: one "<utt-id> <wav-path>" line per utterance, in sorted utt-id order.
with open(os.path.join(libri_dev_clean_dir, "wav.scp"), "w") as f:
    for utt_id, wav_path in zip(dev_clean_df.index, dev_clean_df.wav):
        f.write(utt_id+' '+wav_path+'\n')

# Group once instead of re-filtering the whole frame for every speaker.
# sort=False keeps speakers in order of first appearance, i.e. the same order
# that .spk.unique() produced.
spk2utt_dict = {
    spk_id: ' '.join(spk_rows.index.tolist())
    for spk_id, spk_rows in dev_clean_df.groupby("spk", sort=False)
}
spks = list(spk2utt_dict)

# spk2utt: one "<spk> <utt-id> <utt-id> ..." line per speaker.
with open(os.path.join(libri_dev_clean_dir, "spk2utt"), "w") as f:
    for spk_id, utt_ids in spk2utt_dict.items():
        f.write(spk_id+" "+utt_ids+"\n")

# utt2spk: one "<utt-id> <spk>" line per utterance.
with open(os.path.join(libri_dev_clean_dir, "utt2spk"), "w") as f:
    for utt_id, spk_id in zip(dev_clean_df.index, dev_clean_df.spk):
        f.write(utt_id+" "+spk_id+"\n")

In [58]:
# Positional rename of the two columns: the wav-path column becomes `file`,
# the speaker column keeps the name `spk`.
dev_clean_df.columns = pd.Index(['file', 'spk'])

In [66]:
# Strip the "-norm.wav" suffix from every path, then pickle the (file, spk) table.
# str.rstrip("-norm.wav") removes any trailing run of those *characters* rather
# than the suffix, so the exact suffix is sliced off instead.
dev_clean_df['file'] = dev_clean_df.file.apply(
    lambda x: x[:-len("-norm.wav")] if x.endswith("-norm.wav") else x
)
dev_clean_df.to_pickle("/dataset/SV_sets/librispeech/LibriSpeech/dev-clean/dev_df.pkl")

## embeds to files 

In [98]:
def key2df(keys, delimeter="-"):
    """Build a per-utterance lookup table from utterance keys.

    Each key is split on ``delimeter``; the first field becomes ``spk`` and the
    second becomes ``session``.

    Parameters
    ----------
    keys : sequence of str
        Utterance keys, each containing at least one ``delimeter``.
    delimeter : str, default "-"
        Field separator inside a key (the spelling is kept for caller
        compatibility).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``key``, in input order, with columns ``spk``, ``session``,
        ``label`` (integer group number of the speaker, as assigned by
        ``groupby('spk').ngroup()``) and ``idx`` (row position, 0..n-1).

    Raises
    ------
    IndexError
        If a key has no ``delimeter`` and therefore no second field.
    """
    key_df = pd.DataFrame(keys, columns=['key'])

    # Split every key once and reuse the pieces for both derived columns.
    fields = [key.split(delimeter) for key in key_df.key]
    key_df['spk'] = [parts[0] for parts in fields]
    key_df['session'] = [parts[1] for parts in fields]

    key_df['label'] = key_df.groupby('spk').ngroup()
    key_df['idx'] = range(len(key_df))

    return key_df.set_index('key')

In [93]:
# Load the stored train and dev embedding arrays (train first, then dev).
train_embeds, dev_embeds = (
    np.load(npy_file)
    for npy_file in (
        "/dataset/SV_sets/librispeech/librispeech_embeds/train_embeds.npy",
        "/dataset/SV_sets/librispeech/librispeech_embeds/dev_embeds.npy",
    )
)

In [102]:
# Load the utterance keys for both embedding sets and build the train lookup table.
# NOTE: pickle.load can execute arbitrary code -- only load key files from a trusted source.
# Context managers close the file handles, which pickle.load(open(...)) never did.
with open("/dataset/SV_sets/librispeech/librispeech_embeds/train_keys.pkl", "rb") as f:
    train_keys = pickle.load(f)
with open("/dataset/SV_sets/librispeech/librispeech_embeds/dev_keys.pkl", "rb") as f:
    dev_keys = pickle.load(f)

train_df = key2df(train_keys)

In [103]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


def _length_normalize(embeds):
    """Rescale every row of `embeds` so that its L2 norm equals sqrt(number of columns)."""
    row_norms = np.linalg.norm(embeds, axis=1, keepdims=True)
    return embeds * np.sqrt(embeds.shape[1]) / row_norms


# Center both sets with the mean of the *training* embeddings.
train_embed_mean = train_embeds.mean(0)
mean_row = train_embed_mean.reshape(1, -1)
centered_train_embeds = train_embeds - mean_row
centered_dev_embeds = dev_embeds - mean_row

# Fit the LDA projection on the training speakers' labels, then project both sets.
clf = LDA(solver='svd', n_components=200).fit(centered_train_embeds, train_df.label)
lda_train_embeds = clf.transform(centered_train_embeds)
lda_dev_embeds = clf.transform(centered_dev_embeds)

# length normalization
ln_lda_train_embeds = _length_normalize(lda_train_embeds)
ln_lda_dev_embeds = _length_normalize(lda_dev_embeds)

In [108]:
# Persist the length-normalized LDA embeddings, train first and then dev.
for npy_target, embed_array in (
    ("/dataset/SV_sets/librispeech/librispeech_embeds/ln_lda_train_embeds.npy", ln_lda_train_embeds),
    ("/dataset/SV_sets/librispeech/librispeech_embeds/ln_lda_dev_embeds.npy", ln_lda_dev_embeds),
):
    np.save(npy_target, embed_array)

In [111]:
# Write one "<key>-xvec.npy" file per dev utterance under
# dev-clean/<first key field>/<second key field>/.

# zip() stops at the shorter input, so a count mismatch would silently skip
# utterances (or pair keys with the wrong rows); fail loudly instead.
if len(dev_keys) != len(ln_lda_dev_embeds):
    raise ValueError(
        "dev key/embedding count mismatch: %d keys vs %d embeddings"
        % (len(dev_keys), len(ln_lda_dev_embeds))
    )

# Split each key once and reuse the fields for both directory levels.
dev_embed_path = [
    os.path.join("/dataset/SV_sets/librispeech/LibriSpeech/dev-clean/",
                 parts[0], parts[1], key+"-xvec.npy")
    for key, parts in ((key, key.split('-')) for key in dev_keys)
]

for path, embed in zip(dev_embed_path, ln_lda_dev_embeds):
    np.save(path, embed)

In [115]:
# Write one "<key>-xvec.npy" file per train utterance under
# train-clean/<first key field>/<second key field>/.

# zip() stops at the shorter input, so a count mismatch would silently skip
# utterances (or pair keys with the wrong rows); fail loudly instead.
if len(train_keys) != len(ln_lda_train_embeds):
    raise ValueError(
        "train key/embedding count mismatch: %d keys vs %d embeddings"
        % (len(train_keys), len(ln_lda_train_embeds))
    )

# Split each key once and reuse the fields for both directory levels.
train_embed_path = [
    os.path.join("/dataset/SV_sets/librispeech/LibriSpeech/train-clean/",
                 parts[0], parts[1], key+"-xvec.npy")
    for key, parts in ((key, key.split('-')) for key in train_keys)
]

for path, embed in zip(train_embed_path, ln_lda_train_embeds):
    np.save(path, embed)