In [64]:
import numpy as np
import pandas as pd 
from src.common.constants import Constants as consts

# Base name shared by the embeddings (.npy) and metadata (.csv) files.
FILE_NAME = "wavlm_extracted"

# Both files live side by side in the collected-data directory.
COLLECTED_DIR = consts.data_dir / "collected_data" / "spoof_and_bonafide"
embeddings = COLLECTED_DIR / f"{FILE_NAME}.npy"
metadata = COLLECTED_DIR / f"{FILE_NAME}.csv"

# True only when both input files are present on disk.
print("Do files exist?", all(path.exists() for path in (embeddings, metadata)))

Do files exist? True


In [65]:
# Load the per-sample metadata table and the embedding matrix.
meta_df = pd.read_csv(metadata)

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code when the file comes from an untrusted source. It is kept
# here so that loading behaves exactly as before; if the array is a plain numeric
# array, allow_pickle=False is the safer setting - TODO confirm and switch.
embeddings_array = np.load(embeddings, allow_pickle=True)

# The two files presumably describe the same samples row by row (TODO confirm);
# fail early if their row counts disagree instead of silently mis-pairing them.
assert len(meta_df) == embeddings_array.shape[0], (
    f"metadata has {len(meta_df)} rows but embeddings have "
    f"{embeddings_array.shape[0]} rows"
)

In [66]:
# Spot-check a single metadata row by position. 243835 is the position that the
# return_index_of_key lookup in this notebook printed for the
# "mls-vocoders-full_band_melgan_test" key.
probe_row = meta_df.iloc[243835]
print(probe_row)

key_id            mls-vocoders-full_band_melgan_test/10_pth_1022...
duration                                                        4.0
starting_point                                                  0.0
target                                                        spoof
Name: 243835, dtype: object


In [3]:
# Overview of what was loaded: column names and the shapes of both objects.
print(meta_df.columns)
print(f"Metadata shape: {meta_df.shape}")
print(f"Embeddings shape: {embeddings_array.shape}")

Index(['key_id', 'duration', 'starting_point', 'target'], dtype='object')
Metadata shape: (268964, 4)
Embeddings shape: (268964, 768)


In [4]:
# Distinct class labels found in the 'target' column.
print(meta_df['target'].unique())

# Per-class count of non-null values in each of the remaining columns.
target_stats = meta_df.groupby('target').count()

# Class imbalance, measured on the key_id counts.
n_spoof = target_stats.loc['spoof', 'key_id']
n_bonafide = target_stats.loc['bonafide', 'key_id']
print(f"{n_spoof / n_bonafide:.2f} x more spoof samples than bonafide samples.")
target_stats

['bonafide' 'spoof']
4.12 x more spoof samples than bonafide samples.


Unnamed: 0_level_0,key_id,duration,starting_point
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bonafide,52512,52512,52512
spoof,216452,216452,216452


In [9]:
# Print every key that contains both "dev" and "1_0".
# NOTE(review): both checks are plain substring tests, so "1_0" also matches
# keys such as "11_0" or "101_0", and the output can be very long.
for key in meta_df['key_id']:
    if "dev" not in key or "1_0" not in key:
        continue
    print(key)

None_dev/1_0
None_dev/1_0
None_dev/1_0
None_dev/1_0
None_dev/1_0
None_dev/1_0
None_dev/1_0
None_dev/11_0
None_dev/11_0
None_dev/11_0
None_dev/11_0
None_dev/11_0
None_dev/11_0
None_dev/11_0
None_dev/21_0
None_dev/21_0
None_dev/21_0
None_dev/21_0
None_dev/21_0
None_dev/21_0
None_dev/21_0
None_dev/21_0
None_dev/21_0
None_dev/31_0
None_dev/31_0
None_dev/31_0
None_dev/31_0
None_dev/31_0
None_dev/41_0
None_dev/41_0
None_dev/41_0
None_dev/41_0
None_dev/41_0
None_dev/41_0
None_dev/51_0
None_dev/51_0
None_dev/51_0
None_dev/51_0
None_dev/51_0
None_dev/51_0
None_dev/51_0
None_dev/61_0
None_dev/61_0
None_dev/61_0
None_dev/61_0
None_dev/61_0
None_dev/71_0
None_dev/71_0
None_dev/71_0
None_dev/71_0
None_dev/71_0
None_dev/71_0
None_dev/71_0
None_dev/81_0
None_dev/81_0
None_dev/81_0
None_dev/81_0
None_dev/81_0
None_dev/91_0
None_dev/91_0
None_dev/91_0
None_dev/91_0
None_dev/91_0
None_dev/91_0
None_dev/91_0
None_dev/101_0
None_dev/101_0
None_dev/101_0
None_dev/101_0
None_dev/101_0
None_dev/101_0
None_de

In [63]:
def change_config_in_key_id(key : str, config_name : str ='mls_eng'):
    """Replace the part of ``key`` before its first '_' with ``config_name``.

    The text up to and including the first underscore is dropped and
    ``config_name + "/"`` is put in front of what remains. When ``key`` has no
    underscore, nothing is dropped and the whole key is kept after the prefix.

    Args:
        key: Original key, e.g. ``"None_dev/1_0"``.
        config_name: Name to put in front of the remainder of the key.

    Returns:
        The rewritten key, e.g. ``"mls_eng/dev/1_0"``.
    """
    # split(..., 1)[-1] is the text after the first '_', or the whole key
    # when there is no '_' at all.
    remainder = key.split('_', 1)[-1]
    return f"{config_name}/{remainder}"

def remove_trash_suffix_from_key(key : str):
    """Cut ``key`` at the first '_' that appears after its first '/'.

    Example: ``"mls-vocoders-full_band_melgan_test/10_pth_1022"`` becomes
    ``"mls-vocoders-full_band_melgan_test/10"``.

    Args:
        key: Key of the form ``"<prefix>/<record>[_<suffix>]"``.

    Returns:
        The key without the suffix. The key is returned unchanged when it has
        no '/' or when no '_' follows the first '/'.
    """
    slash_idx = key.find('/')
    if slash_idx == -1:
        # Without this guard the -1 would be used as the start of the '_'
        # search, which means "start at the last character" and would strip
        # a trailing underscore from keys that have no '/' at all.
        return key
    idx = key.find('_', slash_idx)
    if idx == -1:
        return key
    return key[:idx]
    

# Ad-hoc check of the key helpers on a made-up key.
test_word = "dupa_123_456/789_test"
print(change_config_in_key_id(test_word))

# Position of the first '_' found at or after the first '/'.
slash_pos = test_word.find('/')
idx = test_word.find('_', slash_pos)
suffix = test_word[idx:]
print(suffix)
print(idx)

mls_eng/123_456/789_test
_test
16


In [39]:
def _replace_missing_config(key):
    """Rewrite the config prefix of keys that contain "None"; leave others as-is."""
    if "None" in key:
        return change_config_in_key_id(key)
    return key


# Both steps overwrite meta_df["key_id"] in place; the raw keys are only
# available again after reloading the CSV.
meta_df["key_id"] = meta_df["key_id"].apply(_replace_missing_config)
print(meta_df.head())

meta_df["key_id"] = meta_df["key_id"].apply(remove_trash_suffix_from_key)
x = meta_df['key_id'].unique()

def return_index_of_key(key: str, key_list: np.ndarray) -> int:
    """Return the position of the first element of ``key_list`` equal to ``key``.

    Args:
        key: Value to look for.
        key_list: Sequence of keys, scanned from the start.

    Returns:
        Zero-based position of the first match, or -1 when nothing matches.
    """
    matches = (position for position, candidate in enumerate(key_list) if candidate == key)
    return next(matches, -1)

# For every unique key mentioning "mls-vocoders-full_band_melgan_test", check
# that the text after the first '/' parses as an integer; report keys where
# it does not.
for key in x:
    if "mls-vocoders-full_band_melgan_test" not in key:
        continue
    idx = key.find('/')
    # When there is no '/', idx is -1 and the whole key is parsed instead.
    record_part = key[idx + 1:]
    try:
        number = int(record_part)
    except ValueError:
        print(f"Key: {key} does not have a valid number after the slash.")

         key_id  duration  starting_point    target
0  mls_eng/dev/       4.0             0.0  bonafide
1  mls_eng/dev/       4.0             2.0  bonafide
2  mls_eng/dev/       4.0             4.0  bonafide
3  mls_eng/dev/       4.0             6.0  bonafide
4  mls_eng/dev/       4.0             8.0  bonafide
Key: mls-vocoders-full_band_melgan_test does not have a valid number after the slash.
Key: mls-vocoders-full_band_melgan_test/ does not have a valid number after the slash.


In [34]:
# Position of the first row whose key is exactly the bare config/split name,
# i.e. a key that has no record id after it.
all_keys = meta_df['key_id'].values
bare_key_position = return_index_of_key("mls-vocoders-full_band_melgan_test", all_keys)
print(bare_key_position)

243835


In [68]:
# Collect the suffix-stripped form of every key that mentions
# "mls-vocoders-full_band_melgan_test". Nothing is printed here; the cell only
# builds key_set.
key_set = {
    remove_trash_suffix_from_key(k)
    for k in meta_df['key_id'].values
    if "mls-vocoders-full_band_melgan_test" in k
}
        

In [70]:
# Collect the suffix-stripped form of every key that mentions
# "mls-vocoders-full_band_melgan_dev", and report rows whose stripped key is
# just the bare config/split name (with or without a trailing slash).
key_set = set()
bare_keys = ("mls-vocoders-full_band_melgan_dev", "mls-vocoders-full_band_melgan_dev/")
for i, k in enumerate(meta_df['key_id'].values):
    if "mls-vocoders-full_band_melgan_dev" not in k:
        continue
    rm_suffix = remove_trash_suffix_from_key(k)
    key_set.add(rm_suffix)
    if rm_suffix in bare_keys:
        print(f"Index: {i}, Key: {k} does not have a valid number after the slash.")

In [71]:
# Sort the collected keys and peek at both ends of the list.
# The keys are strings, so the order is lexicographic ("10" sorts before "2"):
# the last entries are not necessarily the numerically largest record ids.
key_lst = sorted(key_set)

# [-4:] rather than [len - 4:len]: with fewer than four keys the explicit start
# index goes negative and wraps around (a 3-item list would show only its last
# element); the negative-slice form always shows up to four trailing items.
print(key_lst[-4:])
print(key_lst[:4])

['mls-vocoders-full_band_melgan_dev/996', 'mls-vocoders-full_band_melgan_dev/997', 'mls-vocoders-full_band_melgan_dev/998', 'mls-vocoders-full_band_melgan_dev/999']
['mls-vocoders-full_band_melgan_dev/0', 'mls-vocoders-full_band_melgan_dev/1', 'mls-vocoders-full_band_melgan_dev/10', 'mls-vocoders-full_band_melgan_dev/100']


In [67]:
# Strip the trailing "_..." suffix from every key; key_id is overwritten in place.
meta_df['key_id'] = meta_df['key_id'].apply(remove_trash_suffix_from_key)

In [59]:
# Spot-check a single metadata row by position. 243835 is the position that the
# return_index_of_key lookup in this notebook printed for the
# "mls-vocoders-full_band_melgan_test" key.
probe_row = meta_df.iloc[243835]
print(probe_row)

key_id            mls-vocoders-full_band_melgan_test/10
duration                                            4.0
starting_point                                      0.0
target                                            spoof
Name: 243835, dtype: object


In [6]:
# Config name returned as the first element by get_config_split_record_id.
CONFIG = 'mls-tts-bark'
# Not referenced in this cell; kept because other cells may read it.
SPLIT = 'dev'

def get_config_split_record_id(record):
    """Split a record's ``'__key__'`` into (config, split, record_id).

    The key is expected to look like ``"<split>/<record_id>[_<suffix>]"``,
    e.g. ``"dev/12345_1_0"`` gives ``('mls-tts-bark', 'dev', '12345')``.

    Args:
        record: Mapping that holds the key string under ``'__key__'``.

    Returns:
        Tuple ``(CONFIG, split, record_id)``. ``split`` is the text before the
        first '/'. ``record_id`` is the text after that '/', up to (not
        including) the next '_'. When the key has no '/', ``split`` is the
        empty string and ``record_id`` is taken from the start of the key.

    Raises:
        KeyError: If ``record`` has no ``'__key__'`` entry.
    """
    key = record['__key__']
    split, slash, remainder = key.partition('/')
    if not slash:
        # No '/' in the key. Index arithmetic on find()'s -1 would chop the
        # last character off the key and return that as the split; report
        # "no split" explicitly instead.
        split, remainder = '', key
    # Everything before the first '_' (or the whole remainder when there is none).
    record_id = remainder.partition('_')[0]
    return CONFIG, split, record_id
    
# Two sample records: one key with a "_..." suffix after the record id, one without.
record_example = {'__key__': 'dev/12345_1_0'}
record_example_1 = {'__key__': 'dev/67890'}
for sample_record in (record_example, record_example_1):
    print(get_config_split_record_id(sample_record))

('mls-tts-bark', 'dev', '12345')
('mls-tts-bark', 'dev', '67890')
