In [1]:
import os
import sys

# Project root that every relative path in this notebook is resolved against.
# Set the MALACHOR5_ROOT environment variable to run on another machine; the
# default keeps the original checkout location.
PROJECT_ROOT = os.environ.get('MALACHOR5_ROOT', '/Users/markjos/projects/malachor5')
os.chdir(PROJECT_ROOT)
# Make the project's `scripts/` directory importable. The guard keeps
# re-running this cell from appending the same entry again.
if 'scripts' not in sys.path:
    sys.path.append('scripts')
from longform import load_and_resample
import torchaudio
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
torchvision is not available - cannot save figures


In [2]:
# --- Corpus inputs (relative to the working directory) ---
audio_path = 'data/SASOC/audio'
zulu_xml = 'data/SASOC/balanced_engzul.xml'
xslt_path = 'data/SASOC/tabulate_corpus.xslt'

# --- Utterance-id lists used to assign the dev and test splits ---
dev_ids_path = 'data/SASOC/soapies_dev_and_test_set_utterance_ids/cs_engzul_balanced/transcriptions/engzul_dev_set_utterance_ids.txt'
test_ids_path = 'data/SASOC/soapies_dev_and_test_set_utterance_ids/cs_engzul_balanced/transcriptions/engzul_tst_set_utterance_ids.txt'

# --- Output directories, one per language subset ---
cs_ds_path = 'data/hf-datasets/sasoc-cs'
zul_ds_path = 'data/hf-datasets/sasoc-zul'
eng_ds_path = 'data/hf-datasets/sasoc-eng'

# Subset label -> output directory; insertion order is the processing order
# of any loop over this mapping.
ds_paths = dict(
    codeswitched=cs_ds_path,
    zul=zul_ds_path,
    eng=eng_ds_path,
)

In [4]:
# Parse the corpus XML into a DataFrame; the XSLT stylesheet is applied to the
# document before pandas reads it.
df = pd.read_xml(
    path_or_buffer=zulu_xml,
    stylesheet=xslt_path,
)
df.head()

Unnamed: 0,index,transcription,lang_id,speaker_id,audio,duration
0,0,i had no idea so much preparation went into a ...,eng,AKHONA,AKHONA_13-02-12_101.wav,3654.0
1,0,then you should know that i will go very far t...,eng,AKHONA,AKHONA_13-02-12_149.wav,3597.0
2,0,wenzani,zul,SENZO,SENZO_13-02-12_179.wav,428.0
3,0,yini indaba,zul,SENZO,SENZO_13-02-12_181.wav,396.0
4,0,ufunani,zul,SENZO,SENZO_13-02-12_182.wav,344.0


In [None]:
# Read the official dev/test utterance-id lists (one id per line).
with open(dev_ids_path) as f:
    dev_ids = [x.strip() for x in f]
with open(test_ids_path) as f:
    test_ids = [x.strip() for x in f]

# Set copies for membership tests: checking every row against a list is a
# linear scan per row. The lists above are kept unchanged for any other use.
_dev_id_set = frozenset(dev_ids)
_test_id_set = frozenset(test_ids)


def is_dev(s):
    """Return True if audio file name `s` (minus a trailing '.wav') is a dev-set id."""
    return s.removesuffix('.wav') in _dev_id_set


def is_test(s):
    """Return True if audio file name `s` (minus a trailing '.wav') is a test-set id."""
    return s.removesuffix('.wav') in _test_id_set


def get_split(s):
    """Map an audio file name to 'dev', 'test' or 'train'.

    Dev membership is checked first, then test; anything in neither list is
    treated as training data.
    """
    if is_dev(s):
        return 'dev'
    if is_test(s):
        return 'test'
    return 'train'


df['split'] = df['audio'].apply(get_split)
df['split'].value_counts()

split
train    13357
test      2232
dev        598
Name: count, dtype: int64

In [None]:
# save deduped transcripts for training LM
unique_sentences = df['transcription'].unique()
zulu_txt_path = 'data/SASOC/balanced_engzul_train_deduped.txt'
with open(zulu_txt_path, 'w') as f:
    for sentence in unique_sentences:
        f.write(sentence + '\n')

In [5]:
# Collapse the table to one row per audio file:
#   - transcriptions are joined with spaces, language tags with commas
#     (in row order within each file),
#   - durations are summed,
#   - speaker and split are taken from the first row of each file.
grouped_df = df.groupby('audio').agg(
    transcription=('transcription', ' '.join),
    lang_id=('lang_id', ','.join),
    duration=('duration', 'sum'),
    speaker_id=('speaker_id', 'first'),
    split=('split', 'first'),
)
# How many files have each language-tag sequence.
grouped_df['lang_id'].value_counts()

lang_id
zul                                                    4362
zul,eng                                                1239
eng                                                    1225
zul,eng,zul                                             789
eng,zul                                                 738
eng,zul,eng                                             318
zul,eng,zul,eng                                         248
eng,zul,eng,zul                                         151
zul,eng,zul,eng,zul                                     119
eng,zul,eng,zul,eng                                      59
zul,eng,zul,eng,zul,eng                                  42
eng,zul,eng,zul,eng,zul                                  24
zul,eng,zul,eng,zul,eng,zul                              24
eng,zul,eng,zul,eng,zul,eng                              12
zul,eng,zul,eng,zul,eng,zul,eng                           6
zul,eng,zul,eng,zul,eng,zul,eng,zul                       6
eng,zul,eng,zul,eng,zul,eng,zul 

In [6]:
# Total duration of files tagged only 'eng', divided by 60_000
# (presumably milliseconds -> minutes; confirm the unit of `duration`).
eng_only = grouped_df['lang_id'].eq('eng')
grouped_df.loc[eng_only, 'duration'].sum() / 60_000

92.73778333333334

In [7]:
# Total duration of files tagged only 'zul', divided by 60_000
# (presumably milliseconds -> minutes; confirm the unit of `duration`).
zul_only = grouped_df['lang_id'].eq('zul')
grouped_df.loc[zul_only, 'duration'].sum() / 60_000

92.76985

In [8]:
# Total duration of files whose joined lang_id contains a comma (more than one
# language tag), divided by 60_000 (presumably milliseconds -> minutes;
# confirm the unit of `duration`).
has_comma = grouped_df['lang_id'].str.contains(',')
grouped_df.loc[has_comma, 'duration'].sum() / 60_000

430.43915

In [9]:
# File-level language label: keep the lang_id as-is when it is a single tag,
# and use 'codeswitched' whenever the joined lang_id contains a comma.
has_comma = grouped_df['lang_id'].str.contains(',')
grouped_df['lang_id_utt'] = grouped_df['lang_id'].mask(has_comma, 'codeswitched')
# Summed duration per (label, split) cell.
grouped_df.pivot_table(
    index='lang_id_utt',
    columns='split',
    values='duration',
    aggfunc='sum',
)

split,dev,test,train
lang_id_utt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
codeswitched,1326038.0,5823728.0,18676583.0
eng,837.0,1598.0,5561832.0
zul,,,5566191.0


In [10]:
# Files that are outside the train split and are not labelled codeswitched.
outside_train = grouped_df['split'].ne('train')
not_codeswitched = grouped_df['lang_id_utt'].ne('codeswitched')
grouped_df[outside_train & not_codeswitched]

Unnamed: 0_level_0,transcription,lang_id,duration,speaker_id,split,lang_id_utt
audio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BEE_13-12-09_149.wav,whoa whoa,eng,1598.0,BEE,test,eng
CHOPPA_12-10-30_306.wav,do i look like i!m joking,eng,837.0,CHOPPA,dev,eng


In [32]:
# Export one folder per language subset: a metadata.csv at the folder root
# plus the audio files under <ds_path>/<split>/<file name>.
for lang, ds_path in ds_paths.items():
    os.makedirs(ds_path, exist_ok=True)
    # Only the codeswitched subset is exported with dev/test folders; the
    # monolingual subsets are exported as train only.
    exported_splits = ['train', 'dev', 'test'] if lang == 'codeswitched' else ['train']
    has_lang = grouped_df['lang_id_utt']==lang
    # Restrict the metadata to rows whose audio is actually written below, so
    # metadata.csv never references a file that does not exist in the folder.
    is_exported = grouped_df['split'].isin(exported_splits)
    # .copy(): the frame is modified below, and assigning a column on a
    # filtered slice of grouped_df triggers pandas' SettingWithCopy warning.
    lang_df = grouped_df[has_lang & is_exported].copy()
    # Path of each file relative to ds_path, e.g. 'train/<audio file name>'.
    lang_df['file_name'] = lang_df['split'] + '/' + lang_df.index
    lang_df.index.names = ['src_file']
    lang_df.to_csv(
        os.path.join(ds_path,'metadata.csv'),
    )
    for split in exported_splits:
        os.makedirs(os.path.join(ds_path,split), exist_ok=True)
        has_split = lang_df['split']==split
        split_df = lang_df[has_split]
        print(f"Saving audio files for {split} split of {lang} dataset")
        for audio in tqdm(split_df.index):
            tgt_path = os.path.join(ds_path,split,audio)
            src_path = os.path.join(audio_path,audio)
            # Skip files that were already written by a previous run.
            if not os.path.exists(tgt_path):
                torchaudio.save(
                    tgt_path,
                    load_and_resample(src_path),
                    # Sample rate recorded in the output file; presumably
                    # matches what load_and_resample produces (confirm).
                    16_000
                )

Saving audio files for train split of codeswitched dataset


100%|██████████| 2793/2793 [00:00<00:00, 44620.43it/s]


Saving audio files for dev split of codeswitched dataset


100%|██████████| 224/224 [00:00<00:00, 55787.90it/s]


Saving audio files for test split of codeswitched dataset


100%|██████████| 767/767 [00:00<00:00, 40186.77it/s]


Saving audio files for train split of zul dataset


100%|██████████| 4362/4362 [00:00<00:00, 73852.20it/s]


Saving audio files for train split of eng dataset


100%|██████████| 1223/1223 [00:00<00:00, 15327.35it/s]


In [34]:
# Load every exported folder with the 'audiofolder' builder and save the
# result to disk under the same path with 'hf-datasets' replaced by
# 'pyarrow-datasets'.
for ds_path in ds_paths.values():
    outpath = ds_path.replace('hf-datasets', 'pyarrow-datasets')
    ds = load_dataset(path='audiofolder', data_dir=ds_path)
    ds.save_to_disk(outpath)

Resolving data files: 100%|██████████| 2793/2793 [00:00<00:00, 95457.92it/s]
Resolving data files: 100%|██████████| 224/224 [00:00<00:00, 451694.28it/s]
Resolving data files: 100%|██████████| 767/767 [00:00<00:00, 449306.03it/s]
Saving the dataset (1/1 shards): 100%|██████████| 2793/2793 [00:02<00:00, 1087.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 224/224 [00:00<00:00, 752.46 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 767/767 [00:00<00:00, 991.62 examples/s]
Resolving data files: 100%|██████████| 4362/4362 [00:00<00:00, 28830.33it/s]
Saving the dataset (1/1 shards): 100%|██████████| 4362/4362 [00:03<00:00, 1378.56 examples/s]
Resolving data files: 100%|██████████| 1223/1223 [00:00<00:00, 442935.31it/s]
Saving the dataset (1/1 shards): 100%|██████████| 1223/1223 [00:01<00:00, 687.07 examples/s]
