# Combine snippets
Given Whisper output for Tira and English, make a csv with labels from concatenated snippets

In [155]:
import pandas as pd
import os
from tqdm import tqdm
import librosa
import soundfile
tqdm.pandas()
from datasets import load_dataset

In [1]:
metadata='/Users/markjos/projects/malachor5/data/tira-snippets-30s/metadata.csv'
tira_snippets='/Users/markjos/projects/malachor5/data/tira-snippets-30s/tira_snippets_out.csv'
eng_snippets='/Users/markjos/projects/malachor5/data/tira-snippets-30s/tira-cm-eng.csv'

In [4]:
metadata_df=pd.read_csv(metadata)
tira_df=pd.read_csv(tira_snippets)
eng_snippets=pd.read_csv(eng_snippets)

metadata_df.shape, tira_df.shape, eng_snippets.shape

((35225, 52), (25773, 54), (9162, 53))

For some reason 290 snippets got stolen by gremlins >:(

In [5]:
35225-25773-9162

290

In [106]:
metadata_df['turn_label']

0                            VAD
1                            VAD
2                            VAD
3                            VAD
4        àprí jɜ̀dí ðáŋàlà
                  ...           
35220                        VAD
35221              lá vr̀ðɔ̀ðɔ́
35222              lá vr̀ðɔ̀ðɔ́
35223                        VAD
35224                        VAD
Name: turn_label, Length: 35225, dtype: object

## Merging datasets
Create a unique identifier for each snippet

In [37]:
metadata_df['snippet_name']=metadata_df['mother_clip']+metadata_df['clip_i'].apply(str)
tira_df['snippet_name']=tira_df['mother_clip']+tira_df['clip_i'].apply(str)
eng_snippets['snippet_name']=eng_snippets['mother_clip']+eng_snippets['clip_i'].apply(str)

In [110]:
def get_snippet_label(row):
    snippet_name=row['snippet_name']

    tira=None
    eng=None
    if row['turn_label']!='VAD':
        tira=row['turn_label']
    elif snippet_name in tira_df['snippet_name'].values:
        tira=tira_df.loc[tira_df['snippet_name']==snippet_name,'yoruba'].item()

    if snippet_name in eng_snippets['snippet_name'].values:
        eng=eng_snippets.loc[eng_snippets['snippet_name']==snippet_name,'english'].item()

    if tira and eng:
        print(snippet_name, tira, eng)
    elif (not tira) and (not eng):
        print(snippet_name)
    return tira or eng

get_snippet_label(metadata_df.iloc[0])

" There's no excuse for the taking."

In [111]:
metadata_df['snippet_transcription']=metadata_df.progress_apply(get_snippet_label, axis=1)

data/tira-asr/clips/HH02262021-m45s28ms065-m45s30ms126.wav(11, 11) jǎ ðáŋâl və̀lɛ̀ðà ndòbà  you
data/tira-asr/clips/HH11042020-Zoom-m01s55ms717-m01s57ms184.wav(6, 120.465) ðə̀və̀lɛ̀ðɔ́ úrnò  Okay.
data/tira-asr/clips/HH20220306-2-h01m09s22ms622-h01m09s26ms635.wav6
data/tira-asr/clips/HH20220403-2-m11s57ms608-m11s59ms006.wav10
data/tira-asr/clips/HH20220629-2-m15s55ms620-m15s58ms840.wav12
data/tira-asr/clips/HH20230414-Zoom-2-m05s27ms066-m05s28ms599.wav0
data/tira-asr/clips/HH20230724-m13s51ms360-m13s52ms540.wav(5, 5) làít̪ɔ̀ dìjɔ́ ɛɽɛ̀  Later, diole.
data/tira-asr/clips/HH20240223-m48s49ms927-m48s50ms659.wav(6, 6) ðə́və́lɛ́ðáló ðàbɽà  I'm going to that other part.


In [114]:
(~metadata_df['snippet_transcription'].isna()).value_counts()

snippet_transcription
True     35221
False        4
Name: count, dtype: int64

## Combine snippets
Create a new dataset with a single transcription for all snippets in a 30s chunk

In [124]:
timestamps_30s=metadata_df['mother_timestamps_30s'].apply(eval)

start_30s=timestamps_30s.apply(lambda l:l[0][0])
end_30s=timestamps_30s.apply(lambda l:l[-1][-1])

(end_30s-start_30s).mean()

27.6126124343506

In [140]:
(metadata_df['mother_end']-metadata_df['mother_start']).mean()/1_000

62.264683946061034

In [126]:
metadata_df['end_30s']=end_30s
metadata_df['start_30s']=start_30s

In [152]:
rows = []
has_snippet_label=metadata_df[~metadata_df['snippet_transcription'].isna()].copy()
out_clips='/Users/markjos/projects/malachor5/data/tira-code-mixed/clips'

for clip in tqdm(has_snippet_label['mother_clip'].unique()):
    clip_snippets=has_snippet_label.loc[has_snippet_label['mother_clip']==clip]
    clip_snippets['clip_i']=clip_snippets['clip_i'].apply(
        eval
    ).apply(
        lambda x: x[0] if type(x) is tuple else x
    )
    start_30s=clip_snippets['start_30s'].iloc[0]
    end_30s=clip_snippets['end_30s'].iloc[0]

    start_60s=clip_snippets['mother_start'].iloc[0]/1_000
    end_60s=clip_snippets['mother_end'].iloc[0]/1_000

    clip_name=clip_snippets['mother_path'].iloc[0]
    
    wav, sr=librosa.load('../data/tira-code-mixed-60s/clips/train/'+clip_name)
    start_frame=int((start_30s-start_60s)*sr)
    end_frame=int((end_30s-start_60s)*sr)
    clipped_wav=wav[start_frame:end_frame]
    new_clip_path = os.path.join(out_clips, clip_name)
    soundfile.write(new_clip_path, clipped_wav, sr)

    row={
        'file_name': os.path.join('clips', clip_name),
        'transcription': ' '.join(clip_snippets['snippet_transcription']),
    }
    rows.append(row)
len(rows)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clip_snippets['clip_i']=clip_snippets['clip_i'].apply(
100%|██████████| 4421/4421 [02:21<00:00, 31.31it/s]


4421

In [153]:
df_joined=pd.DataFrame(rows)
df_joined['transcription']

0        There's no excuse for the taking. ðùrɛ́ ŋìb...
1        Okay, you asked me to pull? Oh, you thought y...
2        Yes, if it's a way, it's a print, you have a ...
3        Okay lìdèlì ðə̀bágɔ̀ ɔ́ŋ pɔ́lì àprí ja...
4        You heard it as high lálɔ́vɔ̀ là  Would you...
                              ...                        
4416    íjɛ́s lá vr̀ðɜ̀ìnɔ́ lávr̀ðɔ̀ ðìnɔ̀ lá vr...
4417     What could you also say? lá vr̀ðìðú  Yeah,...
4418    lá vr̀ðɔ̀ðìðɔ́ lá vr̀ðɔ̀ðìðɔ́  Oh, good. l...
4419     What took me off the train and what I expecte...
4420    lɛ́làrɛ́ lɛ̀nðí lá vr̀ðɔ̀ðɔ́ ðɛ́t̪ɔ́ ðɛ̀dɔ̀...
Name: transcription, Length: 4421, dtype: object

In [154]:
tira_cm_path='/Users/markjos/projects/malachor5/data/tira-code-mixed/'
tira_cm_metadata='/Users/markjos/projects/malachor5/data/tira-code-mixed/metadata.csv'
df_joined.to_csv(tira_cm_metadata, index=False)

# Dataset creation
Let's make this data boi, upload, train, and **go to bed**

In [156]:
ds=load_dataset('audiofolder', data_dir=tira_cm_path)
ds

Resolving data files: 100%|██████████| 4422/4422 [00:00<00:00, 30109.06it/s]
Downloading data files: 100%|██████████| 4422/4422 [00:00<00:00, 61254.98it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Generating train split: 4421 examples [00:00, 9125.37 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 4421
    })
})

In [158]:
ds_path='/Users/markjos/projects/malachor5/data/pyarrow-datasets/tira-code-mixed'
ds.save_to_disk(ds_path)

Saving the dataset (11/11 shards): 100%|██████████| 4421/4421 [00:23<00:00, 186.69 examples/s]
