In [64]:
from dataloading import load_tira_asr
import pandas as pd
from unidecode import unidecode
from tqdm import tqdm

In [65]:
ds=load_tira_asr()
ds

Dataset({
    features: ['audio', 'start', 'end', 'transcription', 'eaf_source', 'wav_source', 'raw_transcription', 'clip', 'wav_rawpath', 'path', 'allosaurus', 'clap_ipa_cos_sim', 'wada_snr', 'nist_stnr', 'speaker-diarization-3.1', 'voice-activity-detection', 'whisper-large-v3', 'clapipa-transcription-allosaurus', 'vad_s', 'drz_s', 'trans_len', 'pcnt_speech', 'trans_len_sq', 'trans_len_log', 'cos_sim_softmax', 'cos_sim_log', 'duration', 'filestem', 'rewritten_transcript'],
    num_rows: 20480
})

## Merged transcripts
Find instances where FST normalization caused several dissimilar hand-transcribed sentences to merge and save as a .csv file.

In [66]:
merged_csv = 'data/labels/keyphrases_rewritten_merges.csv'

In [67]:
colmap = {'transcription': 'eaf_text', 'rewritten_transcript': 'fst_text'}
cols_to_drop = set(ds.column_names)-set(colmap.keys())
ds_noaudio = ds.remove_columns(cols_to_drop)
df = ds_noaudio.to_pandas()
df = df.rename(columns=colmap)
df.head()

Unnamed: 0,eaf_text,fst_text
0,àprí jɜ̀dí ðáŋàlà,àpɾí jàdí ðáŋàlà
1,àprí jɜ̀dí ðáŋàlà,àpɾí jàdí ðáŋàlà
2,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà
3,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà
4,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà


In [68]:
fst_to_eaf = {}
fst_unique = df['fst_text'].unique().tolist()
eaf_unique = df['eaf_text'].unique().tolist()
for fst_text in tqdm(fst_unique):
    mask = df['fst_text'] == fst_text
    eaf_text = df.loc[mask, 'eaf_text'].unique().tolist()
    fst_to_eaf[fst_text] = eaf_text
len(fst_unique), len(eaf_unique)

  0%|          | 0/8322 [00:00<?, ?it/s]

100%|██████████| 8322/8322 [00:05<00:00, 1567.86it/s]


(8322, 9399)

In [69]:
unique_phrase_df = df.drop_duplicates(subset=['eaf_text'])
print(df.shape, unique_phrase_df.shape)

unique_phrase_df['num_eaf_variants'] = unique_phrase_df['fst_text'].map(lambda x: len(fst_to_eaf[x]))
unique_phrase_df = unique_phrase_df.sort_values('num_eaf_variants', ascending=False)
unique_phrase_df=unique_phrase_df.set_index('fst_text')
unique_phrase_df.head()

(20480, 2) (9399, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_phrase_df['num_eaf_variants'] = unique_phrase_df['fst_text'].map(lambda x: len(fst_to_eaf[x]))


Unnamed: 0_level_0,eaf_text,num_eaf_variants
fst_text,Unnamed: 1_level_1,Unnamed: 2_level_1
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðǎ ndɔ̀bà,8
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðà ndɔ̀bàgɛ̀,8
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðá nd̪ɔ̀bàgɛ̀,8
láló və́lɛ̀ðà nd̪ɔ̀bà,lǎlóvə́lɛ̂ðà nd̪ɔ̀bàgɛ̀,8
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̀ðà nd̪ɔ̀bàgɛ̀,8


In [70]:
unique_phrase_df['num_eaf_variants'].value_counts()

num_eaf_variants
1    7435
2    1480
3     360
4      76
5      20
7      14
8       8
6       6
Name: count, dtype: int64

In [71]:
unique_phrase_df.to_csv(merged_csv)

## Keyphrase selection
Visualize keyphrases with 10 or more tokens and select the set to be used for KWS.

In [72]:
keyphrase_counts = df['fst_text'].value_counts()
keyphrase_counts.head(20)

fst_text
ŋgɛ́ĺ íŋgá t̪ə̀vɔ́ ðár       21
láló və́lɛ̀ðà nd̪ɔ̀bà        20
léɲâl və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́       19
lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́        19
làlló və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́       17
ìrɔ́ jìcə̀lò                  17
ðɔ́ íjɔ̀                        17
lâl òló və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́    16
lɛ̀ĺ və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́         16
vŕðɔ́ álɛ́                     16
lə̀və̀lɛ̀ðáɲâl únɛ́ɾɛ́        15
ɽa t̪ə́mànì                    15
áɔ̀nt̪ɔ́ ɛ́ɽɛ́                  15
kàŋɛ̂ və̀lɛ̀ðɛ̀ ùnɛ́ɾɛ́        14
lə̀və̀lɛ̀ðɜ́l únɛ́ɾɛ́           14
ɛ̀là                            14
léɲâĺ və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́      14
kúkù kə̀ŋàcáŋà r̀ŋɔ̀        14
ðá ɲɔ́ðɔ́ və́lɛ̀ðà nd̪ɔ̀bà    13
lá vŕðìnɔ̀ nd̪ɔ̀bà           13
Name: count, dtype: int64