In [2]:
from dataloading import load_tira_asr
import pandas as pd
from unidecode import unidecode
from tqdm import tqdm

In [3]:
ds=load_tira_asr()
ds

Dataset({
    features: ['audio', 'start', 'end', 'transcription', 'eaf_source', 'wav_source', 'raw_transcription', 'clip', 'wav_rawpath', 'path', 'allosaurus', 'clap_ipa_cos_sim', 'wada_snr', 'nist_stnr', 'speaker-diarization-3.1', 'voice-activity-detection', 'whisper-large-v3', 'clapipa-transcription-allosaurus', 'vad_s', 'drz_s', 'trans_len', 'pcnt_speech', 'trans_len_sq', 'trans_len_log', 'cos_sim_softmax', 'cos_sim_log', 'duration', 'filestem', 'rewritten_transcript'],
    num_rows: 20480
})

## Merged transcripts
Find instances where FST normalization caused several dissimilar hand-transcribed sentences to merge and save as a .csv file.

In [4]:
merged_csv = 'data/labels/keyphrases_rewritten_merges.csv'

In [5]:
colmap = {'transcription': 'eaf_text', 'rewritten_transcript': 'fst_text'}
cols_to_drop = set(ds.column_names)-set(colmap.keys())
ds_noaudio = ds.remove_columns(cols_to_drop)
df = ds_noaudio.to_pandas()
df = df.rename(columns=colmap)
df.head()

Unnamed: 0,eaf_text,fst_text
0,àprí jɜ̀dí ðáŋàlà,àpɾí jàdí ðáŋàlà
1,àprí jɜ̀dí ðáŋàlà,àpɾí jàdí ðáŋàlà
2,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà
3,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà
4,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà


In [6]:
fst_to_eaf = {}
fst_unique = df['fst_text'].unique().tolist()
eaf_unique = df['eaf_text'].unique().tolist()
for fst_text in tqdm(fst_unique):
    mask = df['fst_text'] == fst_text
    eaf_text = df.loc[mask, 'eaf_text'].unique().tolist()
    fst_to_eaf[fst_text] = eaf_text
len(fst_unique), len(eaf_unique)

100%|██████████| 8322/8322 [00:05<00:00, 1538.67it/s]


(8322, 9399)

In [7]:
unique_phrase_df = df.drop_duplicates(subset=['eaf_text'])
print(df.shape, unique_phrase_df.shape)

unique_phrase_df['num_eaf_variants'] = unique_phrase_df['fst_text'].map(lambda x: len(fst_to_eaf[x]))
unique_phrase_df = unique_phrase_df.sort_values('num_eaf_variants', ascending=False)
unique_phrase_df=unique_phrase_df.set_index('fst_text')
unique_phrase_df.head()

(20480, 2) (9399, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_phrase_df['num_eaf_variants'] = unique_phrase_df['fst_text'].map(lambda x: len(fst_to_eaf[x]))


Unnamed: 0_level_0,eaf_text,num_eaf_variants
fst_text,Unnamed: 1_level_1,Unnamed: 2_level_1
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðǎ ndɔ̀bà,8
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðà ndɔ̀bàgɛ̀,8
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðá nd̪ɔ̀bàgɛ̀,8
láló və́lɛ̀ðà nd̪ɔ̀bà,lǎlóvə́lɛ̂ðà nd̪ɔ̀bàgɛ̀,8
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̀ðà nd̪ɔ̀bàgɛ̀,8


In [8]:
unique_phrase_df['num_eaf_variants'].value_counts()

num_eaf_variants
1    7435
2    1480
3     360
4      76
5      20
7      14
8       8
6       6
Name: count, dtype: int64

In [28]:
all_phrases = unique_phrase_df.index.unique().tolist()
all_phrases[:10], len(all_phrases)

(['láló və́lɛ̀ðà nd̪ɔ̀bà',
  'lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́',
  'ɛ̀màð kìcə̀lò',
  'íŋgá ɾɔ́ðà',
  'ágá úrɔ̀ ká nɔ́nà ìmìdánŋwɔ́',
  'ɛ̀lɛ̀',
  'duɾ',
  'kə̀və̀lɛ̀ðɔ́l ùnɛ́ɾɛ́',
  'èd̪ɛ̀ɾɛ́ kìcə̀lò',
  'lɛ̀ĺ və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́'],
 8322)

## Keyphrase selection
Visualize keyphrases with 10 or more tokens and select the set to be used for KWS.

In [13]:
keyphrase_counts = df['fst_text'].value_counts()
keyphrase_counts.head(10)

fst_text
ŋgɛ́ĺ íŋgá t̪ə̀vɔ́ ðár       21
láló və́lɛ̀ðà nd̪ɔ̀bà        20
léɲâl və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́       19
lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́        19
làlló və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́       17
ìrɔ́ jìcə̀lò                  17
ðɔ́ íjɔ̀                        17
lâl òló və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́    16
lɛ̀ĺ və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́         16
vŕðɔ́ álɛ́                     16
Name: count, dtype: int64

In [11]:
unique_phrase_df['token_count'] = keyphrase_counts
unique_phrase_df.head()

Unnamed: 0_level_0,eaf_text,num_eaf_variants,token_count
fst_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðǎ ndɔ̀bà,8,20
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðà ndɔ̀bàgɛ̀,8,20
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̂ðá nd̪ɔ̀bàgɛ̀,8,20
láló və́lɛ̀ðà nd̪ɔ̀bà,lǎlóvə́lɛ̂ðà nd̪ɔ̀bàgɛ̀,8,20
láló və́lɛ̀ðà nd̪ɔ̀bà,lálóvə́lɛ̀ðà nd̪ɔ̀bàgɛ̀,8,20


In [16]:
mask = unique_phrase_df['token_count'] > 10
high_freq_phrases = unique_phrase_df[mask]
high_freq_phrases.shape

(99, 3)

In [35]:
pd.Series(high_freq_phrases.index.str.len()).describe()

count    99.000000
mean     21.191919
std       7.492414
min       5.000000
25%      15.000000
50%      24.000000
75%      25.000000
max      37.000000
Name: fst_text, dtype: float64

For now, just sample randomly. Eventually want to hand-pick keyphrases.

In [17]:
num_keyphrases = 30
keyphrase_set = high_freq_phrases.sample(30)
keyphrase_set.head()

Unnamed: 0_level_0,eaf_text,num_eaf_variants,token_count
fst_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
íŋgá ɾɔ́ðà,íŋgɔ́ɾɔ́ðà,6,11
kàŋɛ̂ və̀lɛ̀ðɛ̀ ùnɛ́ɾɛ́,kàŋvə́lɛ̂ðɛ̀ ùnɛ̀ɾɛ̀,3,14
íŋgátɛ́ nɔ́nà jùɽáɾɛ́ t̪ə́mànì,íŋgáðɛ́nònǎ dùɽáɾɛ́ t̪ówànî,4,12
íŋgá ɾɔ́ðà,íŋgɔ́rɔ́ðà,6,11
ìrɔ́ jìcə̀lò,ìrɔ́ jɛ̀cə̀lò,3,17


Get negative keyphrases based on edit distance from positive keyphrase.

First, make a distance matrix showing pairwise Levenshtein edit distance between any two keyword pairs.

In [31]:
from Levenshtein import distance as levenshtein_distance
import numpy as np

levenshtein_matrix = np.zeros((len(all_phrases), len(all_phrases)), dtype=int)
for i, phrase1 in tqdm(enumerate(all_phrases), total=len(all_phrases)):
    for j, phrase2 in enumerate(all_phrases):
        dist = levenshtein_matrix[j, i] or levenshtein_distance(phrase1, phrase2)
        levenshtein_matrix[i, j] = dist

  0%|          | 0/8322 [00:00<?, ?it/s]

100%|██████████| 8322/8322 [00:23<00:00, 360.67it/s]


In [32]:
dists_flattened = levenshtein_matrix.flatten()
pd.Series(dists_flattened).describe()

count    6.925568e+07
mean     2.489973e+01
std      8.790929e+00
min      0.000000e+00
25%      1.900000e+01
50%      2.400000e+01
75%      2.900000e+01
max      1.160000e+02
dtype: float64

Now, for each positive keyphrase, bucket negative keyphrases based on the ratio of the edit distance to the keyphrase's length.
Then sample 30 keywords from 0-33% edit distance/length (hard), 33-66% (medium) and 66-100% (easy)


In [36]:
for keyphrase in keyphrase_set.index:
    keyphrase_idx = all_phrases.index(keyphrase)
    keyphrase_len = len(keyphrase)

    dists_to_keyphrase = levenshtein_matrix[keyphrase_idx, :]
    dists_to_keyphrase_norm = dists_to_keyphrase / keyphrase_len

    easy_mask = dists_to_keyphrase_norm > 0.67
    medium_mask = (dists_to_keyphrase_norm <= 0.67) & (dists_to_keyphrase_norm > 0.33)
    hard_mask = dists_to_keyphrase_norm <= 0.33

    print(f'Keyphrase: "{keyphrase}" (len={keyphrase_len})')
    print(f'  Easy: {np.sum(easy_mask)}')
    print(f'  Medium: {np.sum(medium_mask)}')
    print(f'  Hard: {np.sum(hard_mask)}')

Keyphrase: "íŋgá ɾɔ́ðà" (len=13)
  Easy: 8126
  Medium: 182
  Hard: 14
Keyphrase: "kàŋɛ̂ və̀lɛ̀ðɛ̀ ùnɛ́ɾɛ́" (len=25)
  Easy: 7375
  Medium: 873
  Hard: 74
Keyphrase: "íŋgátɛ́ nɔ́nà jùɽáɾɛ́ t̪ə́mànì" (len=37)
  Easy: 7055
  Medium: 1250
  Hard: 17
Keyphrase: "íŋgá ɾɔ́ðà" (len=13)
  Easy: 8126
  Medium: 182
  Hard: 14
Keyphrase: "ìrɔ́ jìcə̀lò" (len=15)
  Easy: 7669
  Medium: 596
  Hard: 57
Keyphrase: "láló və́lɛ̀ðà nd̪ɔ̀bà" (len=25)
  Easy: 6877
  Medium: 1313
  Hard: 132
Keyphrase: "lə̀və̀lɛ̀ðɜ́l únɛ́ɾɛ́" (len=22)
  Easy: 7702
  Medium: 548
  Hard: 72
Keyphrase: "àɾò kìcə̀lò" (len=15)
  Easy: 7712
  Medium: 531
  Hard: 79
Keyphrase: "láló və́lɛ̀ðà nd̪ɔ̀bà" (len=25)
  Easy: 6877
  Medium: 1313
  Hard: 132
Keyphrase: "lâl òló və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́" (len=29)
  Easy: 7072
  Medium: 1184
  Hard: 66
Keyphrase: "lə̀və̀lɛ̀ðɜ́l únɛ́ɾɛ́" (len=22)
  Easy: 7702
  Medium: 548
  Hard: 72
Keyphrase: "lə̀və̀lɛ̀ðáɲál únɛ́ɾɛ́" (len=25)
  Easy: 7559
  Medium: 680
  Hard:

In [23]:
non_keyphrase_mask = ~unique_phrase_df.index.isin(keyphrase_set.index)
non_keyphrase_df = unique_phrase_df[non_keyphrase_mask]
non_keyphrase_df.shape, non_keyphrase_df.index.nunique()

((9340, 3), 8304)

In [12]:
unique_phrase_df.to_csv(merged_csv)