In [83]:
import pandas as pd
from unidecode import unidecode
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import json
from collections import defaultdict

In [56]:
# Seed Python's `random` module so the random draws made later in this
# notebook (sanity-check indices, calibration sampling) are repeatable.
random_seed = 1337
random.seed(random_seed)

In [57]:
# using this until I figure out how to set jupyter working dir in PyCharm
# NOTE(review): the project location is hard-coded relative to the user's home
# directory; presumably the local imports in the next cell rely on this being
# the working directory — confirm before running on another machine.
import os
project_dir = os.path.expanduser('~/projects/tira_kws')
os.chdir(project_dir)

In [58]:
# local imports
from dataloading import load_tira_asr
from constants import (
    PHRASELIST_PATH, MERGED_PHRASES_CSV,
    KEYPHRASE_CSV, CER_MATRIX_PATH, CALIBRATION_LIST
)

In [59]:
# Load the dataset with the project's `load_tira_asr` helper and show its summary.
ds = load_tira_asr()
ds

Dataset({
    features: ['audio', 'start', 'end', 'transcription', 'eaf_source', 'wav_source', 'raw_transcription', 'clip', 'wav_rawpath', 'path', 'allosaurus', 'clap_ipa_cos_sim', 'wada_snr', 'nist_stnr', 'speaker-diarization-3.1', 'voice-activity-detection', 'whisper-large-v3', 'clapipa-transcription-allosaurus', 'vad_s', 'drz_s', 'trans_len', 'pcnt_speech', 'trans_len_sq', 'trans_len_log', 'cos_sim_softmax', 'cos_sim_log', 'duration', 'filestem', 'rewritten_transcript'],
    num_rows: 20480
})

## Merged transcripts
Find instances where FST normalization caused several dissimilar hand-transcribed sentences to merge and save as a .csv file.

In [60]:
# Keep only the two text columns and rename them:
#   transcription        -> eaf_text (hand transcription)
#   rewritten_transcript -> fst_text (FST-normalized transcription)
colmap = {'transcription': 'eaf_text', 'rewritten_transcript': 'fst_text'}
cols_to_drop = {name for name in ds.column_names if name not in colmap}
ds_noaudio = ds.remove_columns(cols_to_drop)
df = ds_noaudio.to_pandas().rename(columns=colmap)
df.head()

Unnamed: 0,eaf_text,fst_text
0,àprí jɜ̀dí ðáŋàlà,àpɾí jàdí ðáŋàlà
1,àprí jɜ̀dí ðáŋàlà,àpɾí jàdí ðáŋàlà
2,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà
3,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà
4,àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà,àpɾí jə̀və̀lɛ̀ðɔ́ ðáŋàlà


In [61]:
# Map every FST-normalized string to the list of distinct hand-transcribed
# (EAF) strings that were normalized to it.
fst_to_eaf = {}
fst_unique = df['fst_text'].unique().tolist()
eaf_unique = df['eaf_text'].unique().tolist()
eaf_strs_encountered = set()
for fst_text in tqdm(fst_unique):
    mask = df['fst_text'] == fst_text
    eaf_text = df.loc[mask, 'eaf_text'].unique().tolist()
    fst_to_eaf[fst_text] = eaf_text
    # ensure only one FST str per EAF str
    already_seen = eaf_strs_encountered.intersection(eaf_text)
    assert not already_seen, \
        f'EAF string(s) mapped to more than one FST string: {sorted(already_seen)}'
    # Pass the list itself: `update(*eaf_text)` would unpack each string and add
    # its individual characters, which made the check above vacuous.
    eaf_strs_encountered.update(eaf_text)
len(fst_unique), len(eaf_unique)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8350/8350 [00:05<00:00, 1578.62it/s]


(8350, 9399)

In [62]:
# One row per unique EAF string, re-indexed from zero.
eaf_unique_df = df.drop_duplicates(subset=['eaf_text']).reset_index(drop=True)
print(df.shape, eaf_unique_df.shape)

# For each row, count how many distinct EAF strings share its FST string,
# then list the most heavily merged FST strings first.
eaf_unique_df['num_eaf_variants'] = eaf_unique_df['fst_text'].apply(
    lambda fst_str: len(fst_to_eaf.get(fst_str))
)
eaf_unique_df = eaf_unique_df.sort_values('num_eaf_variants', ascending=False)
eaf_unique_df.head()

(20480, 2) (9399, 2)


Unnamed: 0,eaf_text,fst_text,num_eaf_variants
5821,ɛ̀màðɛ̀lí kìcə̀lò,ɛ̀màð kìcə̀lò,7
5819,ɛ̀màðàlʊ́ kìcə̀lò,ɛ̀màð kìcə̀lò,7
7495,àmàð kìclò,ɛ̀màð kìcə̀lò,7
930,lə̀və̀lɛ̀ðɜ̂lló únɛ́rɛ̀,lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́,7
1308,lə̀və̀lɛ̀ðálírló únɛ́ɾɛ̀,lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́,7


How many unique EAF strings do we have per FST-normalized string?

In [63]:
# Counted over rows of eaf_unique_df, i.e. one row per unique EAF string.
eaf_unique_df['num_eaf_variants'].value_counts()

num_eaf_variants
1    7484
2    1440
3     366
4      68
5      15
7      14
6      12
Name: count, dtype: int64

Ignore EAF text, make a `DataFrame` with only unique FST strings

In [64]:
# De-duplicating on EAF text should not have dropped any FST string:
# both frames are expected to report the same number of distinct FST strings.
(
    eaf_unique_df['fst_text'].unique().shape,
    df['fst_text'].unique().shape,
)

((8350,), (8350,))

In [65]:
# Save the per-EAF-string table (including num_eaf_variants) to MERGED_PHRASES_CSV;
# the DataFrame index is written under the column label 'index'.
eaf_unique_df.to_csv(MERGED_PHRASES_CSV, index_label='index')

## Make dataframe of unique phrases
Now that we've explored the mapping between FST strings and EAF strings, let's make a `DataFrame` that has a single row for each unique FST string and add a column `token_count` that indicates the number of times each string occurs in the original dataset.

In [66]:
# Drop the EAF column and keep the first row seen for every FST string.
unique_phrase_df = (
    eaf_unique_df
    .drop(columns='eaf_text')
    .drop_duplicates(subset='fst_text')
)
unique_phrase_df.shape, eaf_unique_df.shape

((8350, 2), (9399, 3))

In [67]:
# Preview; rows keep the descending num_eaf_variants order inherited from eaf_unique_df.
unique_phrase_df.head()

Unnamed: 0,fst_text,num_eaf_variants
5821,ɛ̀màð kìcə̀lò,7
930,lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́,7
536,íŋgá ɾɔ́ðà,6
265,láló və́lɛ̀ðà nd̪ɔ̀bàgɛ̀,6
372,kə̀və̀lɛ̀ðɔ́l ùnɛ́ɾɛ́,5


In [68]:
# Number of records in the original dataset for every FST string.
token_counts = df['fst_text'].value_counts()

# change index to ensure count is properly mapped to respective string
unique_phrase_df = unique_phrase_df.set_index('fst_text')
unique_phrase_df['token_count'] = token_counts

# sanity check: spot-check 100 random phrases against the raw counts
for _ in range(100):
    # randrange excludes the upper bound; randint(0, len(...)) could return
    # len(...) itself and make `.index[i]` raise an IndexError.
    i = random.randrange(len(unique_phrase_df))
    curr_keyphrase = unique_phrase_df.index[i]
    assert token_counts[curr_keyphrase] ==\
        unique_phrase_df.at[curr_keyphrase, 'token_count'].item()

# reset index to clean up
unique_phrase_df = unique_phrase_df.reset_index()
unique_phrase_df.head()

Unnamed: 0,fst_text,num_eaf_variants,token_count
0,ɛ̀màð kìcə̀lò,7,10
1,lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́,7,19
2,íŋgá ɾɔ́ðà,6,11
3,láló və́lɛ̀ðà nd̪ɔ̀bàgɛ̀,6,18
4,kə̀və̀lɛ̀ðɔ́l ùnɛ́ɾɛ́,5,10


In [69]:
# Summary statistics of how many records each unique FST string has.
unique_phrase_df['token_count'].describe()

count    8350.000000
mean        2.452695
std         1.711873
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        21.000000
Name: token_count, dtype: float64

Get a list of all phrases. The order of this list will be used to index positive and negative keyphrase tokens in the keyphrase dataset to be generated.

In [70]:
# Freeze the phrase order: positions in this list serve as phrase indices
# (e.g. the columns of the CER matrix built below).
all_phrases = unique_phrase_df['fst_text'].tolist()
len(all_phrases), all_phrases[:5]

(8350,
 ['ɛ̀màð kìcə̀lò',
  'lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́',
  'íŋgá ɾɔ́ðà',
  'láló və́lɛ̀ðà nd̪ɔ̀bàgɛ̀',
  'kə̀və̀lɛ̀ðɔ́l ùnɛ́ɾɛ́'])

## Keyphrase selection
Visualize keyphrases with 10 or more tokens and select the set to be used for KWS.

In [71]:
# A phrase is a keyphrase when it occurs at least 10 times in the dataset.
keyphrase_mask = unique_phrase_df['token_count'].ge(10)
unique_phrase_df['is_keyphrase'] = keyphrase_mask
high_freq_phrases = unique_phrase_df.loc[keyphrase_mask, 'fst_text'].tolist()
high_freq_phrases[:5], unique_phrase_df.loc[keyphrase_mask].shape

(['ɛ̀màð kìcə̀lò',
  'lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́',
  'íŋgá ɾɔ́ðà',
  'láló və́lɛ̀ðà nd̪ɔ̀bàgɛ̀',
  'kə̀və̀lɛ̀ðɔ́l ùnɛ́ɾɛ́'],
 (56, 4))

We have 56 unique keyphrases with 10 or more occurrences. We'll define these as our keyword set.

Let's do some introspection on these keywords, starting with the distribution of their string length.

In [72]:
# Distribution of keyphrase string lengths (in characters).
unique_phrase_df.loc[keyphrase_mask, 'fst_text'].str.len().describe()

count    56.000000
mean     21.285714
std       8.307093
min       5.000000
25%      15.000000
50%      22.000000
75%      26.000000
max      40.000000
Name: fst_text, dtype: float64

Get negative keyphrases based on CER from positive keyphrase.

First, make a distance matrix showing pairwise CER between keyphrases and all sentences.
Rows correspond to keyphrases and columns to all unique phrases.

In [73]:
from jiwer import cer, process_characters
import numpy as np

# cer_matrix[i, j] = cer(high_freq_phrases[i], all_phrases[j]):
# one row per keyphrase, one column per phrase in `all_phrases`.
cer_matrix = np.zeros((keyphrase_mask.sum(), len(all_phrases)), dtype=float)
progress = tqdm(enumerate(high_freq_phrases), total=len(high_freq_phrases))
for row_idx, reference in progress:
    # fill the whole row for this keyphrase in one assignment
    cer_matrix[row_idx, :] = [cer(reference, hypothesis) for hypothesis in all_phrases]
cer_matrix[:5,:5]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:06<00:00,  8.45it/s]


array([[0.        , 1.375     , 0.875     , 1.3125    , 1.125     ],
       [0.88      , 0.        , 0.84      , 0.84      , 0.24      ],
       [1.07692308, 1.61538462, 0.        , 1.69230769, 1.46153846],
       [0.75      , 0.75      , 0.78571429, 0.        , 0.71428571],
       [0.81818182, 0.27272727, 0.86363636, 0.90909091, 0.        ]])

Now, for each positive keyphrase, bucket negative keyphrases based on the ratio of the edit distance to the keyphrase's length.
Then sample 30 keywords from each bucket: ratio above 0 and up to 0.33 (hard), above 0.33 and up to 0.67 (medium), and above 0.67 (easy; the ratio can exceed 1).


In [74]:
# Count, for every keyphrase, how many phrases fall in each difficulty bucket.
# Buckets are defined on the CER between the keyphrase and each phrase:
#   hard:   0    < CER <= 0.33
#   medium: 0.33 < CER <= 0.67
#   easy:   CER > 0.67
# CER == 0 is left out of every bucket, so the keyphrase is never its own negative.
unique_phrase_df['num_easy'] = 0
unique_phrase_df['num_medium'] = 0
unique_phrase_df['num_hard'] = 0

# Rows of `cer_matrix` follow `high_freq_phrases`, which lists the keyphrase
# rows of `unique_phrase_df` in order, so per-row counts can be assigned
# positionally to the rows selected by `keyphrase_mask`.
assert cer_matrix.shape[0] == keyphrase_mask.sum()

easy_counts = (cer_matrix > 0.67).sum(axis=1)
medium_counts = ((cer_matrix <= 0.67) & (cer_matrix > 0.33)).sum(axis=1)
hard_counts = ((cer_matrix > 0) & (cer_matrix <= 0.33)).sum(axis=1)

unique_phrase_df.loc[keyphrase_mask, 'num_easy'] = easy_counts
unique_phrase_df.loc[keyphrase_mask, 'num_medium'] = medium_counts
unique_phrase_df.loc[keyphrase_mask, 'num_hard'] = hard_counts

unique_phrase_df[keyphrase_mask].head()

Unnamed: 0,fst_text,num_eaf_variants,token_count,is_keyphrase,num_easy,num_medium,num_hard
0,ɛ̀màð kìcə̀lò,7,10,True,7801,450,98
1,lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́,7,19,True,7652,627,70
2,íŋgá ɾɔ́ðà,6,11,True,8152,184,13
3,láló və́lɛ̀ðà nd̪ɔ̀bàgɛ̀,6,18,True,6922,1310,117
4,kə̀və̀lɛ̀ðɔ́l ùnɛ́ɾɛ́,5,10,True,7673,595,81


In [75]:
# Average number of easy / medium / hard negatives per keyphrase.
tuple(
    unique_phrase_df.loc[keyphrase_mask, bucket_col].mean()
    for bucket_col in ('num_easy', 'num_medium', 'num_hard')
)

(7701.267857142857, 614.0535714285714, 33.67857142857143)

In [76]:
# Save the keyphrase rows (index written under the label 'index') and the
# keyphrase-by-phrase CER matrix to the paths configured in `constants`.
unique_phrase_df[keyphrase_mask].to_csv(KEYPHRASE_CSV, index_label='index')
np.save(CER_MATRIX_PATH, cer_matrix)

# Keyphrase lists
Define lists of positive and negative records for each keyphrase.

## Calibration set
- 10 positive samples
- 50 negative samples for easy, medium and hard
- use to tune $\tau$ threshold

## Evaluation set
- use all positive and negative samples
- use macro-averaging

English negative samples will be handled separately since those are the same across all keyphrases.

## Output
JSON object looks like:
```json
[
    {
        "keyphrase": $str,
        "keyphrase_idx": $int,
        "record_idcs": [$int, $int, ...],
        "easy": {
            "phrase_idcs": [$int, $int, ...],
            "record_idcs": [$int, $int, ...]
        },
        "medium": {...},
        "hard": {...}
    },
    ...
]
```

In [77]:
# Number of negative phrases required per difficulty bucket.
calibration_num_negative = 50

# A keyphrase joins the calibration set only when every bucket holds at least
# `calibration_num_negative` phrases.
has_easy = unique_phrase_df['num_easy'].ge(calibration_num_negative)
has_medium = unique_phrase_df['num_medium'].ge(calibration_num_negative)
has_hard = unique_phrase_df['num_hard'].ge(calibration_num_negative)
has_negative = has_easy & has_medium & has_hard

unique_phrase_df['in_calibration_set'] = has_negative
calibration_keyphrases = unique_phrase_df.loc[has_negative, 'fst_text'].tolist()
print(has_negative.sum(), len(high_freq_phrases))
calibration_keyphrases[:5]

20 56


['ɛ̀màð kìcə̀lò',
 'lə̀və̀lɛ̀ðɜ́lló únɛ́ɾɛ́',
 'láló və́lɛ̀ðà nd̪ɔ̀bàgɛ̀',
 'kə̀və̀lɛ̀ðɔ́l ùnɛ́ɾɛ́',
 'lɛ̀ĺ və́lɛ̀ðɛ̀ ùnɛ́ɾɛ́']

In [92]:
# Build one entry per keyphrase following the schema in the "Output" section:
#   keyphrase, keyphrase_idx, record_idcs (positives) and, for each of
#   easy / medium / hard, the negative phrase_idcs and their record_idcs.
# `keyphrase_list` holds every positive and negative; `calibration_list` holds
# a random subsample for the keyphrases flagged `in_calibration_set`.
calibration_num_positive = 10  # "10 positive samples" per the calibration-set description

keyphrase_list = []
calibration_list = []

# FST string -> positional row indices of its records in `df`.
# `df` was built from the dataset without reordering, so these positions are
# presumably also valid row indices into `ds` — confirm before relying on it.
records_by_phrase = df.groupby('fst_text').indices


def records_for_phrases(phrase_idcs):
    """Return the sorted `df` row positions of all records of the given phrases.

    `phrase_idcs` are positions in `all_phrases`.
    """
    return sorted(
        int(record_idx)
        for phrase_idx in phrase_idcs
        for record_idx in records_by_phrase[all_phrases[phrase_idx]]
    )


for keyphrase_i, keyphrase in enumerate(high_freq_phrases):
    dists_to_keyphrase = cer_matrix[keyphrase_i, :]
    # separate name so the global `keyphrase_mask` (all keyphrases) is not overwritten
    curr_keyphrase_mask = unique_phrase_df['fst_text'] == keyphrase
    row = unique_phrase_df[curr_keyphrase_mask]
    assert len(row) == 1
    row = row.iloc[0]
    in_calibration_set = bool(row['in_calibration_set'])

    # add positive rows
    positive_record_idcs = records_by_phrase[keyphrase].tolist()
    keyphrase_obj = {
        'keyphrase': keyphrase,
        # position in `high_freq_phrases`, i.e. the row of `cer_matrix`
        'keyphrase_idx': keyphrase_i,
        'record_idcs': positive_record_idcs,
    }
    if in_calibration_set:
        num_positive = min(calibration_num_positive, len(positive_record_idcs))
        calibration_obj = {
            'keyphrase': keyphrase,
            'keyphrase_idx': keyphrase_i,
            'record_idcs': sorted(random.sample(positive_record_idcs, num_positive)),
        }

    # add negative rows
    easy_mask = dists_to_keyphrase > 0.67
    medium_mask = (dists_to_keyphrase <= 0.67) & (dists_to_keyphrase > 0.33)
    hard_mask = (dists_to_keyphrase > 0) & (dists_to_keyphrase <= 0.33)

    for mask, split in (easy_mask, 'easy'), (medium_mask, 'medium'), (hard_mask, 'hard'):
        # flat list of ints (np.argwhere would give a list of one-element lists)
        all_negative_rows = np.flatnonzero(mask).tolist()
        keyphrase_obj[split] = {
            'phrase_idcs': all_negative_rows,
            'record_idcs': records_for_phrases(all_negative_rows),
        }
        if in_calibration_set:
            calibration_negative_rows = sorted(
                random.sample(all_negative_rows, calibration_num_negative)
            )
            # NOTE(review): this samples `calibration_num_negative` *phrases* and
            # keeps all of their records; confirm whether exactly that many
            # *records* are wanted instead.
            calibration_obj[split] = {
                'phrase_idcs': calibration_negative_rows,
                'record_idcs': records_for_phrases(calibration_negative_rows),
            }

    keyphrase_list.append(keyphrase_obj)
    if in_calibration_set:
        calibration_list.append(calibration_obj)

len(keyphrase_list), len(calibration_list)
    


In [80]:
# Write the calibration lists to CALIBRATION_LIST as UTF-8 JSON.
with open(CALIBRATION_LIST, encoding='utf8', mode='w') as f:
    # ensure_ascii=False keeps the IPA characters readable instead of \u-escaping them
    json.dump(calibration_list, f, ensure_ascii=False, indent=2)