In [1]:
from pympi import *
import torchaudio
from glob import glob
import os
os.chdir(r'C:\projects\malachor5')
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
sys.path.append(r'C:\projects\malachor5\scripts')
from string_norm import tira2arpabet, tira2mfa, remove_punct
from lid_utils import is_tira_word

In [3]:
eaf_paths = glob(r'C:\projects\malachor5\meta\*.eaf')
wav_paths = glob(r'E:\data\wav\*.wav')
speaker_tiers = ['SHA', 'NIN', 'HIM', 'MAR', 'PET', 'MISC']
stem2paths={}
for eaf_path in eaf_paths:
    basename = os.path.basename(eaf_path)
    stem = os.path.splitext(basename)[0]
    wav_path = [wav_path for wav_path in wav_paths if stem in wav_path]
    assert len(wav_path)==1
    wav_path=wav_path[0]
    stem2paths[stem]={'eaf': eaf_path, 'wav': wav_path}
stem2paths

{'HH20210312': {'eaf': 'C:\\projects\\malachor5\\meta\\HH20210312.eaf',
  'wav': 'E:\\data\\wav\\HH20210312.WAV'},
 'HH20210913': {'eaf': 'C:\\projects\\malachor5\\meta\\HH20210913.eaf',
  'wav': 'E:\\data\\wav\\HH20210913.wav'},
 'HH20220327-2': {'eaf': 'C:\\projects\\malachor5\\meta\\HH20220327-2.eaf',
  'wav': 'E:\\data\\wav\\HH20220327-2.wav'}}

# Merge tiers
`IPA Transcription` kept separate from `HIM`: add annotations back in where needed

In [20]:
for eaf_path in tqdm(eaf_paths):
    tqdm.write(eaf_path)
    eaf = Elan.Eaf(eaf_path)
    ipa_tier = 'IPA Transcription'
    himidan_tier = 'HIM'
    if ipa_tier not in eaf.get_tier_names():
        tqdm.write('IPA transcription tier not found, skipping...')
        continue
    add_count=0
    skip_count=0
    for interval in eaf.get_annotation_data_for_tier(ipa_tier):
        start, end, value = interval[:3]
        midpoint = (start+end)//2
        if eaf.get_annotation_data_at_time(himidan_tier, midpoint):
            skip_count+=1
            continue
        eaf.add_annotation(himidan_tier, start, end, value)
        add_count+=1
    print(f"{add_count=}\t{skip_count=}")
    eaf.to_file(eaf_path)

 33%|███▎      | 1/3 [00:00<00:00,  6.80it/s]

C:\projects\malachor5\meta\HH20210312.eaf
add_count=0	skip_count=246
C:\projects\malachor5\meta\HH20210913.eaf
add_count=0	skip_count=142


100%|██████████| 3/3 [00:00<00:00, 11.36it/s]

C:\projects\malachor5\meta\HH20220327-2.eaf
add_count=0	skip_count=89





# Remove non-speech

In [31]:
for eaf_path in tqdm(eaf_paths):
    tqdm.write(eaf_path)
    eaf = Elan.Eaf(eaf_path)
    for i, tier in enumerate(speaker_tiers):
        remove_count = 0
        if tier not in eaf.get_tier_names():
            continue
        for interval in eaf.get_annotation_data_for_tier(tier):
            start, end, value = interval[:3]
            midpoint = (start+end)//2
            # remove empty or non-speech tiers
            if (not value) or (value in ['HUMMING', 'NOLING']):
                eaf.remove_annotation(tier, midpoint)
                remove_count+=1
        if remove_count:
            tqdm.write(f"{tier=}\t{remove_count=}")
    eaf.to_file(eaf_path)

 33%|███▎      | 1/3 [00:00<00:00,  5.05it/s]

C:\projects\malachor5\meta\HH20210312.eaf
tier='NIN'	remove_count=15
tier='HIM'	remove_count=32
tier='PET'	remove_count=14
tier='MISC'	remove_count=1


 33%|███▎      | 1/3 [00:00<00:00,  5.05it/s]

C:\projects\malachor5\meta\HH20210913.eaf


 67%|██████▋   | 2/3 [00:00<00:00,  3.18it/s]

tier='HIM'	remove_count=42


 67%|██████▋   | 2/3 [00:00<00:00,  3.18it/s]

C:\projects\malachor5\meta\HH20220327-2.eaf


100%|██████████| 3/3 [00:00<00:00,  3.52it/s]

tier='HIM'	remove_count=6





# Overlap
Let's figure out how many overlapping intervals there are

In [35]:
for eaf_path in eaf_paths:
    print(eaf_path)
    eaf = Elan.Eaf(eaf_path)
    maxlen = eaf.get_full_time_interval()[1]
    overlap_array = np.zeros(maxlen, dtype=int)
    for i, tier in enumerate(speaker_tiers):
        if tier not in eaf.get_tier_names():
            continue
        for interval in eaf.get_annotation_data_for_tier(tier):
            start, end = interval[:2]
            overlap_array[start:end]+=1
    overlap_array = pd.Series(overlap_array)
    print(overlap_array.value_counts()/(overlap_array>0).sum())

C:\projects\malachor5\meta\HH20210312.eaf
1    0.968445
0    0.611428
2    0.031555
Name: count, dtype: float64
C:\projects\malachor5\meta\HH20210913.eaf
0    1.964611
1    0.955809
2    0.044191
Name: count, dtype: float64
C:\projects\malachor5\meta\HH20220327-2.eaf
1    0.963585
0    0.527386
2    0.036415
Name: count, dtype: float64


3-4% overlap for each file

# Create corpus for alignment
Next step, let's save all intervals as separate .wav and .lab files in a directory structure expected by MFA

In [25]:
# data_dir = r'C:\projects\malachor5\data\tira_eval_mfa'
# for filestem, paths in tqdm(stem2paths.items()):
#     wav, sr = torchaudio.load(paths['wav'])
#     samples_per_ms = sr/1_000
#     eaf = Elan.Eaf(paths['eaf'])
#     for speaker in tqdm(speaker_tiers):
#         if speaker not in eaf.get_tier_names():
#             continue
#         speaker_dir = os.path.join(data_dir, speaker)
#         os.makedirs(speaker_dir, exist_ok=True)
#         for interval in eaf.get_annotation_data_for_tier(speaker):
#             start_ms, end_ms, value = interval[:3]
#             start_samples = int(start_ms*samples_per_ms)
#             end_samples = int(end_ms*samples_per_ms)
#             clip_stem = f"{filestem}_{start_ms}_{end_ms}"
#             clip_path = os.path.join(speaker_dir, clip_stem)

#             # save .wav
#             wav_clip = wav[:,start_samples:end_samples]
#             torchaudio.save(clip_path+'.wav', wav_clip, sr)

#             # save .lab
#             # with open(clip_path+'.lab', 'w', encoding='utf8') as f:
#                 # f.write(value)

#             # save .TextGrid
#             duration_s = (end_ms-start_ms)/1_000
#             textgrid = Praat.TextGrid(xmin=0, xmax=duration_s)
#             label_tier: Praat.Tier = textgrid.add_tier('label')
#             label_tier.add_interval(0, duration_s, value)
#             textgrid.to_file(clip_path+'.TextGrid')


100%|██████████| 6/6 [00:00<00:00, 29.13it/s]
100%|██████████| 6/6 [00:00<00:00, 1999.83it/s]
100%|██████████| 6/6 [00:00<00:00, 1997.60it/s]
100%|██████████| 3/3 [00:03<00:00,  1.12s/it]


Except! That isn't necessary, just convert each `.eaf` file into a TextGrid.

In [36]:
data_dir = r'C:\projects\malachor5\data\tira_eval_mfa'
for filestem, paths in tqdm(stem2paths.items()):
    wav, sr = torchaudio.load(paths['wav'])
    eaf = Elan.Eaf(paths['eaf'])

    out_stem = os.path.join(data_dir, filestem)

    tg = eaf.to_textgrid(filtin=speaker_tiers)
    tg.to_file(out_stem+'.TextGrid')

    torchaudio.save(out_stem+'.wav', wav, sr)


100%|██████████| 3/3 [00:15<00:00,  5.33s/it]


# Dictionary
Save arpabet and MFA dictionaries for Tira words

In [None]:
dictionary_lines = []
tira_words = set()
for filestem, paths in tqdm(stem2paths.items()):
    eaf = Elan.Eaf(paths['eaf'])
    for speaker in tqdm(speaker_tiers):
        if speaker not in eaf.get_tier_names():
            continue
        for interval in eaf.get_annotation_data_for_tier(speaker):
            value = interval[2]
            words = remove_punct(value).split()
            for word in words:
                if is_tira_word(word):
                    tira_words.add(word)
len(tira_words), tira_words

100%|██████████| 6/6 [00:00<00:00, 10.48it/s]
100%|██████████| 6/6 [00:00<00:00,  9.16it/s]
100%|██████████| 6/6 [00:00<00:00,  6.71it/s]
100%|██████████| 3/3 [00:07<00:00,  2.41s/it]


(214,
 {'a',
  'àlɔ́',
  'àn',
  'àpɾì',
  'àpɾí',
  'àŋgèɲɔ́',
  'àɾò',
  'á',
  'ápɾí',
  'âɾò',
  'cìcə̀lò',
  'cícə̀lò',
  'cùbò',
  'cùbó',
  'cùbɔ̀',
  'cɔ́lɔ̀',
  'cə̀mú',
  'dɔ̀ɽðàt̪à',
  'dɔ́ɽðàt̪à',
  'dɔ́ɾàt̪à',
  'emakəŋe',
  'éɲá',
  'gèɲɔ́',
  'giɲ',
  'gìɲɔ́',
  'i',
  'ìjɔ́',
  'íbí',
  'jál',
  'jáŋál',
  'jáŋə́l',
  'jéɲál',
  'jìgèɲɔ́',
  'jə',
  'jɛ',
  'jɛ̀',
  'kà',
  'kàŋú',
  'kàɾɛ́',
  'ká',
  'káddɔ̀ɽðàt̪à',
  'kádɔ̀ɽðàt̪à',
  'kágɛ̀',
  'káðdɔ̀ɽðàt̪à',
  'káŋú',
  'kìcə̀lò',
  'kìjɔ́',
  'kícə̀lò',
  'kúkù',
  'kúkùŋù',
  'kɔ̀ɾɔ́',
  'kɛgiɲ',
  'kɛgɛ',
  'kɛ̀',
  'kɛ̀gèɲɔ́',
  'kɛ̀gìɲɔ́',
  'kɛ̀ŋgìɲɔ́',
  'l',
  'lallivəlɛðir',
  'laŋ',
  'là',
  'làlə́lvə̀lɛ̀ðɛ̀',
  'làlə́lvə̀vɛ̀ðɔ̀',
  'làrò',
  'làvàrà',
  'làŋə̀l',
  'làŋə̄l',
  'làɾò',
  'lá',
  'ládɔ̀ɽðàt̪à',
  'lálló',
  'lálə̀və̀lɛ̀ðír',
  'lápɾí',
  'lávándə́ŋé',
  'láŋə́l',
  'lâlló',
  'lǎ',
  '

In [6]:
tira_dict_dir = r'C:\projects\malachor5\data\tira_mfa_dicts'
os.makedirs(tira_dict_dir, exist_ok=True)
tira_mfa_dict_path = os.path.join(tira_dict_dir, 'tira_mfa.dict')
tira_arpa_dict_path = os.path.join(tira_dict_dir, 'tira_arpa.dict')


mfa_dict_lines = []
arpa_dict_lines = []

for word in tira_words:
    mfa_dict_lines.append(f"{word}\t{tira2mfa(word)}\n")
    arpa_dict_lines.append(f"{word}\t{tira2arpabet(word)}\n")
with open(tira_mfa_dict_path, 'w', encoding='utf8') as f:
    f.writelines(mfa_dict_lines)
with open(tira_arpa_dict_path, 'w', encoding='utf8') as f:
    f.writelines(arpa_dict_lines)

# Keyword lists
Relatedly, create lists of keywords that are Tira words *and phrases* specific to each file.

In [None]:
phrases_per_file = {}
for filestem, paths in tqdm(stem2paths.items()):
    eaf = Elan.Eaf(paths['eaf'])
    phrases_per_file[paths['eaf']]=set()
    for speaker in tqdm(speaker_tiers):
        if speaker not in eaf.get_tier_names():
            continue
        for interval in eaf.get_annotation_data_for_tier(speaker):
            value = interval[2]
            words = remove_punct(value).split()
            current_phrase = ''
            for word in words:
                if not is_tira_word(word):
                    if len(current_phrase)>3:
                        phrases_per_file[paths['eaf']].add(current_phrase.strip())
                    current_phrase = ''
                elif current_phrase and word.strip() == current_phrase.split()[-1]:
                    continue # don't add repeat words
                else:
                    current_phrase += word + ' '
{k:len(v) for k,v in phrases_per_file.items()}, phrases_per_file

100%|██████████| 6/6 [00:00<00:00, 11.01it/s]
100%|██████████| 6/6 [00:00<00:00, 11.12it/s]
100%|██████████| 6/6 [00:00<00:00,  8.13it/s]
100%|██████████| 3/3 [00:01<00:00,  1.55it/s]


({'C:\\projects\\malachor5\\meta\\HH20210312.eaf': 18,
  'C:\\projects\\malachor5\\meta\\HH20210913.eaf': 23,
  'C:\\projects\\malachor5\\meta\\HH20220327-2.eaf': 60},
 {'C:\\projects\\malachor5\\meta\\HH20210312.eaf': {'a ló',
   'àlɔ́',
   'éɲá',
   'jáŋə́l və́lɛ̀ðǎjó',
   'jáŋə́l və́lɛ̀ðǎló',
   'jɛ jə',
   'laŋ',
   'làŋə̀l və́lɛ̂ðɛ̀',
   'làŋə̄l',
   'lâlló və́lɛ̀ðɔ́',
   'lǎ',
   'léɲál və́lɛ̂ðà',
   'léɲál və́lɛ̂ðɔ́ló nd̪ɔ̀bà',
   'ló',
   'lós',
   'ùnɛ́ɾɛ́',
   'ɜ̂l',
   'ɲál'},
  'C:\\projects\\malachor5\\meta\\HH20210913.eaf': {'âɾò',
   'cɔ́lɔ̀',
   'íbí',
   'lɔ́ɾɔ́',
   'máɽðɔ́',
   'ná',
   'ðé',
   'ðíbɔ́',
   'ðî',
   'ðî ðìcə̀lò',
   'ðə̀mbɾɔ́',
   'ðə́máɽðá',
   'ðə̂',
   'ðɛ̀',
   'ŋìcɔ́lɔ̀',
   'ŋòɽòn',
   'ŋòɽón',
   'ŋ̀cɔ́lɔ̀',
   'ɔɾɔ',
   'ɔ́ɾɔ́',
   'ɛ̀bɛ̀',
   'ɛ̀ɾɛ̀ð',
   'ɜ́ɾú'},
  'C:\\projects\\malachor5\\meta\\HH20220327-2.eaf': {'àn',
   'àn kɛ̀gèɲɔ́ lídɔ̀',
   'àn kɛ̀gìɲɔ́ lídɔ̀',
   'àn ŋgèɲɔ́ àp

In [17]:
keyword_dir = r'data\keyword_lists'
os.makedirs(keyword_dir, exist_ok=True)
for eaf_path, phrases in phrases_per_file.items():
    keyword_list_basename = os.path.basename(eaf_path.replace('.eaf', '-keywords.txt'))
    keyword_list_path = os.path.join(keyword_dir, keyword_list_basename)
    with open(keyword_list_path, 'w', encoding='utf8') as f:
        f.write("\n".join(phrases))