In [17]:
import csv
import os
import parselmouth
import textgrids

In [36]:
vowels = "aeiou"
theoretical_formants = {
    "a": (650, 1300),
    "e": (450, 1800),
    "i": (300, 2200),
    "o": (450, 1000),
    "u": (300, 1000),
}

DISTANCE_THRESHOLD = 50
def extract_f1_f2_and_intensity_from_sound(snd):
    formants = snd.to_formant_burg()
    intensity = snd.to_intensity()
    f1 = list()
    f2 = list()
    intensity_list = list()
    for t in formants.t_grid():
        f1.append(formants.get_value_at_time(1,t))
        f2.append(formants.get_value_at_time(2,t))
        intensity_list.append(intensity.get_value(t))

    return f1, f2, intensity_list, formants.t_grid()

def get_closer_vowel(f1,f2, theoretical_formants=theoretical_formants):
    return min(
        map(
            lambda x:
                (x[0], abs(x[1] - f1) + abs(x[2] - f2)),
            ((vowel, t_f1, t_f2) for vowel, (t_f1, t_f2) in theoretical_formants.items())
        ),
        key=lambda x: x[1]
    )

def evaluate_non_vocal(vowel_and_distance, distance_threshold=DISTANCE_THRESHOLD):
    vowel, distance = vowel_and_distance
    if distance > distance_threshold:
        return (f'non_vocal_{vowel}', distance)
    return vowel_and_distance


In [37]:
base_folder = "words_annotations"
annotations_folder = "annotations"
wav_folder = "wav"
annotations_file_path = os.path.join(base_folder, "word_map.txt")
with open(annotations_file_path) as af:
    annotations = af.readlines()
annotations

['27356,Arete\n',
 '27358,Reloj\n',
 '27361,Cadena\n',
 '27364,Gafas\n',
 '27365,Anillo\n',
 '27387,Pollo\n',
 '36677,Papa\n',
 '32540,Yuca\n',
 '33582,Tomate\n',
 '31873,Zanahoria\n',
 '36438,Cebolla\n',
 '35472,Cilantro\n',
 '29029,Pepino\n',
 '32621,Habichuela\n',
 '36271,Te amo\n',
 '36358,Me gusta\n',
 '31500,Quiero\n',
 '35159,Feliz\n',
 '34100,Triste\n',
 '28102,Miedo\n',
 '28728,Odio\n',
 '32467,Culpa\n',
 '33981,Enojo\n',
 '35165,Aburrido\n',
 '35616,Celos\n',
 '36265,Amor\n',
 '34617,Agua\n',
 '33307,Café\n',
 '27436,Jugo\n',
 '29420,Leche\n',
 '31317,Avena\n',
 '34014,Chocolate\n',
 '29809,Limonada\n',
 '30373,Bombero\n',
 '32784,Medico\n',
 '30538,Profesor\n',
 '28699,Estudiante\n',
 '32438,Interpretador\n',
 '28066,Conductor\n',
 '36798,Enfermero\n',
 '30380,Cantante\n',
 '32791,Carpintero\n',
 '29617,Chef\n',
 '33954,Fotógrafo\n',
 '31077,Músico\n',
 '28487,Odontólogo\n',
 '31485,Peluquero\n',
 '36245,Periodista\n',
 '33962,Piloto\n',
 '36249,Policía\n',
 '28492,Pintor\n'

Confusion matrix
TP = Evaluated vowel and annotated vowel
TN = Evaluated vowel and non annotated vowel
FP = Evaluated non vowel and annotated vowel
FN = Evaluated non vowel and annotated non vowel
WV = Evaluated vowel but not the vowel we want

In [38]:
global_tp = 0
global_tn = 0
global_fp = 0
global_fn = 0
global_wv = 0


def evaluate_vowel_in_time(vowels_intervals, evaluated_vowel, t):
    for t_min, t_max, vowel in vowels_intervals:
        if t_min < t < t_max:
            if evaluated_vowel == vowel:
                return True
    return False
results = list()
for annotation in annotations:
    record_name, transcription = annotation.replace("\n", "").split(",")
    record_path = os.path.join(base_folder, wav_folder, f"{record_name}.wav")
    transcription_file = os.path.join(base_folder, annotations_folder, f"{record_name}.TextGrid")
    if os.path.exists(transcription_file):
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        wv = 0
        current_text_grid = textgrids.TextGrid(transcription_file)
        try:
            print("TRANSCRIPTION:", transcription)
            intervals = current_text_grid[transcription.lower()]
            vowels_intervals = list()
            if os.path.exists(record_path):
                snd = parselmouth.Sound(record_path)
                field_names = ["t", "f1", "f2", "i", "vowel", "distance"]
                for f1, f2, intensity, t in zip(*extract_f1_f2_and_intensity_from_sound(snd)):
                    vowel, distance = get_closer_vowel(f1, f2)
                    vowel_non_vowel, distance = evaluate_non_vocal((vowel, distance))
                    # Tengo lag mental, esto no es eficiente, no me juzguen
                    for interval in intervals:
                        if interval.xmin < t < interval.xmax:
                            phoneme_annotation = interval.text.strip()
                            if phoneme_annotation == vowel_non_vowel:
                                tp += 1
                            elif vowel_non_vowel in vowels and phoneme_annotation not in vowels:
                                tn += 1
                            elif vowel_non_vowel.startswith("non_vocal_") and phoneme_annotation in vowels:
                                fp += 1
                            elif vowel_non_vowel.startswith("non_vocal_") and phoneme_annotation not in vowels:
                                fn += 1
                            else:
                                wv += 1
                                tp += 1
            else:
                print(f"Record file {record_path} does not exists")
        except KeyError:
            pass
        print("Before appending")
        results.append({
            "transcription": transcription,
            "tp": tp,
            "tn": tn,
            "fp": fp,
            "fn": fn,
            "wv": wv
        })
        global_tp += tp
        global_tn += tn
        global_fp += fp
        global_fn += fn
        global_wv += wv


results.append({
    "transcription": "GLOBAL",
    "tp": global_tp,
    "tn": global_tn,
    "fp": global_fp,
    "fn": global_fn,
    "wv": global_wv
})
results

TRANSCRIPTION: Arete
Before appending
TRANSCRIPTION: Reloj
Before appending
TRANSCRIPTION: Cadena
Before appending
TRANSCRIPTION: Gafas
Before appending
TRANSCRIPTION: Anillo
Before appending
TRANSCRIPTION: Pollo
Before appending
TRANSCRIPTION: Pepino
Before appending
TRANSCRIPTION: Miedo
Before appending
TRANSCRIPTION: Odio
Before appending
TRANSCRIPTION: Jugo
Before appending
TRANSCRIPTION: Leche
Before appending
TRANSCRIPTION: Limonada
Before appending
TRANSCRIPTION: Estudiante
Before appending
TRANSCRIPTION: Conductor
Before appending
TRANSCRIPTION: Chef
Before appending
TRANSCRIPTION: Odontólogo
Before appending
TRANSCRIPTION: Pintor
Before appending
TRANSCRIPTION: Familia
Before appending
TRANSCRIPTION: Papá
Before appending
TRANSCRIPTION: Hermana
Before appending
TRANSCRIPTION: Abuela
Before appending
TRANSCRIPTION: Amiga
Before appending
TRANSCRIPTION: Novia
Before appending
TRANSCRIPTION: Esposo
Before appending
TRANSCRIPTION: Nieta
Before appending
TRANSCRIPTION: Borrador
Bef

[{'transcription': 'Arete', 'tp': 8, 'tn': 12, 'fp': 28, 'fn': 150, 'wv': 0},
 {'transcription': 'Reloj', 'tp': 0, 'tn': 1, 'fp': 24, 'fn': 214, 'wv': 0},
 {'transcription': 'Cadena', 'tp': 7, 'tn': 0, 'fp': 74, 'fn': 158, 'wv': 1},
 {'transcription': 'Gafas', 'tp': 2, 'tn': 4, 'fp': 27, 'fn': 186, 'wv': 2},
 {'transcription': 'Anillo', 'tp': 1, 'tn': 0, 'fp': 47, 'fn': 191, 'wv': 1},
 {'transcription': 'Pollo', 'tp': 1, 'tn': 5, 'fp': 36, 'fn': 156, 'wv': 1},
 {'transcription': 'Pepino', 'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0, 'wv': 0},
 {'transcription': 'Miedo', 'tp': 2, 'tn': 1, 'fp': 68, 'fn': 353, 'wv': 0},
 {'transcription': 'Odio', 'tp': 5, 'tn': 7, 'fp': 66, 'fn': 141, 'wv': 3},
 {'transcription': 'Jugo', 'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0, 'wv': 0},
 {'transcription': 'Leche', 'tp': 0, 'tn': 1, 'fp': 45, 'fn': 152, 'wv': 0},
 {'transcription': 'Limonada',
  'tp': 16,
  'tn': 9,
  'fp': 49,
  'fn': 268,
  'wv': 0},
 {'transcription': 'Estudiante',
  'tp': 3,
  'tn': 3,
  'fp': 90,

In [39]:
performance_dir = "performance"
os.makedirs(performance_dir, exist_ok=True)

with open(os.path.join(performance_dir, f"performance_theoretical_th_{DISTANCE_THRESHOLD}.csv"), "w+") as f:
    headers = ["transcription", "tp", "tn", "fp", "fn", "wv"]
    dict_writer = csv.DictWriter(f, headers)
    dict_writer.writeheader()
    dict_writer.writerows(results)

In [None]:
# =(B102+E102)/(SUM(B102:E102))