In [11]:
from scipy.io import wavfile
import csv
import os
import parselmouth
import textgrids
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report


In [12]:
vowels = "aeiou"
observed_formants = {
    'a': [682.845033143518, 1347.6680693901671],
 'e': [494.84181512897976, 1619.9787143718208],
 'i': [382.33315846211553, 1657.3348620074385],
 'o': [506.2493853362084, 1101.1472533066992],
 'u': [434.7785079520296, 984.1856223625758]}

observed_formants_std = {'a': [73.7791520366105, 93.04641134935628],
 'e': [55.49710196672828, 132.8479221603176],
 'i': [50.65527125493847, 155.24836087654543],
 'o': [60.20226523820195, 170.6404810742714],
 'u': [50.328755387520836, 193.66177958793827]}

def extract_f1_f2_and_intensity_from_sound(snd):
    formants = snd.to_formant_burg()
    intensity = snd.to_intensity()
    f1 = list()
    f2 = list()
    intensity_list = list()
    for t in formants.t_grid():
        f1.append(formants.get_value_at_time(1,t))
        f2.append(formants.get_value_at_time(2,t))
        intensity_list.append(intensity.get_value(t))

    return f1, f2, intensity_list, formants.t_grid()

def get_closer_vowel(f1,f2, theoretical_formants=observed_formants):
    return min(
        map(
            lambda x:
                (x[0], abs(x[1] - f1) + abs(x[2] - f2)),
            ((vowel, t_f1, t_f2) for vowel, (t_f1, t_f2) in theoretical_formants.items())
        ),
        key=lambda x: x[1]
    )

def evaluate_non_vocal(vowel_and_distance, observed_std=observed_formants_std):
    vowel, distance = vowel_and_distance
    if distance > sum(observed_std[vowel]):
        return (f'non_vocal_{vowel}', distance)
    return vowel_and_distance



In [13]:
base_folder = "words_annotations"
annotations_folder = "annotations"
wav_folder = "wav"
annotations_file_path = os.path.join(base_folder, "word_map.txt")
with open(annotations_file_path) as af:
    annotations = af.readlines()
annotations

['27356,Arete\n',
 '27358,Reloj\n',
 '27361,Cadena\n',
 '27364,Gafas\n',
 '27365,Anillo\n',
 '27387,Pollo\n',
 '36677,Papa\n',
 '32540,Yuca\n',
 '33582,Tomate\n',
 '31873,Zanahoria\n',
 '36438,Cebolla\n',
 '35472,Cilantro\n',
 '29029,Pepino\n',
 '32621,Habichuela\n',
 '36271,Te amo\n',
 '36358,Me gusta\n',
 '31500,Quiero\n',
 '35159,Feliz\n',
 '34100,Triste\n',
 '28102,Miedo\n',
 '28728,Odio\n',
 '32467,Culpa\n',
 '33981,Enojo\n',
 '35165,Aburrido\n',
 '35616,Celos\n',
 '36265,Amor\n',
 '34617,Agua\n',
 '33307,Café\n',
 '27436,Jugo\n',
 '29420,Leche\n',
 '31317,Avena\n',
 '34014,Chocolate\n',
 '29809,Limonada\n',
 '30373,Bombero\n',
 '32784,Medico\n',
 '30538,Profesor\n',
 '28699,Estudiante\n',
 '32438,Interpretador\n',
 '28066,Conductor\n',
 '36798,Enfermero\n',
 '30380,Cantante\n',
 '32791,Carpintero\n',
 '29617,Chef\n',
 '33954,Fotógrafo\n',
 '31077,Músico\n',
 '28487,Odontólogo\n',
 '31485,Peluquero\n',
 '36245,Periodista\n',
 '33962,Piloto\n',
 '36249,Policía\n',
 '28492,Pintor\n'

In [14]:
expected = list()
result = list()
vowels = "aeiou"
for annotation in annotations:
    record_name, transcription = annotation.replace("\n", "").split(",")
    record_path = os.path.join(base_folder, wav_folder, f"{record_name}.wav")
    transcription_file = os.path.join(base_folder, annotations_folder, f"{record_name}.TextGrid")
    if os.path.exists(transcription_file):
        current_text_grid = textgrids.TextGrid(transcription_file)
        try:
            # print("TRANSCRIPTION:", transcription)
            intervals = current_text_grid[transcription.lower()]
            vowels_intervals = list()
            if os.path.exists(record_path):
                snd = parselmouth.Sound(record_path)
                field_names = ["t", "f1", "f2", "i", "vowel", "distance"]
                for f1, f2, intensity, t in zip(*extract_f1_f2_and_intensity_from_sound(snd)):
                    vowel, distance = get_closer_vowel(f1, f2)
                    vowel_non_vowel, distance = evaluate_non_vocal((vowel, distance))
                    is_non_vowel = vowel_non_vowel.startswith("non_vocal_")
                    # Tengo lag mental, esto no es eficiente, no me juzguen
                    for interval in intervals:
                        if interval.xmin < t < interval.xmax:
                            phoneme_annotation = interval.text.strip()
                            is_transcription_non_vowel = phoneme_annotation not in vowels
                            if phoneme_annotation != "sil":
                                expected.append(phoneme_annotation if not is_transcription_non_vowel else "non_vowel")
                                result.append(vowel_non_vowel if not is_non_vowel else "non_vowel")

            else:
                print(f"Record file {record_path} does not exists")
        except KeyError:
            pass


expected[100:]




['o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e'

In [15]:
len(expected)

8805

In [16]:
result[100:]

['o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'o',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'a',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'e',
 'non_vowel',
 'o',
 'non_vowel',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'non_vowel',
 'o',
 'non_vowel',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'non_vowel',
 'e',
 'e',
 'e',
 'non_vowel',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'non_vowel',
 'non_vowel',
 'e',
 'e',
 'non_vowel',
 'non_vowel',
 'u',
 'non_vowel',
 'non_vowel',
 'non_vowel',
 'non_vowe

In [17]:
len(result)

8805

In [18]:
report = classification_report(expected, result, labels=["a", "e", "i", "o", "u", "non_vowel"], digits=4)
print(report)

              precision    recall  f1-score   support

           a     0.6019    0.3634    0.4532      1706
           e     0.2869    0.2990    0.2928       806
           i     0.1452    0.1794    0.1605       535
           o     0.3877    0.3102    0.3447      1386
           u     0.1039    0.2824    0.1519       301
   non_vowel     0.5072    0.5729    0.5381      3849

   micro avg     0.4176    0.4284    0.4229      8583
   macro avg     0.3388    0.3346    0.3235      8583
weighted avg     0.4494    0.4284    0.4299      8583



In [19]:
performance_dir = "performance"
os.makedirs(performance_dir, exist_ok=True)

In [20]:
with open(os.path.join(performance_dir, f"performance_observed.txt"), "w+") as f:
    f.write(report)

Confusion matrix
TP = Evaluated vowel and annotated vowel
TN = Evaluated vowel and non annotated vowel
FP = Evaluated non vowel and annotated vowel
FN = Evaluated non vowel and annotated non vowel
WV = Evaluated vowel but not the vowel we want

In [None]:
global_tp = 0
global_tn = 0
global_fp = 0
global_fn = 0
global_wv = 0


def evaluate_vowel_in_time(vowels_intervals, evaluated_vowel, t):
    for t_min, t_max, vowel in vowels_intervals:
        if t_min < t < t_max:
            if evaluated_vowel == vowel:
                return True
    return False
results = list()
for annotation in annotations:
    record_name, transcription = annotation.replace("\n", "").split(",")
    record_path = os.path.join(base_folder, wav_folder, f"{record_name}.wav")
    transcription_file = os.path.join(base_folder, annotations_folder, f"{record_name}.TextGrid")
    if os.path.exists(transcription_file):
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        wv = 0
        current_text_grid = textgrids.TextGrid(transcription_file)
        try:
            print("TRANSCRIPTION:", transcription)
            intervals = current_text_grid[transcription.lower()]
            vowels_intervals = list()
            if os.path.exists(record_path):
                snd = parselmouth.Sound(record_path)
                field_names = ["t", "f1", "f2", "i", "vowel", "distance"]
                for f1, f2, intensity, t in zip(*extract_f1_f2_and_intensity_from_sound(snd)):
                    vowel, distance = get_closer_vowel(f1, f2)
                    vowel_non_vowel, distance = evaluate_non_vocal((vowel, distance))
                    # Tengo lag mental, esto no es eficiente, no me juzguen
                    for interval in intervals:
                        if interval.xmin < t < interval.xmax:
                            phoneme_annotation = interval.text.strip()
                            if phoneme_annotation == vowel_non_vowel:
                                tp += 1
                            elif vowel_non_vowel in vowels and phoneme_annotation not in vowels:
                                tn += 1
                            elif vowel_non_vowel.startswith("non_vocal_") and phoneme_annotation in vowels:
                                fp += 1
                            elif vowel_non_vowel.startswith("non_vocal_") and phoneme_annotation not in vowels:
                                fn += 1
                            else:
                                wv += 1
                                tp += 1
            else:
                print(f"Record file {record_path} does not exists")
        except KeyError:
            pass
        print("Before appending")
        results.append({
            "transcription": transcription,
            "tp": tp,
            "tn": tn,
            "fp": fp,
            "fn": fn,
            "wv": wv
        })
        global_tp += tp
        global_tn += tn
        global_fp += fp
        global_fn += fn
        global_wv += wv


results.append({
    "transcription": "GLOBAL",
    "tp": global_tp,
    "tn": global_tn,
    "fp": global_fp,
    "fn": global_fn,
    "wv": global_wv
})
results


In [44]:
performance_dir = "performance"
os.makedirs(performance_dir, exist_ok=True)

with open(os.path.join(performance_dir, f"performance_observed.csv"), "w+") as f:
    headers = ["transcription", "tp", "tn", "fp", "fn", "wv"]
    dict_writer = csv.DictWriter(f, headers)
    dict_writer.writeheader()
    dict_writer.writerows(results)

In [None]:
# =(B102+E102)/(SUM(B102:E102))