In [1]:
import copy
import os
import re

from matplotlib import pyplot as plt
import textgrids
from scipy.io import wavfile
from silence_aligner import extract_segments_from_file
import IPython.display
from jiwer import wer

In [2]:
AUDIOS_FOLDER = "cropped_audios"
TRANSCRIPTION_FOLDER = "cropped_annotations"
TEXTS_FOLDER = "texts"
OUTPUT_ALIGN_FOLDER = "naive_align"

In [5]:
def sent_tokenize(text):
    return [x for x in re.split("\.|,|;|:|\n|!|¿|¡|\?|-|—|\(|\)|«|»", text) if x.replace(" ", "")]


def align_single(base_file_name):
    frequency_from_signal, signal = wavfile.read(
        os.path.join(
            AUDIOS_FOLDER,
            f"{base_file_name}.wav"
        )
    )
    silences, frequency  = extract_segments_from_file(
        os.path.join(
            AUDIOS_FOLDER,
            f"{base_file_name}.wav"
        )
    )

    segments = list()
    last_start = 0
    for start, stop in silences:
        segments.append((last_start, start))
        last_start = stop

    segments.append((last_start,signal.shape[-1]))
    signal_segment_lengths = [(stop-start)/frequency for start, stop in segments]
    with open(
        os.path.join(
            TEXTS_FOLDER,
            f"{base_file_name.replace('cropped_', '')}.txt"
        )
    ) as file:
        text = " ".join(file.readlines())

    tokens = sent_tokenize(text)
    tokens = tokens[1:]
    words_count = len(text.split(" "))
    transcription = textgrids.TextGrid(
        os.path.join(
            TRANSCRIPTION_FOLDER,
            f"{base_file_name}.TextGrid"
        )
    )

    intervals = transcription[base_file_name.replace("cropped_", "")]
    alignment = list()
    for index, current_segment in enumerate(segments):
        try:
            alignment.append(
                {
                    "text": tokens[index].replace("\"", "'"),
                    "xmin" : current_segment[0]/frequency,
                    "xmax" : current_segment[1]/frequency
                }
            )
        except IndexError:
            print("Breaking on index", index)
            break


    tg = textgrids.TextGrid()
    tg.xmin = 0
    tg.xmax = silences[-1][1] / frequency
    tier = textgrids.Tier()
    tg[base_file_name] = tier

    for align in alignment:
        tier.append(
            textgrids.Interval(
                align["text"],
                align["xmin"],
                align["xmax"]
            )
        )

    tg.write(
        os.path.join(
            OUTPUT_ALIGN_FOLDER,
            f"{base_file_name}.TextGrid"
        )
    )

    wer_list = list()

    current_annotation_index = 0
    current_interval_index = 0
    while current_interval_index < len(intervals):
        try:
            interval = intervals[current_interval_index]
            t_text = interval.text
            t_min = interval.xmin
            t_max = interval.xmax

            a_text = alignment[current_annotation_index]["text"]
            a_min = alignment[current_annotation_index]["xmin"]
            a_xmax = alignment[current_annotation_index]["xmax"]

            if a_xmax < t_max + 1:

                wer_list.append((tokens[int(t_text)-1], a_text))
            else:

                interval_to_append = [t_text]

                while not a_xmax < t_max + 1:

                    current_interval_index += 1
                    interval = intervals[current_interval_index]
                    t_text = interval.text
                    t_min = interval.xmin
                    t_max = interval.xmax
                    interval_to_append.append(t_text)
                value_to_append = list()
                for i in interval_to_append:
                    try:
                        value_to_append.append(tokens[int(i)-1])
                    except ValueError:
                        print("Error decoding", i)
                wer_list.append(("".join(value_to_append), a_text))


            current_interval_index += 1
            current_annotation_index += 1
        except (IndexError, ValueError):
            print("Exiting on index current_interval_index", current_interval_index, "current_annotation_index", current_annotation_index)
            break

    wer_values = list()
    for x in wer_list:
        true_value = x[0].split()
        inferred_value = x[1].split()
        local_wer = wer(true_value, inferred_value)
        percentaje = len(true_value)/ words_count
        wer_values.append(local_wer * percentaje)
    total_wer = sum(wer_values)
    return total_wer * 100, sum(signal_segment_lengths)

In [6]:
results = list()
output_text = "name,aer,length"
for file_name in os.listdir(TRANSCRIPTION_FOLDER):
    print("================")
    print(file_name)
    try:
        base_file_name = file_name.replace(".TextGrid", "")
        local_wer = align_single(
                base_file_name
            )
        results.append(
            local_wer
        )

        output_text += f"{base_file_name}, {local_wer[0]}, {local_wer[1]}\n"

    except (FileNotFoundError, ValueError, IndexError) as e:
        print("File not found on", file_name)
        # output_text += f"{base_file_name}, {e} \n"


cropped_F_13_1.TextGrid
Desired threshold, 0.05
2807376715.761885
cropped_M_45_1.TextGrid
Desired threshold, 0.05
1693037607.5033379
Breaking on index 344
cropped_M_74_1.TextGrid
Desired threshold, 0.05
2125183467.5488214
Breaking on index 10
cropped_F_73_1.TextGrid
Desired threshold, 0.05
2170949378.5337625
Breaking on index 12
cropped_M_27_1.TextGrid
Desired threshold, 0.05
11036553227.861982
Breaking on index 154
cropped_F_60_1.TextGrid
Desired threshold, 0.05
4381507435.437266
cropped_F_10_1.TextGrid
Desired threshold, 0.05
269450236.9541746
File not found on cropped_F_10_1.TextGrid
cropped_M_15_1.TextGrid
Desired threshold, 0.05
4278862679.0722523
Breaking on index 19
cropped_M_71_1.TextGrid
Desired threshold, 0.05
0.01088053934242552
File not found on cropped_M_71_1.TextGrid
cropped_M_70_1.TextGrid
Desired threshold, 0.05
1620344246.9001057
Breaking on index 9
cropped_F_74_1.TextGrid
Desired threshold, 0.05
2260028588.27425
Breaking on index 16
cropped_M_61_1.TextGrid
Desired thr

In [7]:
total_wer = sum(results)/len(results)
output_text += f"TOTAL_RESULTS, {total_wer}\n"

TypeError: unsupported operand type(s) for +: 'int' and 'tuple'

In [8]:
with open("naive_results_all.txt.orig", "w+") as results_file:
    results_file.write(output_text)