In [7]:
import os
import glob


def create_audio_alignment_mapping(audio_root: str, alignment_root: str) -> list:
    """Pair each ``.flac`` file under ``audio_root`` with a transcript file.

    For every ``.flac`` file found recursively below ``audio_root``, the file
    name without its extension is used as the name of a sub-directory of
    ``alignment_root``. If that directory exists and contains at least one
    ``*.txt`` file, the audio file is paired with the first such file (in
    sorted order).

    Args:
        audio_root: Directory searched recursively for ``.flac`` files.
        alignment_root: Directory expected to contain one sub-directory per
            audio file, named after the audio file's stem.

    Returns:
        A list of ``{"audio": <path>, "transcript": <path>}`` dicts, ordered
        by audio path. Audio files without a matching directory or without
        any ``.txt`` file in it are left out.

    Side effects:
        Prints the number of matched ("Found") and unmatched ("Not found")
        audio files.
    """
    mapping = []
    not_found = 0

    # glob.escape keeps characters such as "[", "*" or "?" that occur in the
    # root paths from being interpreted as wildcards (which would silently
    # match nothing). Sorting makes the result independent of the order in
    # which the filesystem happens to list entries.
    audio_pattern = f"{glob.escape(audio_root)}/**/*.flac"
    for audio_file in sorted(glob.glob(audio_pattern, recursive=True)):
        # The alignment directory is named after the audio file's stem.
        name = os.path.splitext(os.path.basename(audio_file))[0]
        alignment_dir = os.path.join(alignment_root, name)

        transcript_files = []
        if os.path.isdir(alignment_dir):
            transcript_files = sorted(
                glob.glob(f"{glob.escape(alignment_dir)}/*.txt")
            )

        if not transcript_files:
            # Either the directory is missing or it holds no transcript.
            not_found += 1
            continue

        # Deterministic choice when several .txt files are present.
        mapping.append(dict(audio=audio_file, transcript=transcript_files[0]))

    print(f"Found: {len(mapping)}")
    print(f"Not found: {not_found}")

    return mapping


# Uncomment the next line and replace the paths to test the function
# create_audio_alignment_mapping("path/to/audio/root", "path/to/alignment/root")

In [8]:
# Dataset location. The default is the original local checkout; set the
# PEOPLES_SPEECH_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = os.environ.get(
    "PEOPLES_SPEECH_DIR",
    "/Users/lukas/Desktop/Projects/MIT/data/peoples_speech",
)
audio_root = os.path.join(DATA_DIR, "audio_debug")
alignment_root = os.path.join(DATA_DIR, "peoples-speech-clean")

# Prints the found / not-found counts and returns the matched pairs.
mapping = create_audio_alignment_mapping(audio_root, alignment_root)

Found: 2
Not found: 3


In [9]:
# Show the audio/transcript pairs returned by create_audio_alignment_mapping.
mapping

[{'audio': '/Users/lukas/Desktop/Projects/MIT/data/peoples_speech/audio_debug/1_2_2018_Williston_Selectboard_SLASH_1_2_2018_Williston_Selectboard_DOT_mp3/1_2_2018_Williston_Selectboard_SLASH_1_2_2018_Williston_Selectboard_DOT_mp3.flac',
  'transcript': '/Users/lukas/Desktop/Projects/MIT/data/peoples_speech/peoples-speech-clean/1_2_2018_Williston_Selectboard_SLASH_1_2_2018_Williston_Selectboard_DOT_mp3/1_2_2018_Williston_Selectboard_SLASH_1_2_2018_Williston_Selectboard_DOT_mp3_transcript.txt'},
 {'audio': '/Users/lukas/Desktop/Projects/MIT/data/peoples_speech/audio_debug/1_2_2018_Winooski_City_Council_SLASH_1_2_2018_Winooski_City_Council_DOT_mp3/1_2_2018_Winooski_City_Council_SLASH_1_2_2018_Winooski_City_Council_DOT_mp3.flac',
  'transcript': '/Users/lukas/Desktop/Projects/MIT/data/peoples_speech/peoples-speech-clean/1_2_2018_Winooski_City_Council_SLASH_1_2_2018_Winooski_City_Council_DOT_mp3/1_2_2018_Winooski_City_Council_SLASH_1_2_2018_Winooski_City_Council_DOT_mp3_transcript.txt'}]

In [1]:
# Load the pretrained "bert-base-uncased" fast tokenizer; it is used in the
# following cell to turn a token id back into its token string.
# NOTE(review): presumably fetched from the Hugging Face hub / local cache on
# first use — confirm this works offline if that matters.

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Look up which token string a single vocabulary id maps to; the recorded
# output for id 101 is '[CLS]'.
token = 101

tokenizer.convert_ids_to_tokens(token)

'[CLS]'

In [19]:
import torch

# Hand-written target tensor with two rows of 79 integer entries each.
# Entries equal to -100 are the positions that the later CrossEntropyLoss
# cell skips (it is built with ignore_index=-100).
# NOTE(review): the remaining entries look like tokenizer vocabulary ids
# (the first row ends with 102) — confirm where this batch came from.
t = torch.tensor(
    [
        [
            -100,
            3432,
            2517,
            -100,
            8910,
            -100,
            1996,
            -100,
            2030,
            12381,
            2714,
            2653,
            -100,
            1037,
            -100,
            -100,
            -100,
            2852,
            -100,
            -100,
            1005,
            1055,
            -100,
            -100,
            -100,
            -100,
            -100,
            6767,
            -100,
            -100,
            6588,
            6366,
            -100,
            1012,
            11338,
            -100,
            2271,
            -100,
            -100,
            3218,
            -100,
            -100,
            -100,
            -100,
            1012,
            1999,
            -100,
            -100,
            2008,
            2852,
            1012,
            9712,
            17912,
            27161,
            2000,
            -100,
            -100,
            -100,
            7559,
            -100,
            4487,
            20939,
            -100,
            7405,
            -100,
            -100,
            -100,
            -100,
            -100,
            2271,
            -100,
            -100,
            1012,
            -100,
            -100,
            1996,
            -100,
            2106,
            102,
        ],
        [
            -100,
            2025,
            -100,
            -100,
            -100,
            -100,
            1998,
            4487,
            -100,
            6499,
            -100,
            8910,
            -100,
            -100,
            -100,
            1010,
            -100,
            -100,
            -100,
            2031,
            2589,
            -100,
            -100,
            -100,
            1996,
            -100,
            2653,
            -100,
            -100,
            2122,
            -100,
            3033,
            -100,
            -100,
            -100,
            2595,
            -100,
            -100,
            -100,
            -100,
            3832,
            -100,
            3115,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            11338,
            2386,
            2271,
            -100,
            2852,
            -100,
            9712,
            -100,
            -100,
            -100,
            -100,
            2852,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
            -100,
        ],
    ]
)

# Sanity check of the literal above: expected to be (2, 79).
t.shape

torch.Size([2, 79])

In [20]:
# Random stand-in for model logits: one score per class for every position
# of `t`. The leading dimensions are taken from `t` so the two tensors cannot
# drift apart, and a locally seeded generator makes the draw (and therefore
# the loss values below) reproducible without touching the global RNG state.
# NOTE(review): 30522 is presumably the tokenizer's vocabulary size — confirm
# against tokenizer.vocab_size.
pred = torch.rand(*t.shape, 30522, generator=torch.Generator().manual_seed(0))
pred.shape

torch.Size([2, 79, 30522])

In [22]:
import torch

# Cross-entropy over all positions, letting the loss itself skip targets
# equal to -100 via ignore_index.
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)

# Flatten to (positions, classes) and (positions,). The class count is read
# from `pred` instead of repeating the literal, and reshape (unlike view)
# also accepts non-contiguous inputs.
loss = loss_fct(pred.reshape(-1, pred.size(-1)), t.reshape(-1))
loss

tensor(10.3751)

In [26]:
# Cross-check of ignore_index: drop the -100 positions by hand and compute
# the loss on what is left; it should match `loss` from the previous cell.
ignore_indices = t == -100

# Boolean-mask indexing already flattens the masked leading dimensions, so
# valid_t is (n_valid,) and valid_pred is (n_valid, classes) — no further
# reshaping is needed before calling the loss.
valid_t = t[~ignore_indices]
valid_pred = pred[~ignore_indices]

masked_loss = loss_fct(valid_pred, valid_t)
# Make the equivalence an explicit check instead of comparing printed values.
assert torch.allclose(masked_loss, loss), (masked_loss, loss)
masked_loss

tensor(10.3751)