In [None]:
from zipfile import ZipFile
# Unpack the dataset archive into the current working directory; the context
# manager closes the archive handle once extraction is done.
with ZipFile(file="main_dataset_folder.zip", mode="r") as archive:
    archive.extractall(path="./")
print("Extraction of zip file complete")

In [None]:
from datasets import  load_dataset, Audio

# Read the metadata CSV with the "csv" builder and take its "train" split.
# NOTE(review): the absolute /content/... path presumably targets Google
# Colab's default working directory — confirm before running elsewhere.
dataset = load_dataset(
    "csv",
    data_files="/content/main_dataset_folder/main_dataset.csv",
    split = "train"
)

# Last expression of the cell: display the dataset summary.
dataset

In [None]:
# Drop bookkeeping columns that are not used later in this notebook.
# Only columns that are still present are removed: remove_columns raises
# ValueError for unknown names, which made re-running this cell fail once the
# columns were already gone.
_columns_to_drop = ["Unnamed: 0", "id", "filename", "start", "end", "duration"]
_columns_present = [name for name in _columns_to_drop if name in dataset.column_names]
if _columns_present:
  dataset = dataset.remove_columns(_columns_present)

In [None]:
# Display the dataset summary again to check which columns remain.
dataset

In [None]:
import os

# Directory that the CSV's file_path values are joined onto.
# The archive is extracted into the working directory and the CSV is read from
# /content/main_dataset_folder, so the audio is expected under that same
# folder; the previous value "/main_dataset_folder" pointed at the filesystem
# root instead.
# NOTE(review): assumes file_path entries are relative to the dataset folder —
# confirm against the CSV.
AUDIO_ROOT = "/content/main_dataset_folder"

def fix_path(example):
  """Rewrite example["file_path"] so it is rooted at AUDIO_ROOT.

  Args:
    example: A dataset row (mapping) that contains a "file_path" entry.

  Returns:
    The same mapping, with "file_path" replaced by the joined path.

  os.path.join discards AUDIO_ROOT when the stored path is already absolute,
  so applying this function to an already-fixed row leaves it unchanged.
  """
  example["file_path"] = os.path.join(AUDIO_ROOT, example["file_path"])
  return example

# Apply fix_path to every row so file_path is joined onto AUDIO_ROOT.
dataset = dataset.map(fix_path)

In [None]:
import numpy as np
import librosa
import soundfile as sf
import os

def load_audio_with_librosa(example, sampling_rate=16000):
    """Decode example["file_path"] and store the waveform under example["audio"].

    Args:
        example: A dataset row (mapping) that contains a "file_path" entry.
        sampling_rate: Target sample rate in Hz, passed to librosa.load as
            ``sr`` and recorded in the result. Must be a positive integer.
            Defaults to 16000.

    Returns:
        The same mapping, with example["audio"] set to
        {"array": <float32 waveform>, "sampling_rate": sampling_rate}.

    Any exception raised while loading is printed and the row receives
    ``sampling_rate`` zero samples (one second of silence) instead, so a single
    unreadable file does not abort the whole map.
    NOTE(review): such rows look like real audio downstream; consider
    filtering them out before training.
    """
    file_path = example["file_path"]

    try:
        # mono=True downmixes; dtype=np.float32 already yields float32, so no
        # additional cast is needed afterwards.
        audio_array, _ = librosa.load(
            file_path,
            sr=sampling_rate,
            mono=True,
            dtype=np.float32,
        )
    except Exception as e:
        print(f"Error loading audio file {file_path}: {e}")
        audio_array = np.zeros(sampling_rate, dtype=np.float32)

    example["audio"] = {
        "array": audio_array,
        "sampling_rate": sampling_rate,
    }
    return example

# Decode every row's audio file and attach it as an "audio" entry.
dataset = dataset.map(load_audio_with_librosa)


In [None]:
# Show the first row's file path as a quick sanity check of the rewritten paths.
dataset["file_path"][0]

In [None]:
from transformers import SpeechT5Processor

# Checkpoint identifier for SpeechT5 TTS; the processor is loaded from it.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

In [None]:
# Keep a direct handle on the processor's tokenizer for the vocabulary checks.
tokenizer = processor.tokenizer

In [None]:
def extract_all_chars(batch):
  """Collect the distinct characters used in batch["text"].

  Args:
    batch: Mapping with a "text" entry holding a list of transcripts. None
      entries are skipped; every other value is converted with str().

  Returns:
    {"vocab": [chars], "all_text": [joined]} where ``joined`` is the kept
    transcripts joined by single spaces and ``chars`` is a list of the unique
    characters in it (order unspecified). Both values are wrapped in a
    one-element list.
  """
  texts = [str(text) for text in batch["text"] if text is not None]
  all_text = " ".join(texts)
  vocab = list(set(all_text))
  # The key is "all_text" (previously "all text") so that it matches the later
  # redefinition of this helper in the notebook.
  return {"vocab": [vocab], "all_text": [all_text]}

# Feed the whole dataset to extract_all_chars as one batch (batch_size=-1) and
# keep only the columns the helper returns.
vocabs = dataset.map(
    extract_all_chars,
    remove_columns=dataset.column_names,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
)

# Characters seen in the dataset vs. tokens known to the tokenizer.
dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = set(tokenizer.get_vocab().keys())

In [None]:
# Characters present in the dataset text but absent from the tokenizer vocabulary.
dataset_vocab- tokenizer_vocab

In [None]:
# Inspect one transcript by position.
# NOTE(review): 5696 is a hard-coded row index and the reason this row matters
# is not recorded — presumably it contains unsupported characters; confirm.
dataset["text"][5696]

In [None]:
import re

def normalize_text(text):
  """Lower-case a transcript, strip punctuation and collapse whitespace.

  None, or any value whose str() is empty or whitespace-only, yields "".
  Otherwise every character that is not a word character, whitespace or an
  apostrophe is deleted (not replaced by a space, so "well-known" becomes
  "wellknown"), and runs of whitespace are collapsed to single spaces.
  """
  raw = "" if text is None else str(text)
  if not raw.strip():
    return ""

  without_punctuation = re.sub(r'[^\w\s\']', '', raw.lower())
  return ' '.join(without_punctuation.split())

def add_normalized_text(example):
  """Store normalize_text(example["text"]) under "normalized_text" and return the row."""
  raw_text = example["text"]
  example["normalized_text"] = normalize_text(raw_text)
  return example

# Add a "normalized_text" entry to every row.
dataset = dataset.map(add_normalized_text)

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used in batch["normalized_text"].

    Returns {"vocab": [chars], "all_text": [joined]}, where ``joined`` is the
    transcripts joined by single spaces and ``chars`` is a list of its unique
    characters (order unspecified). Both values are wrapped in a one-element
    list.
    """
    joined = " ".join(batch["normalized_text"])
    unique_chars = list(set(joined))
    return {"vocab": [unique_chars], "all_text": [joined]}


# Recompute the character inventory, this time from the normalized transcripts,
# again passing the whole dataset as a single batch (batch_size=-1).
vocabs = dataset.map(
    extract_all_chars,
    remove_columns=dataset.column_names,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
)

# Characters seen in the normalized text vs. tokens known to the tokenizer.
dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = set(tokenizer.get_vocab().keys())

In [None]:
# Characters still missing from the tokenizer vocabulary after normalization.
dataset_vocab - tokenizer_vocab