In [55]:
!pip install librosa soundfile noisereduce datasets

import os
import librosa
import soundfile as sf
import noisereduce as nr
from datasets import load_dataset
import json
import numpy as np
import IPython.display as ipd
from IPython.core.display import display, HTML
import pandas as pd
from tqdm import tqdm



In [56]:
# Boolean flag to control dataset size.
#   True  -> load every row of each split.
#   False -> keep only a leading fraction of each split; the fraction is
#            load_subset's `subset_ratio`, which defaults to 0.01 (1%).
USE_FULL_DATASET = True


In [57]:
# Load a dataset split from Hugging Face, optionally truncated to a leading subset
def load_subset(dataset_name, split, use_full_dataset, subset_ratio=0.01):
    """
    Load a dataset split and return either the full split or a leading subset.

    Parameters:
        dataset_name (str): Dataset identifier passed to `load_dataset`.
        split (str): Name of the split to load.
        use_full_dataset (bool): If True, return the whole split unchanged.
        subset_ratio (float): Fraction of the split to keep when
            `use_full_dataset` is False. Must satisfy 0 < subset_ratio <= 1.
            Defaults to 0.01 (1%).

    Returns:
        The loaded split, or its first `int(len(split) * subset_ratio)` rows.
        A non-empty split always yields at least one row.

    Raises:
        ValueError: If a subset is requested with `subset_ratio` outside (0, 1].
    """
    # Validate before loading so a bad ratio does not cost a full download.
    if not use_full_dataset and not 0 < subset_ratio <= 1:
        raise ValueError(
            f"subset_ratio must be in the interval (0, 1], got {subset_ratio!r}"
        )

    dataset = load_dataset(dataset_name, split=split)
    if use_full_dataset:
        return dataset

    total_rows = len(dataset)
    # Small splits would otherwise round down to zero rows; keep at least one
    # row whenever the split itself is non-empty.
    subset_size = min(total_rows, max(1, int(total_rows * subset_ratio)))
    return dataset.select(range(subset_size))

In [58]:
# Load the train / val / test splits of one dataset through load_subset
DATASET_NAME = "pourmand1376/asr-farsi-youtube-chunked-10-seconds"

train_dataset = load_subset(DATASET_NAME, "train", USE_FULL_DATASET)
val_dataset = load_subset(DATASET_NAME, "val", USE_FULL_DATASET)
test_dataset = load_subset(DATASET_NAME, "test", USE_FULL_DATASET)

# Report how many rows each split holds
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(val_dataset)}")
print(f"Test Dataset Size: {len(test_dataset)}")

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/32 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Train Dataset Size: 113204
Validation Dataset Size: 14151
Test Dataset Size: 14151


In [59]:
# Function to display samples with playable audio
def show_samples(data, num_samples=5, default_rate=16000):
    """
    Render a table of playable audio clips next to their transcription text.

    Parameters:
        data: Either a pandas DataFrame with "audio_filepath" and "text"
            columns, or a dataset object supporting `len()`, `.select()` and
            iteration, whose rows hold an "audio" mapping (with "array" and
            optionally "sampling_rate") and a "transcription" field.
        num_samples (int): Maximum number of rows to show.
        default_rate (int): Sampling rate used when a row's "audio" mapping
            has no "sampling_rate" key.

    Returns:
        None. The table is rendered through `IPython.display`.
    """
    import html  # local import: only needed here, to escape transcription text

    # Collect (audio widget HTML, raw text) pairs from whichever input type we got.
    pairs = []
    if isinstance(data, pd.DataFrame):
        for _, row in data.head(num_samples).iterrows():
            audio_widget = ipd.Audio(row["audio_filepath"], autoplay=False)._repr_html_()
            pairs.append((audio_widget, row["text"]))
    else:  # Assume it's a Hugging Face Dataset
        for sample in data.select(range(min(num_samples, len(data)))):
            audio = sample["audio"]
            sampling_rate = audio["sampling_rate"] if "sampling_rate" in audio else default_rate
            audio_widget = ipd.Audio(audio["array"], rate=sampling_rate, autoplay=False)._repr_html_()
            pairs.append((audio_widget, sample["transcription"]))

    # The table is emitted with escape=False so the <audio> markup survives;
    # the transcription must therefore be escaped by hand.
    display_data = [
        {"Playable Audio": audio_widget, "Text": html.escape(str(text))}
        for audio_widget, text in pairs
    ]

    df = pd.DataFrame(display_data)
    # to_html honours display.max_colwidth and would cut the (long) audio
    # markup short, leaving a broken player; lift the limit for this call only.
    with pd.option_context("display.max_colwidth", None):
        html_table = df.to_html(escape=False, index=False)

    # Use the public IPython.display API (already imported as `ipd`) rather
    # than the deprecated IPython.core.display names.
    ipd.display(ipd.HTML(html_table))

In [60]:
import unicodedata
import string

# Characters that disqualify a transcription: maybe_normalize returns None
# for any text containing at least one of them.
SKIP = set(string.ascii_letters) | {
    "=",  # occurs only 2x in utterance (transl.): "twenty = xx"
    "ā",  # occurs only 4x together with "š"
    "š",
    "ة",  # Arabic letter TEH MARBUTA
}

# Substrings that maybe_normalize deletes from a transcription, one
# str.replace per entry in list order. Order matters: the multi-character
# "(laughter)" token must come before the bare "(" and ")" entries, otherwise
# only its parentheses would be stripped and the word itself would remain.
DISCARD = [
    # "(laughter)" in Farsi
    "(خنده)",
    # ASCII punctuation
    "!",
    '"',
    "#",
    "&",
    "'",
    "(",
    ")",
    ",",
    "-",
    ".",
    ":",
    ";",
    # Non-ASCII punctuation (dash, curly quotes, ellipsis, Arabic question
    # mark / comma / semicolon, and the tatweel elongation character)
    "–",
    "“",
    "”",
    "…",
    "؟",
    "،",
    "؛",
    "ـ",
    # Arabic combining marks, not whitespace as the original note guessed.
    # NOTE(review): presumably tanwin, short vowels, shadda, sukun and
    # hamza-above; the glyphs are invisible on their own, so verify the
    # exact code points before editing these entries.
    "ً",
    "ٌ",
    "َ",
    "ُ",
    "ِ",
    "ّ",
    "ْ",
    "ٔ",
    # Other: guillemet quotation marks
    "«",
    "»",
]

# Character substitutions applied by maybe_normalize, one str.replace per
# entry in insertion order. Order matters: "ۀ" is first rewritten to "ە",
# and the final entry then rewrites every "ە" to "ه", so both end up as "ه".
REPLACEMENTS = {
    "أ": "ا",
    "ۀ": "ە",  # intermediate form; rewritten again by the last entry
    "ك": "ک",
    "ي": "ی",
    "ى": "ی",
    "ﯽ": "ی",
    "ﻮ": "و",
    "ے": "ی",
    "ﺒ": "ب",
    # NOTE(review): both key and value look like presentation-form glyphs;
    # presumably the NFKC step in maybe_normalize folds the result to the
    # base letter afterwards. Confirm this is intended.
    "ﻢ": "ﻡ",
    "٬": " ",  # separator character becomes a space
    "ە": "ه",  # must stay after the "ۀ" entry above
}


def maybe_normalize(text: str) -> str | None:

    # Skip selected with banned characters
    if set(text) & SKIP:
        return None  # skip this

    # Remove hashtags - they are not being read in Farsi CV
    text = " ".join(w for w in text.split() if not w.startswith("#"))

    # Replace selected characters with others
    for lhs, rhs in REPLACEMENTS.items():
        text = text.replace(lhs, rhs)

    # Replace selected characters with empty strings
    for tok in DISCARD:
        text = text.replace(tok, "")

    # Unify the symbols that have the same meaning but different Unicode representation.
    text = unicodedata.normalize("NFKC", text)

    # Remove hamza's that were not merged with any letter by NFKC.
    text = text.replace("ء", "")

    # Remove double whitespace etc.
    return " ".join(t for t in text.split() if t)

In [61]:
def preprocess_text(text, noise_reduction=True, normalize=True, trim_silence=True):
    """
    Normalize a transcription string.

    Thin wrapper that delegates to `maybe_normalize`. The `noise_reduction`,
    `normalize` and `trim_silence` flags are accepted but not consulted by
    this function.

    Returns:
        Whatever `maybe_normalize` returns for `text`.
    """
    return maybe_normalize(text)

In [64]:
def preprocess_dataset_with_text_processing(dataset, output_dir, manifest_file, corpus_file, preview_samples=False):
    """
    Normalize every transcription in `dataset` and write the kept ones to a corpus file.

    Parameters:
        dataset: Indexable collection of rows (supports `len()` and integer
            indexing); each row must expose a "transcription" field.
        output_dir (str): Only used as a label in the preview message.
        manifest_file (str): Accepted for interface compatibility; not used.
        corpus_file (str): Path of the UTF-8 text file to write, one
            normalized transcription per line.
        preview_samples (bool): If True, display a few rows of `dataset`
            through `show_samples` after the corpus has been written.

    Rows whose transcription is rejected by `preprocess_text` (None) or
    normalizes to an empty string are left out of the corpus. Rows that raise
    are skipped as well, and a one-line summary of them is printed.
    """
    all_transcriptions = []
    skipped_indices = []  # (index, error message) for rows that raised

    for idx in tqdm(range(len(dataset))):
        try:
            # NOTE(review): indexing a whole row presumably decodes its audio
            # as well, although only the text is needed here. Consider
            # selecting just the transcription column if this loop is slow.
            text = dataset[idx]["transcription"]
            processed_text = preprocess_text(text)
        except Exception as e:
            # Best effort: one bad row must not abort the whole run.
            skipped_indices.append((idx, str(e)))
            continue

        # None means "rejected"; an empty string would only add a blank line.
        if not processed_text:
            continue

        all_transcriptions.append(processed_text)

    # Save transcriptions to corpus file
    with open(corpus_file, "w", encoding="utf-8") as f:
        f.writelines(transcription + "\n" for transcription in all_transcriptions)

    # Surface failures instead of discarding them silently.
    if skipped_indices:
        first_idx, first_error = skipped_indices[0]
        print(
            f"Skipped {len(skipped_indices)} of {len(dataset)} rows due to errors "
            f"(first: index {first_idx}: {first_error})"
        )

    # Show samples if requested. No processed audio files are produced by this
    # function, so the preview is taken from the input dataset itself.
    if preview_samples:
        print(f"\nShowing samples from {output_dir}...\n")
        show_samples(dataset, num_samples=5)

In [65]:
# Run transcription preprocessing for each split. Every tuple holds the
# positional arguments of one call: (dataset, output_dir, manifest_file, corpus_file)
split_jobs = [
    (train_dataset, "processed_train_audio", "train_manifest.json", "train_corpus.txt"),
    (val_dataset, "processed_val_audio", "val_manifest.json", "val_corpus.txt"),
    (test_dataset, "processed_test_audio", "test_manifest.json", "test_corpus.txt"),
]
for job_args in split_jobs:
    preprocess_dataset_with_text_processing(*job_args)

100%|██████████| 113204/113204 [50:57<00:00, 37.02it/s]
100%|██████████| 14151/14151 [06:20<00:00, 37.22it/s]
100%|██████████| 14151/14151 [06:28<00:00, 36.45it/s]


In [66]:
def merge_corpora(corpus_files, output_file):
    """
    Merge multiple corpus files into a single corpus file.

    Lines are copied in the order of `corpus_files`. Every copied line is
    newline-terminated, so a source file that lacks a trailing newline cannot
    have its last line fused with the first line of the next file.

    Parameters:
        corpus_files (list): List of corpus file paths to merge.
        output_file (str): Path to save the merged corpus file.
    """
    with open(output_file, "w", encoding="utf-8") as merged:
        for corpus_path in corpus_files:
            with open(corpus_path, "r", encoding="utf-8") as source:
                for line in source:
                    # Only the last line of a file can be missing its newline.
                    merged.write(line if line.endswith("\n") else line + "\n")
    print(f"Merged corpus saved to: {output_file}")


# Per-split corpus files, merged in the order listed
corpus_files = [
    "train_corpus.txt",
    "val_corpus.txt",
    "test_corpus.txt",
]

# Destination of the merged corpus
merged_corpus_file = "combined_corpus.txt"

# Concatenate the per-split corpora into the single destination file
merge_corpora(corpus_files=corpus_files, output_file=merged_corpus_file)


Merged corpus saved to: combined_corpus.txt


In [None]:
# ! rm -rf processed_train_audio
# ! rm -rf processed_test_audio
# ! rm -rf processed_val_audio