### Clone Repo

In [1]:
!git clone https://github.com/huggingface/dataspeech.git

Cloning into 'dataspeech'...
remote: Enumerating objects: 650, done.[K
remote: Counting objects: 100% (221/221), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 650 (delta 138), reused 131 (delta 131), pack-reused 429 (from 1)[K
Receiving objects: 100% (650/650), 158.78 KiB | 5.47 MiB/s, done.
Resolving deltas: 100% (395/395), done.


### Install Requirements

In [2]:
%cd dataspeech
!pip install -r requirements.txt -q

/kaggle/working/dataspeech
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dataproc-jupyter-plugin 0.1.79 requires pydantic~=1.10.0, but you have pydantic 2.8.2 which is incompatible.
ydata-profiling 4.10.0 requires scipy<1.14,>=1.4.1, but you have scipy 1.14.1 which is incompatible.[0m[31m
[0m

In [3]:
!pip install g2p_id_py -q
!pip install lingua-language-detector -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.8.1 which is incompatible.[0m[31m
[0m

### Edit `rate.py`

In [4]:
# Source code for the replacement rate.py, written to disk by the next cell.
# The outer literal uses ''' so the generated module can carry ordinary
# """docstrings""".
contain_rate = '''
"""Speaking-rate enrichment for mixed English / Indonesian transcripts.

Each whitespace-separated word is phonemized with the G2P model of its
detected language. The speaking rate is the length of the resulting phoneme
string (separating spaces included) divided by the clip duration in seconds.
"""
from g2p_id import G2p as G2P_ID
from g2p import make_g2p
from lingua import Language, LanguageDetectorBuilder

# Initialize G2P models
g2p_id = G2P_ID()  # Indonesian G2P
g2p_en = make_g2p("eng", "eng-ipa")  # English G2P

# Initialize language detector
languages = [Language.ENGLISH, Language.INDONESIAN]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

# Duration (seconds) substituted for clips that report zero length, so the
# rate computation never divides by zero.
MIN_DURATION = 0.01


def process_word(word):
    """Convert one word to a phoneme string.

    Words detected as English go through the English G2P; every other
    detection result goes through the Indonesian G2P. Returns an empty
    string when the Indonesian G2P raises KeyError or yields no phonemes.
    """
    language = detector.detect_language_of(word)
    if language == Language.ENGLISH:
        return g2p_en(word).output_string  # English G2P

    try:
        list_phonemes = g2p_id(word)
    except KeyError as e:
        # Best effort: report the failing key and contribute no phonemes.
        print(f"KeyError: {e}")
        return ""

    if list_phonemes:
        return "".join(list_phonemes[0])  # Indonesian G2P

    print(f"Missing phonemes : {word}")
    print(list_phonemes)
    return ""


def _phonemize(text):
    """Phonemize every whitespace-separated word and join the results with spaces."""
    return " ".join(process_word(word) for word in text.split())


def _audio_duration(audio):
    """Return the length in seconds of an audio dict with array and sampling_rate."""
    return len(audio["array"].squeeze()) / audio["sampling_rate"]


def _safe_duration(seconds):
    """Return seconds unchanged, or MIN_DURATION when seconds is zero."""
    return seconds if seconds != 0 else MIN_DURATION


def rate_apply(batch, rank=None, audio_column_name="audio", text_column_name="text"):
    """Add speaking_rate and phonemes entries to a batch or a single example.

    Args:
        batch: mapping holding the text column and either a speech_duration
            entry (seconds) or the audio column. When the text column is a
            list the input is treated as a batch, otherwise as one example.
        rank: unused; kept so the signature stays compatible with callers.
        audio_column_name: key of the audio entry (dicts with array and
            sampling_rate), read only when speech_duration is absent.
        text_column_name: key of the transcript entry.

    Returns:
        The same batch object with speaking_rate and phonemes set (lists in
        batched mode, scalars otherwise).
    """
    has_duration = "speech_duration" in batch

    if isinstance(batch[text_column_name], list):
        speaking_rates = []
        phonemes_list = []
        sources = batch["speech_duration"] if has_duration else batch[audio_column_name]
        for text, source in zip(batch[text_column_name], sources):
            phonemes = _phonemize(text)
            seconds = source if has_duration else _audio_duration(source)
            # Zero-length clips use MIN_DURATION on every path, not only when
            # speech_duration is present.
            speaking_rates.append(len(phonemes) / _safe_duration(seconds))
            phonemes_list.append(phonemes)

        batch["speaking_rate"] = speaking_rates
        batch["phonemes"] = phonemes_list
    else:
        # Process a single text entry
        phonemes = _phonemize(batch[text_column_name])
        if has_duration:
            seconds = batch["speech_duration"]
        else:
            seconds = _audio_duration(batch[audio_column_name])

        batch["speaking_rate"] = len(phonemes) / _safe_duration(seconds)
        batch["phonemes"] = phonemes

    return batch

'''

In [5]:
# Overwrite the repo's rate enrichment module with the source held in
# `contain_rate`. The path is relative to the current directory, which the
# earlier `%cd dataspeech` cell set to the cloned repo.
with open('dataspeech/cpu_enrichments/rate.py', 'w') as file:
    file.write(contain_rate)

### Prepare Hugging Face Access

In [6]:
from huggingface_hub import HfApi, HfFolder
import os

In [7]:
# Never store a token in the notebook: take it from the environment (e.g. a
# platform secret exported as HF_TOKEN) and prompt only when it is missing.
# NOTE(review): an earlier revision of this cell embedded a literal token;
# treat that token as leaked and revoke it in the Hugging Face account settings.
import getpass

if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass.getpass('Hugging Face token: ')

In [8]:
# API client used by the identity check in the next cell.
hf_api = HfApi()
# Hand the token from the HF_TOKEN environment variable to HfFolder.
# NOTE(review): presumably this stores the token locally so that later calls
# and the main.py subprocess can authenticate - confirm against the
# huggingface_hub documentation.
HfFolder.save_token(os.environ['HF_TOKEN'])

In [9]:
# Sanity check: query the account behind the current credentials and print
# its name before starting the long annotation run.
user_info = hf_api.whoami()
print(f"Logged in as: {user_info['name']}")

Logged in as: Amadeus99


### Annotate Dataset

In [10]:
# Run the repo's main.py (found via the earlier `%cd dataspeech`) on the
# source dataset, passing the text/audio column names and 6 CPU workers.
# NOTE(review): --repo_id is presumably the Hub repo that receives the
# annotated dataset, and --rename_column / --apply_squim_quality_estimation
# are dataspeech options - confirm their exact semantics in the dataspeech README.
!python main.py "Amadeus99/youtube-transcript-dataset" \
  --configuration "default" \
  --text_column_name "transcript_normalized" \
  --audio_column_name "audio" \
  --cpu_num_workers 6 \
  --rename_column \
  --repo_id "Amadeus99/youtube-transcript-dataset-processed" \
  --apply_squim_quality_estimation

[32mINFO[0m - Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
[32mINFO[0m - Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
  from speechbrain.pretrained import (
README.md: 100%|███████████████████████████████| 593/593 [00:00<00:00, 3.95MB/s]
  self.pid = os.fork()
train-00012-of-00013.parquet: 100%|██████████| 479M/479M [00:11<00:00, 42.5MB/s]
train-00009-of-00013.parquet: 100%|██████████| 479M/479M [00:11<00:00, 42.5MB/s]
train-00003-of-00013.parquet: 100%|██████████| 481M/481M [00:11<00:00, 42.2MB/s]
train-00000-of-00013.parquet: 100%|██████████| 481M/481M [00:11<00:00, 42.4MB/s]
train-00006-of-00013.parquet: 100%|██████████| 481M/481M [00:11<00:00, 42.5MB/s]
train-00001-of-00013.parquet: 100%|██████████| 481M/481M [00:11<00:00, 42.6MB/s]
train-00010-of-00013.parquet: 100%|██████████| 482M/482M [00:11<00:00, 42.5MB/s]
train-00004-of-00013.parquet: 100%|██████████| 480M/480M [00:11<00:00