# Speech-To-Text Model Fine-Tuning for Various Accents on Engineering Domains


Preparing Environment

In [None]:
!pip install -U datasets
#!pip install git+https://github.com/huggingface/transformers
!pip install evaluate>=0.30
!pip install jiwer
!pip install accelerate -U
!pip install wandb
!pip install transformers

In [None]:
!pip install --upgrade transformers>=4.30.0

In [None]:
# Hugging Face login (optional; only needed when access to private datasets is required)
from huggingface_hub import notebook_login
notebook_login()

Import FeatureExtractor and Tokenizer

In [None]:
from tqdm import tqdm
from transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor
# Load the feature extractor of the model that will be fine-tuned
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

In [None]:
from transformers import WhisperTokenizer
# Load the tokenizer of the model that will be fine-tuned,
# configured for English transcription
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="English", task="transcribe")

Combine into a WhisperProcessor (using the Base model for now)

In [None]:
from transformers import WhisperProcessor

# Load the processor for the same "openai/whisper-base" checkpoint,
# configured for English transcription
processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="English", task="transcribe")

Prepare DataSets

In [None]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from datasets import Audio

In [None]:
!rm -rf ~/.cache/huggingface/datasets/krishan23___indian_english
# Delete the dataset cache
# NOTE(review): this path names krishan23___indian_english, while the dataset
# loaded in this notebook is "yongjune2002/MITOCW-whisper" — confirm this is
# still the cache directory that should be removed.

In [None]:
AccentDataset = load_dataset("yongjune2002/MITOCW-whisper")
# Indian Accent Dataset, about 3GB
# NOTE(review): the repo id refers to MITOCW rather than an Indian-accent
# corpus — confirm that the description above is still accurate.

In [None]:
# Declare the "audio" column as audio with a 16 kHz sampling rate
AccentDataset = AccentDataset.cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
def is_under_30s(example):
    """Return True when the example's audio clip lasts at most 30 seconds.

    The duration is the number of samples in ``example["audio"]["array"]``
    divided by ``example["audio"]["sampling_rate"]``. A clip of exactly
    30 seconds is accepted.
    """
    num_samples = len(example["audio"]["array"])
    samples_per_second = example["audio"]["sampling_rate"]
    seconds = num_samples / samples_per_second
    return seconds <= 30.0

# Keep only the examples accepted by is_under_30s (clips of 30 seconds or less)
AccentDataset = AccentDataset.filter(is_under_30s)

In [None]:
# Display the current dataset object
print(AccentDataset)

In [None]:
def prepare_dataset(batch):
    """Add model inputs and label ids to a single example.

    Reads ``batch["audio"]`` and ``batch["text"]``, stores the first entry of
    the feature extractor's ``input_features`` under ``"input_features"`` and
    the tokenizer's ``input_ids`` under ``"labels"``, then returns the same
    ``batch`` object. Relies on the notebook-level ``feature_extractor`` and
    ``tokenizer``.
    """
    sample = batch["audio"]

    # Convert the input audio array into log-Mel spectrogram features,
    # passing along the sampling rate stored with the audio.
    extracted = feature_extractor(
        sample["array"],
        sampling_rate=sample["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # Convert the target text into label ids.
    encoded_text = tokenizer(batch["text"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [None]:
# Apply prepare_dataset to every example and drop the original columns
# (the column names are taken from the "train" split)
ProcessedAccentDataset = AccentDataset.map(prepare_dataset, remove_columns=AccentDataset.column_names["train"], num_proc=None)

In [None]:
# Upload the preprocessed dataset to the Hugging Face Hub
ProcessedAccentDataset.push_to_hub("yongjune2002/MITOCW-Whisper-Processed")

CheckPoint : Data Is Preprocessed and Uploaded to HuggingFace

In [None]:
# Load a preprocessed dataset from the Hub
# NOTE(review): this repo id ("yongjune2002/Whisper_IndianAccent") differs from
# the one the preprocessed data is pushed to in this notebook
# ("yongjune2002/MITOCW-Whisper-Processed") — confirm which one is intended.
ProcessedAccentDataset = load_dataset("yongjune2002/Whisper_IndianAccent")

In [None]:
# Display the loaded preprocessed dataset object
print(ProcessedAccentDataset)