In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
import os
import librosa
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor
import torch
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import Audio

1001


In [None]:
folder_name = r"Datasets\cv-corpus-17.0-delta-2024-03-15\pl\clips"
df = pd.read_csv(r"Datasets\cv-corpus-17.0-delta-2024-03-15\pl\validated.tsv", sep="\t")

def process_audio_file(file_path):
    audio_path = os.path.join(folder_name, file_path)
    audio_array, sampling_rate = librosa.load(audio_path, sr=None)
    return {
        'path': audio_path,
        'array': audio_array,
        'sampling_rate': sampling_rate
    }

df['audio'] = df['path'].apply(lambda x: process_audio_file(x))

df.drop(columns=['client_id', 'path', 'sentence_id', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment'], inplace=True)

data_dict = {
    'sentence': df['sentence'].tolist(),
    'audio': df['audio'].tolist()
}

dataset = Dataset.from_dict(data_dict)

# Split the dataset into 85% training and 15% testing
train_test_split = dataset.train_test_split(test_size=0.15)

common_voice = DatasetDict()
common_voice["train"] = train_test_split["train"]
common_voice["test"] = train_test_split["test"]

In [3]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="Polish", task="transcribe")
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Polish", task="transcribe")




Input:                 Zarówno pani, jak i my wiemy, że mówiła pani o zadaniu niemal niemożliwym
Decoded w/ special:    <|startoftranscript|><|pl|><|transcribe|><|notimestamps|>Zarówno pani, jak i my wiemy, że mówiła pani o zadaniu niemal niemożliwym<|endoftext|>
Decoded w/out special: Zarówno pani, jak i my wiemy, że mówiła pani o zadaniu niemal niemożliwym
Are equal:             True


In [None]:
from datasets import Audio


common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

def prepare_dataset(batch, feature_extractor, tokenizer):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

common_voice2 = common_voice.map(
    prepare_dataset,
    fn_kwargs={"feature_extractor": feature_extractor, "tokenizer": tokenizer},
    remove_columns=common_voice.column_names["train"],
    num_proc=2,
)

print(common_voice2["train"][0])


Map (num_proc=2): 100%|██████████| 433/433 [02:42<00:00,  2.67 examples/s] 
Map (num_proc=2): 100%|██████████| 77/77 [01:12<00:00,  1.07 examples/s] 


{'input_features': [[-0.7108817100524902, -0.7108817100524902, -0.7108817100524902, -0.7108817100524902, -0.6354376077651978, -0.7108817100524902, -0.5470696687698364, -0.7108817100524902, -0.4987189769744873, -0.7108817100524902, -0.7016459703445435, -0.685486912727356, -0.6613118648529053, -0.7108817100524902, -0.7108817100524902, -0.7108817100524902, -0.6327930688858032, -0.7108817100524902, -0.7108817100524902, -0.7108817100524902, -0.65128493309021, -0.5855538845062256, -0.45812904834747314, -0.423537015914917, -0.6004492044448853, -0.524622917175293, -0.5701379776000977, -0.5659408569335938, -0.5166306495666504, -0.5031123161315918, -0.42330169677734375, -0.5675925016403198, -0.7108817100524902, -0.5766924619674683, -0.617501974105835, -0.7108817100524902, -0.7108817100524902, -0.5031193494796753, -0.545857310295105, -0.7108817100524902, -0.5257382392883301, -0.7108817100524902, -0.7108817100524902, -0.6331301927566528, -0.7108817100524902, -0.7108817100524902, -0.656770944595336

In [5]:


model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.generation_config.language = "polish"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None


In [None]:
import torch
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch
    
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [17]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="models",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2000,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    remove_unused_columns=False,
    greater_is_better=False,
    push_to_hub=False,
)
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice2["train"],
    eval_dataset=common_voice2["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
)


In [18]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
                                        
  0%|          | 0/2000 [13:10<?, ?it/s]            

{'loss': 1.1914, 'grad_norm': 20.275367736816406, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.89}


                                        
  0%|          | 0/2000 [20:36<?, ?it/s]            

{'loss': 1.1398, 'grad_norm': 17.13619613647461, 'learning_rate': 1.0000000000000002e-06, 'epoch': 1.79}


                                        
  0%|          | 0/2000 [28:01<?, ?it/s]           

{'loss': 1.0404, 'grad_norm': 14.59571361541748, 'learning_rate': 1.5e-06, 'epoch': 2.68}




KeyboardInterrupt: 