In [1]:
import random
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from huggingface_hub import notebook_login
from datasets import load_dataset, DatasetDict, Audio, Dataset, concatenate_datasets
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
import evaluate
import jiwer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch
import os
import gradio as gr

In [2]:
model_name = "openai/whisper-large-v3-turbo"
model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.generation_config.language = "polish"
model.generation_config.task = "transcribe"
processor = WhisperProcessor.from_pretrained(model_name)

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        # Create attention mask based on input features
        attention_mask = batch["input_features"].ne(0).long()  # Assume padding value is 0 for input features
        batch["attention_mask"] = attention_mask

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [4]:
# Load the dataset
directory = "Prepared_Datasets"
all_dirs = os.listdir(directory)
train_batch_dirs = [f"{directory}/{dir_name}" for dir_name in all_dirs if dir_name.startswith('processed_train_batch')]
test_batch_dirs = [f"{directory}/{dir_name}" for dir_name in all_dirs if dir_name.startswith('processed_test_batch')]
val_batch_dirs = [f"{directory}/{dir_name}" for dir_name in all_dirs if dir_name.startswith('processed_val_batch')]

train_batch_datasets = [Dataset.load_from_disk(batch_dir) for batch_dir in train_batch_dirs]
test_batch_datasets = [Dataset.load_from_disk(batch_dir) for batch_dir in test_batch_dirs]
val_batch_datasets = [Dataset.load_from_disk(batch_dir) for batch_dir in val_batch_dirs]

train_dataset = concatenate_datasets(train_batch_datasets)
test_dataset = concatenate_datasets(test_batch_datasets)
val_dataset = concatenate_datasets(val_batch_datasets)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    collate_fn=data_collator,
)

In [5]:
print("Length of train dataset:", len(train_dataset))
print("Length of test dataset:", len(test_dataset))
print("Length of val dataset:", len(val_dataset))
print("Keys train dataset:", train_dataset[0].keys())
print("Keys test dataset:", test_dataset[0].keys())
print("Keys evaluation dataset:", val_dataset[0].keys())
print("Example of train dataset:", train_dataset[0]['input_features'][:1])
print(train_dataset[0]['labels'])

Length of train dataset: 23967
Length of test dataset: 9230
Length of val dataset: 5992
Keys train dataset: dict_keys(['input_features', 'labels'])
Keys test dataset: dict_keys(['input_features', 'labels'])
Keys evaluation dataset: dict_keys(['input_features', 'labels'])
Example of train dataset: [[-0.6631942987442017, -0.6631942987442017, -0.6631942987442017, -0.6631942987442017, -0.1795949935913086, -0.12517893314361572, -0.2037043571472168, -0.1995849609375, -0.28836262226104736, -0.22072160243988037, -0.1948835849761963, -0.511208176612854, -0.14611577987670898, -0.31661951541900635, -0.6576759815216064, -0.1547255516052246, -0.4112142324447632, -0.3144327402114868, -0.30326128005981445, -0.23319053649902344, -0.3203343152999878, -0.4964789152145386, -0.2958409786224365, -0.43426787853240967, -0.52558434009552, -0.5168839693069458, -0.15035605430603027, -0.3171960115432739, -0.6631942987442017, -0.3597283363342285, -0.05811285972595215, -0.2631187438964844, -0.22832512855529785, -0

In [6]:
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    pred_str = [p.lower() for p in pred_str]
    label_str = [l.lower() for l in label_str]

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"eval_wer": wer}


In [7]:
model.config.use_cache = False

training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name.strip("/")[1]}-pl",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=200,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    logging_steps=300,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_wer", 
    greater_is_better=False,
    push_to_hub=True,
    num_train_epochs=5,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics, 
    processing_class=processor
)


: 

In [7]:
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EncoderDecoderCache

# past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
# model.config.use_cache = False

# training_args = Seq2SeqTrainingArguments(
#     output_dir="whisper-tiny-pl03",
#     per_device_train_batch_size=16,
#     gradient_a ccumulation_steps=1,
#     learning_rate=1e-5,
#     warmup_steps=200,
#     max_steps=3000,
#     gradient_checkpointing=True,
#     fp16=True,
#     eval_strategy="steps",
#     per_device_eval_batch_size=8,
#     predict_with_generate=True,
#     generation_max_length=225,
#     save_steps=300,
#     eval_steps=300,
#     logging_steps=100,
#     report_to=["tensorboard"],
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_wer", 
#     greater_is_better=False,
#     push_to_hub=True,
# )

# # Create the trainer instance
# trainer = Seq2SeqTrainer(
#     args=training_args,
#     model=model,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics, 
#     tokenizer=processor
# )


In [2]:
import warnings
import logging
warnings.filterwarnings("ignore", message=".*is now deprecated.*")
PYTHONWARNINGS="ignore::DeprecationWarning"
logging.getLogger("transformers").setLevel(logging.ERROR)

trainer.train()

In [14]:
# from transformers import WhisperForConditionalGeneration, WhisperProcessor
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# # Path to the trained model
# model_path = r"C:\Users\kogut\PYTHONIK\Whisper-tiny-tunned\whisper-tiny-pl"

# # Load the trained model and processor
# model = WhisperForConditionalGeneration.from_pretrained(model_path)
# processor = WhisperProcessor.from_pretrained(model_path)

# # Dummy training arguments (no actual training or evaluation)
# training_args = Seq2SeqTrainingArguments(
#     output_dir=model_path,  # Use the same directory
#     do_train=False,
#     do_eval=False,
#     logging_dir="./logs",  # Optional: specify where logs should go
# )

# # Initialize the Trainer
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     tokenizer=processor,  # Attach the tokenizer/processor
# )

  trainer = Seq2SeqTrainer(


In [15]:
from huggingface_hub import notebook_login
# notebook_login(token="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")

kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_17_0",
    "dataset": "Common Voice 17.0", 
    "language": "pl",
    "model_name": "Whisper Tiny PL", 
    "finetuned_from": "openai/whisper-tiny",
    "tasks": "automatic-speech-recognition",
}

In [17]:
trainer.push_to_hub(
    **kwargs,
)



events.out.tfevents.1736104303.Marcin.15464.0:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/marcsixtysix/whisper-tiny-pl/commit/f5fe29fe20e0889a6a1c5dc3d0d7b0b061440e37', commit_message='End of training', commit_description='', oid='f5fe29fe20e0889a6a1c5dc3d0d7b0b061440e37', pr_url=None, repo_url=RepoUrl('https://huggingface.co/marcsixtysix/whisper-tiny-pl', endpoint='https://huggingface.co', repo_type='model', repo_id='marcsixtysix/whisper-tiny-pl'), pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline
import gradio as gr

pipe = pipeline(model="marcsixtysix/whisper-tiny-pl03")  # Change to your model

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),  # Removed 'source' argument
    outputs="text",
    title="Whisper Tiny Polish",
    description="Realtime demo for Polish speech recognition using a fine-tuned Whisper tiny model for mobile devices.",
)

iface.launch(share=True)
