In [None]:
!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting datasets[audio]
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets[audio])
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets[audio])
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting rapidfuzz<4,>=3 (from j

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook source: read it from the
# HF_TOKEN environment variable, or prompt for it (without echo) when unset.
login(
  token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "),
  add_to_git_credential=True
)


In [None]:
from google.colab import drive
import zipfile
import os

# Mount Google Drive at /content/drive; the dataset archive opened in a later
# cell is addressed through this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import tarfile

# Path of the tar.gz archive inside the mounted Google Drive
tar_file_path = '/content/drive/My Drive/Colab Notebooks/Datasets/GV_Eval_3h.tar.gz'

# Folder in the current Colab session that receives the extracted files
dataset_folder = '/content/dataset'
os.makedirs(dataset_folder, exist_ok=True)

# Extract the archive. Where the interpreter provides extraction filters, use
# the 'data' filter so archive members cannot write outside dataset_folder;
# older interpreters fall back to the unfiltered call.
with tarfile.open(tar_file_path, 'r:gz') as tar_ref:
    if hasattr(tarfile, 'data_filter'):
        tar_ref.extractall(dataset_folder, filter='data')
    else:
        tar_ref.extractall(dataset_folder)

print("File extracted successfully into the dataset folder!")

File extracted successfully into the dataset folder!


In [None]:
import os
import json
import librosa
import soundfile as sf
from datasets import Dataset, Audio
import re

def process_text_file(text_file_path):
    """
    Parse an annotation file into a mapping of audio ID -> transcription.

    Each useful line has the form ``<audio_id><whitespace><transcription>``.
    The ID is everything up to the first run of whitespace (spaces or tabs);
    the remainder of the line, stripped, is the transcription. Lines without
    both parts (blank lines, or an ID with no text) are skipped. If an ID
    occurs more than once, the last occurrence wins.

    Parameters:
    - text_file_path: Path to the UTF-8 encoded annotation file

    Returns:
    - dict mapping audio ID (str) to transcription (str)
    """
    annotations = {}
    # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading byte-order
    # mark that would otherwise become part of the first audio ID.
    with open(text_file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            # split(None, 1) splits on any whitespace run, so tab-separated
            # and multi-space-separated files are handled alike.
            parts = line.strip().split(None, 1)
            if len(parts) == 2:
                audio_id, text = parts
                annotations[audio_id] = text.strip()
    return annotations

def create_dataset(audio_dir, text_file_path, output_dir):
    """
    Create a Hugging Face dataset from audio files and their transcriptions.

    Every ``.mp3`` file in ``audio_dir`` whose ID has an entry in the
    annotation file is loaded, resampled to 16 kHz when needed, written to
    ``output_dir`` as ``resampled_<original file name>`` and added to the
    dataset. Files that fail to load or write are reported and skipped.

    Parameters:
    - audio_dir: Directory containing audio files
    - text_file_path: Path to the text file containing annotations
    - output_dir: Directory to save the resampled audio (created if missing)

    Returns:
    - Dataset with columns 'audio' (cast to Audio at 16 kHz), 'text', 'audio_id'
    """
    target_sr = 16000

    # Process text file
    annotations = process_text_file(text_file_path)

    # Make sure the destination for the resampled files exists before writing.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded shuffle/split downstream) reproducible.
    audio_files = sorted(f for f in os.listdir(audio_dir) if f.endswith('.mp3'))

    dataset_entries = []
    for audio_file in audio_files:
        # Prefer the full file stem as the ID so IDs containing dots are not
        # truncated; fall back to the text before the first dot.
        stem = os.path.splitext(audio_file)[0]
        audio_id = stem if stem in annotations else audio_file.split('.')[0]
        if audio_id not in annotations:
            continue

        audio_path = os.path.join(audio_dir, audio_file)
        resampled_path = os.path.join(output_dir, f"resampled_{audio_file}")

        try:
            # Load at the native rate, then resample only when required
            audio, orig_sr = librosa.load(audio_path, sr=None)
            if orig_sr != target_sr:
                audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

            # Save resampled audio
            sf.write(resampled_path, audio, target_sr)
        except Exception as e:
            # Best effort: report the broken file and keep going
            print(f"Error processing {audio_file}: {str(e)}")
            continue

        dataset_entries.append({
            'audio': resampled_path,
            'text': annotations[audio_id],
            'audio_id': audio_id
        })

    # Create Hugging Face dataset (column-oriented)
    dataset = Dataset.from_dict({
        'audio': [entry['audio'] for entry in dataset_entries],
        'text': [entry['text'] for entry in dataset_entries],
        'audio_id': [entry['audio_id'] for entry in dataset_entries]
    })

    # Cast audio column to Audio feature so paths are decoded on access
    dataset = dataset.cast_column('audio', Audio(sampling_rate=target_sr))

    return dataset

In [None]:
# Input and output locations (adjust to your environment)
audio_dir = '/content/dataset/GV_Eval_3h/Audio'
text_file_path = '/content/dataset/GV_Eval_3h/text'
output_dir = '/content/hugsdataset'
os.makedirs(output_dir, exist_ok=True)

# Build the dataset from the audio folder and the annotation file
dataset = create_dataset(audio_dir, text_file_path, output_dir)

# Persist it next to the resampled audio
saved_dataset_path = os.path.join(output_dir, 'final_dataset')
dataset.save_to_disk(saved_dataset_path)

# Report the size and show the first two rows
print(f"Dataset created with {len(dataset)} examples")
print("\nSample entries:")
print(dataset[:2])

Saving the dataset (0/1 shards):   0%|          | 0/1032 [00:00<?, ? examples/s]

Dataset created with 1032 examples

Sample entries:
{'audio': [{'path': '/content/hugsdataset/resampled_01-07315-02.mp3', 'array': array([ 4.13258839e-03, -2.49612331e-03, -1.18489750e-02, ...,
        7.72839849e-05,  3.29720846e-04,  1.73159526e-04]), 'sampling_rate': 16000}, {'path': '/content/hugsdataset/resampled_01-02287-01.mp3', 'array': array([-1.79739669e-04,  3.56930792e-02,  7.12691993e-02, ...,
       -6.18704362e-05,  5.76680759e-06, -7.60500552e-06]), 'sampling_rate': 16000}], 'text': ['वो साथ चलें बन कर साया वो साथ चलें बन कर साया उससे ही जीवन बनता आया वो अपनेपन की परिभाषा वो आशा वो अभिलाषा नारी के हैं रूप अनेक नारी', 'के लिए आज परीक्षा आयोजित की गई जिसमें अभ्यर्थियों का प्रमाणपत्र वेरिफिकेशन लिखित परीक्षा एवं साक्षात्कार का आयोजन किया गया विद्यालय परिसर में ही किया गया इस आयोजन में लगभग साठ अभियार्थियों ने योगदान किया मैं राजीव कुमार ठाकुर ग्राम राइसेर पोस्ट वाजिपुर ज़िला मुंगेर मुंगेर मोबाइल वाणी से धन्यवाद'], 'audio_id': ['01-07315-02', '01-02287-01']}


In [None]:
# Slice out the first row; slicing yields a dict of column name -> list of values.
print(dataset[:1])

{'audio': [{'path': '/content/hugsdataset/resampled_01-07315-02.mp3', 'array': array([ 4.13258839e-03, -2.49612331e-03, -1.18489750e-02, ...,
        7.72839849e-05,  3.29720846e-04,  1.73159526e-04]), 'sampling_rate': 16000}], 'text': ['वो साथ चलें बन कर साया वो साथ चलें बन कर साया उससे ही जीवन बनता आया वो अपनेपन की परिभाषा वो आशा वो अभिलाषा नारी के हैं रूप अनेक नारी'], 'audio_id': ['01-07315-02']}


In [None]:
def display_audio_text_pair(dataset, index):
    """
    Display and play an audio-text pair from the dataset in Colab.

    Prints the transcription, renders an inline audio player for the decoded
    waveform, then prints the audio ID and the clip duration in seconds.

    Parameters:
    - dataset: Hugging Face dataset with 'audio', 'text' and 'audio_id' columns
    - index: Index of the example to display
    """
    # Local import: this Audio is the IPython player widget and stays scoped to
    # this function, separate from any module-level name Audio.
    from IPython.display import Audio, display

    # Get the example
    example = dataset[index]

    # Display text
    print("📝 Transcription:")
    print(example['text'])
    print("\n🔊 Audio:")

    # Get audio array and sampling rate
    audio_array = example['audio']['array']
    sampling_rate = example['audio']['sampling_rate']

    # Create audio player
    display(Audio(audio_array, rate=sampling_rate))

    # Display metadata; duration = number of samples / samples per second
    print(f"\nℹ️ Audio ID: {example['audio_id']}")
    print(f"📊 Audio length: {len(audio_array)/sampling_rate:.2f} seconds")

In [None]:
# Show the transcription, audio player and metadata for the example at index 0
display_audio_text_pair(dataset, 0)

📝 Transcription:
वो साथ चलें बन कर साया वो साथ चलें बन कर साया उससे ही जीवन बनता आया वो अपनेपन की परिभाषा वो आशा वो अभिलाषा नारी के हैं रूप अनेक नारी

🔊 Audio:



ℹ️ Audio ID: 01-07315-02
📊 Audio length: 12.87 seconds


In [None]:
from datasets import load_from_disk

def split_dataset(dataset, test_size=0.2, seed=42):
    """
    Shuffle the dataset and divide it into a training part and a test part.

    Parameters:
    - dataset: The Hugging Face dataset to split
    - test_size: Proportion of dataset to include in the test split (default: 0.2)
    - seed: Random seed for reproducibility (default: 42)

    Returns:
    - The object returned by ``dataset.train_test_split``, holding the
      'train' and 'test' splits
    """
    splits = dataset.train_test_split(test_size=test_size, seed=seed, shuffle=True)

    # One report line per count, emitted together
    report = [
        f"Total examples: {len(dataset)}",
        f"Training examples: {len(splits['train'])}",
        f"Test examples: {len(splits['test'])}",
    ]
    print("\n".join(report))

    return splits

In [None]:
# Shuffle and split using split_dataset's defaults
dataset_dict = split_dataset(dataset)

# Bind each split to its own name for the cells below
train_dataset, test_dataset = dataset_dict['train'], dataset_dict['test']

Total examples: 1032
Training examples: 825
Test examples: 207


In [None]:
# Inspect the first training example (decoded audio, text, audio_id)
train_dataset[0]

{'audio': {'path': '/content/hugsdataset/resampled_02-13282-02.mp3',
  'array': array([-0.14027268, -0.1642369 , -0.05345406, ...,  0.01650552,
          0.01599734,  0.0066894 ]),
  'sampling_rate': 16000},
 'text': 'किये कहीं स्मार्ट क्लास के रूम के छत से पानी टपकता रहता है तो कहीं बैठने',
 'audio_id': '02-13282-02'}

In [None]:
from transformers import WhisperFeatureExtractor

# Feature extractor of the openai/whisper-small checkpoint; prepare_dataset
# uses it to turn raw audio arrays into model input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [None]:
from transformers import WhisperTokenizer

# Tokenizer of the same checkpoint, configured for Hindi transcription
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [None]:
# Round-trip the first training transcript through the tokenizer:
# encode to ids, then decode both with and without the special tokens.
input_str = train_dataset[0]["text"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

In [None]:
# Compare the original text with both decodings; "Are equal" checks that
# decoding without special tokens reproduces the input exactly.
print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

Input:                 किये कहीं स्मार्ट क्लास के रूम के छत से पानी टपकता रहता है तो कहीं बैठने
Decoded w/ special:    <|startoftranscript|><|hi|><|transcribe|><|notimestamps|>किये कहीं स्मार्ट क्लास के रूम के छत से पानी टपकता रहता है तो कहीं बैठने<|endoftext|>
Decoded w/out special: किये कहीं स्मार्ट क्लास के रूम के छत से पानी टपकता रहता है तो कहीं बैठने
Are equal:             True


In [None]:
# Token ids produced by the tokenizer for input_str
labels

[50258,
 50276,
 50359,
 50363,
 41858,
 33279,
 48268,
 21981,
 31970,
 44500,
 31881,
 31945,
 49316,
 27099,
 48521,
 17937,
 25411,
 27099,
 3941,
 253,
 31970,
 27099,
 46758,
 17937,
 45938,
 31970,
 21981,
 8485,
 108,
 8703,
 224,
 48521,
 31970,
 21981,
 8485,
 249,
 36158,
 49316,
 21981,
 8485,
 103,
 17937,
 35082,
 31881,
 8485,
 253,
 3941,
 103,
 41858,
 36158,
 17937,
 8485,
 108,
 44500,
 36158,
 17937,
 37139,
 43372,
 8485,
 97,
 33926,
 31970,
 44500,
 31881,
 31945,
 8485,
 105,
 43372,
 3941,
 254,
 35082,
 21981,
 50257]

In [None]:
from transformers import WhisperProcessor

# Processor for the same checkpoint; the data collator and the trainer below
# use its .feature_extractor and .tokenizer members.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")

In [None]:
def prepare_dataset(batch):
    """
    Turn one dataset example into model inputs and training labels.

    Reads the decoded audio from ``batch["audio"]`` (no resampling happens
    here; the sampling rate stored with the audio is passed through) and the
    transcript from ``batch["text"]``. Uses the module-level
    ``feature_extractor`` and ``tokenizer``.

    Adds to ``batch`` and returns it:
    - "input_features": first (only) feature item computed from the audio array
    - "labels": token ids of the transcript
    """
    sample = batch["audio"]

    # Features for the single waveform in this example
    extracted = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Transcript -> label ids
    encoded_text = tokenizer(batch["text"])
    batch["labels"] = encoded_text.input_ids

    return batch


In [None]:
# Show the splits and their columns before preprocessing
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'audio_id'],
        num_rows: 825
    })
    test: Dataset({
        features: ['audio', 'text', 'audio_id'],
        num_rows: 207
    })
})

In [None]:
# Apply prepare_dataset to every example of both splits and drop the original
# columns, leaving only what prepare_dataset adds. The name `common_voice` is
# just a variable name; the data is the dataset built earlier in this notebook.
common_voice = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict.column_names["train"], num_proc=1)


Map:   0%|          | 0/825 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

In [None]:
# Show the splits and their columns after preprocessing
common_voice

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 825
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 207
    })
})

In [None]:
def check_long_labels(dataset_dict, threshold=400):
    """
    Print every label sequence longer than ``threshold`` in the train and test
    splits, followed by a per-split count.

    Length is measured with ``len(label)``, i.e. in label ids (tokens), not in
    characters of the transcript.

    Parameters:
    - dataset_dict: Mapping with 'train' and 'test' entries, each exposing a
      'labels' column of id sequences
    - threshold: Maximum length threshold in tokens (default: 400)
    """
    separator = "-" * 50
    splits = (('train', 'Training set'), ('test', 'Test set'))
    long_counts = {}

    for position, (split, title) in enumerate(splits):
        # A blank line separates the second section from the first
        print(("\n" if position else "") + f"{title} long labels:")
        print(separator)

        long_counts[title] = 0
        for idx, label in enumerate(dataset_dict[split]['labels']):
            if len(label) > threshold:
                print(f"Index: {idx}")
                print(f"Length: {len(label)}")
                print(f"Label: {label}")
                print(separator)
                long_counts[title] += 1

    # Print summary
    print("\nSummary:")
    for title, count in long_counts.items():
        print(f"{title}: {count} labels exceed {threshold} tokens")

In [None]:
# Report label sequences longer than the default threshold in both splits
check_long_labels(common_voice)


Training set long labels:
--------------------------------------------------
Index: 370
Length: 403
Label: [50258, 50276, 50359, 50363, 3941, 105, 8703, 251, 21981, 3941, 245, 31881, 49316, 25411, 41858, 17937, 25411, 49316, 21981, 37139, 48521, 21981, 31945, 8485, 228, 3941, 114, 17937, 8485, 242, 25411, 8485, 231, 48521, 27099, 48521, 31881, 3941, 99, 37139, 43372, 31970, 31881, 8485, 229, 45938, 41858, 17937, 8485, 103, 17937, 46758, 35082, 31970, 25411, 8485, 103, 31945, 3941, 248, 17937, 48268, 36158, 31970, 33926, 8485, 237, 41858, 49316, 3941, 114, 41858, 27099, 36158, 8485, 103, 31945, 3941, 248, 17937, 48268, 36158, 8485, 105, 35082, 17937, 31970, 25411, 8485, 103, 31945, 3941, 248, 17937, 48268, 36158, 31970, 31881, 8485, 250, 35082, 36158, 17937, 3941, 241, 31945, 31970, 17937, 31970, 46758, 27099, 48268, 17937, 3941, 96, 37139, 33926, 3941, 245, 17937, 8485, 108, 33926, 3941, 250, 3941, 245, 17937, 25411, 48449, 33279, 46758, 21981, 3941, 245, 17937, 8485, 103, 46758, 17937

In [None]:
def truncate_label(example, max_length=445):
    """
    Cap the 'labels' sequence of one example at ``max_length`` items.

    Only the first ``max_length`` items are kept; everything after them,
    including whatever item ended the sequence, is dropped. The example is
    modified in place and also returned.

    Parameters:
    - example: Dataset example containing 'labels' field
    - max_length: Maximum allowed length for labels (default: 445)
    """
    labels = example['labels']
    # Guard clause: nothing to do when the sequence already fits
    if len(labels) <= max_length:
        return example

    example['labels'] = labels[:max_length]
    return example

def truncate_dataset_labels(dataset_dict, max_length=445):
    """
    Apply ``truncate_label`` to the 'train' and 'test' splits via ``map``.

    Parameters:
    - dataset_dict: Dictionary containing train and test datasets
    - max_length: Maximum allowed length for labels (default: 445)

    Returns:
    - Plain dict with keys 'train' and 'test' holding the mapped splits
    """
    def _clip(example):
        return truncate_label(example, max_length)

    # (split key, progress-bar caption), processed in this order
    plan = (
        ('train', "Processing training set"),
        ('test', "Processing test set"),
    )

    return {
        split: dataset_dict[split].map(_clip, desc=caption)
        for split, caption in plan
    }

# Example usage: the next cell applies truncate_dataset_labels to common_voice.



In [None]:
# Cap label length in both splits (default max_length of truncate_dataset_labels)
processed_dataset = truncate_dataset_labels(common_voice)

Processing training set:   0%|          | 0/825 [00:00<?, ? examples/s]

Processing test set:   0%|          | 0/207 [00:00<?, ? examples/s]

In [None]:
# processed_dataset is a plain dict of the two mapped splits
processed_dataset

{'train': Dataset({
     features: ['input_features', 'labels'],
     num_rows: 825
 }),
 'test': Dataset({
     features: ['input_features', 'labels'],
     num_rows: 207
 })}

In [None]:
# Show only the dimensions of the first example's feature matrix; displaying
# the full nested list floods the notebook output.
first_features = processed_dataset['train'][0]['input_features']
len(first_features), len(first_features[0])

[[0.5755787491798401,
  0.2784930467605591,
  0.08493930101394653,
  -0.21420395374298096,
  0.3027413487434387,
  0.060339152812957764,
  0.25886374711990356,
  0.20470327138900757,
  -0.0386807918548584,
  -0.3048290014266968,
  0.14486092329025269,
  0.14457857608795166,
  0.2986183166503906,
  0.1401720643043518,
  0.20447462797164917,
  0.10434293746948242,
  0.20486503839492798,
  0.2115505337715149,
  -0.048264265060424805,
  0.19732952117919922,
  0.13612717390060425,
  0.1592773199081421,
  0.18908393383026123,
  0.27430254220962524,
  0.23452073335647583,
  0.34249013662338257,
  0.34033626317977905,
  0.3302510380744934,
  0.31616443395614624,
  -0.031604886054992676,
  0.24835604429244995,
  0.09632891416549683,
  0.31122976541519165,
  0.25153565406799316,
  0.3025575876235962,
  0.13763439655303955,
  0.19488561153411865,
  0.31826114654541016,
  0.22105789184570312,
  0.22064673900604248,
  0.37806105613708496,
  0.39717841148376465,
  0.5839828252792358,
  0.45428985357

In [None]:
def verify_lengths(dataset_dict, max_length=445):
    """
    Report the longest label sequence per split and flag any split whose
    longest sequence exceeds ``max_length``.

    Parameters:
    - dataset_dict: Mapping with 'train' and 'test' entries, each exposing a
      'labels' column of id sequences
    - max_length: Maximum allowed label length (default: 445)

    An empty split is reported with a maximum length of 0.
    """
    splits = (('train', 'training set'), ('test', 'test set'))

    for position, (split, title) in enumerate(splits):
        # A blank line separates the second section from the first
        print(("\n" if position else "") + f"Verifying {title}:")

        # default=0 keeps an empty split from raising ValueError in max()
        longest = max((len(label) for label in dataset_dict[split]['labels']), default=0)
        print(f"Maximum label length in {title}: {longest}")

        # The actual verification against the limit
        if longest > max_length:
            print(f"WARNING: {title} has labels longer than max_length={max_length}")

# Print the longest label sequence of each split after truncation
verify_lengths(processed_dataset)

Verifying training set:
Maximum label length in training set: 445

Verifying test set:
Maximum label length in test set: 327


In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained openai/whisper-small checkpoint that will be fine-tuned
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [None]:
# Set language and task on the generation config used when generating
model.generation_config.language = "hindi"
model.generation_config.task = "transcribe"

# Clear forced_decoder_ids so language/task above are the only such setting
model.generation_config.forced_decoder_ids = None

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collate examples with "input_features" and "labels" into one padded batch.

    Audio features and label ids are padded separately, each by the matching
    component of ``processor``. Padded label positions are set to -100.
    """
    # Object exposing `feature_extractor.pad(...)` and `tokenizer.pad(...)`
    processor: Any
    # Id that is cut from the front of the labels when every row starts with it
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate the two parts of every example: they differ in length and
        # are padded by different components.
        audio_items = []
        label_items = []
        for feature in features:
            audio_items.append({"input_features": feature["input_features"]})
            label_items.append({"input_ids": feature["labels"]})

        # Audio side: returned as torch tensors by the feature extractor
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad, then overwrite every position the attention mask
        # does not mark with 1 by -100
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")
        is_padding = padded_labels.attention_mask != 1
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column only when every row starts with the decoder
        # start token
        leading_is_start = labels[:, 0] == self.decoder_start_token_id
        if bool(leading_is_start.all()):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance; the start-token id comes from the model config so the
# collator can strip it from labels that already begin with it.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
import evaluate

# Load the "wer" metric; compute_metrics below calls metric.compute on it
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    """
    Compute the "wer" metric, scaled by 100, for one evaluation pass.

    Parameters:
    - pred: object with ``predictions`` (generated ids) and ``label_ids``
      (reference ids in which ignored positions hold -100)

    Returns:
    - {"wer": value}

    Note: ``pred.label_ids`` is modified in place (-100 becomes the pad id).
    Uses the module-level ``tokenizer`` and ``metric``.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # -100 is not a real token id; swap it for the pad id before decoding
    ignored_positions = reference_ids == -100
    reference_ids[ignored_positions] = tokenizer.pad_token_id

    # Decode both sides to text, leaving special tokens out
    predicted_text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    reference_text = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    score = metric.compute(predictions=predicted_text, references=reference_text)
    return {"wer": 100 * score}

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. Evaluation runs every `eval_steps` steps and
# checkpoints are written every `save_steps` steps; the checkpoint with the
# lowest "wer" is reloaded at the end (greater_is_better=False).
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-hi",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=25,
    max_steps=200,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    # NOTE(review): labels are capped at 445 ids earlier in this notebook, so
    # generation limited to 225 may cut long predictions short - confirm intent.
    generation_max_length=225,
    save_steps=100,
    eval_steps=50,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)



In [None]:
from transformers import Seq2SeqTrainer

# Wire model, data, collator and metric together. `processing_class` replaces
# the deprecated `tokenizer` argument of the Trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning with the configuration above
trainer.train()

Step,Training Loss,Validation Loss,Wer
50,0.7309,0.657384,61.742354
100,0.5152,0.573903,57.42354
150,0.3875,0.548021,54.902688
200,0.3178,0.546276,54.087118


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use

TrainOutput(global_step=200, training_loss=0.530673348903656, metrics={'train_runtime': 1674.7124, 'train_samples_per_second': 1.911, 'train_steps_per_second': 0.119, 'total_flos': 9.1741298761728e+17, 'train_loss': 0.530673348903656, 'epoch': 3.8461538461538463})

In [None]:
# Load a model from /content/whisper-small-hi.
# NOTE(review): presumably the Trainer's output_dir "./whisper-small-hi"
# resolved against a working directory of /content - confirm.
trained_model = WhisperForConditionalGeneration.from_pretrained("/content/whisper-small-hi")

In [None]:
# Switch the model to evaluation mode; as the cell's last expression, the
# returned module is also displayed.
trained_model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [None]:
# Display the module structure of the loaded model
trained_model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [None]:
# Same generation settings as used for `model` before training
trained_model.generation_config.language = "hindi"
trained_model.generation_config.task = "transcribe"

# Clear forced_decoder_ids so language/task above are the only such setting
trained_model.generation_config.forced_decoder_ids = None

In [None]:
# Decoded waveform of the first dataset example
dataset[0]['audio']['array']


array([ 4.13258839e-03, -2.49612331e-03, -1.18489750e-02, ...,
        7.72839849e-05,  3.29720846e-04,  1.73159526e-04])

In [None]:
import torch
import librosa
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperForConditionalGeneration

def load_audio(audio_path, target_sampling_rate=16000):
    """
    Read an audio file and return it at ``target_sampling_rate``.

    The file is loaded at its native rate and resampled only when that rate
    differs from the target.

    Returns:
    - dict with "array" (the waveform) and "sampling_rate" (the target rate)
    """
    waveform, native_rate = librosa.load(audio_path, sr=None)

    rate_mismatch = native_rate != target_sampling_rate
    if rate_mismatch:
        waveform = librosa.resample(
            waveform,
            orig_sr=native_rate,
            target_sr=target_sampling_rate,
        )

    return dict(array=waveform, sampling_rate=target_sampling_rate)

def transcribe_audio(audio_path, model_name="openai/whisper-large-v3"):
    """
    Transcribe a single audio file as Hindi with a Whisper checkpoint.

    Parameters:
    - audio_path: Path of the audio file to transcribe
    - model_name: Checkpoint used for the feature extractor, the tokenizer
      AND the model (default: "openai/whisper-large-v3")

    Returns:
    - The decoded text of the first generated sequence. Special tokens are
      kept in the string (skip_special_tokens=False).

    Also prints the forced decoder prompt ids and the generated ids.
    """
    # Load all three components from the same checkpoint so the features the
    # extractor produces match what the model expects.
    feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
    tokenizer = WhisperTokenizer.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)

    # Move model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    model.eval()

    # Load and process audio
    audio = load_audio(audio_path)

    # Extract features
    input_features = feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # Convert to tensor and add batch dimension
    input_features = torch.tensor(input_features).unsqueeze(0).to(device)

    # Prompt ids that pin language and task; computed once and reused below
    forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language="hi", task="transcribe")

    # Generate tokens
    generated_ids = model.generate(
        input_features,
        max_length=445,  # same cap as the label truncation used in this notebook
        num_beams=5,
        early_stopping=True,
        forced_decoder_ids=forced_decoder_ids
    )

    # Decode the generated tokens
    transcription = tokenizer.batch_decode(
        generated_ids,
        skip_special_tokens=False,
        clean_up_tokenization_spaces=True
    )[0]

    # Diagnostic output: prompt ids and raw generated ids
    print("==========================================")
    print(forced_decoder_ids)
    print("==========================================")

    print("==========================================")
    print(generated_ids)
    print("==========================================")

    return transcription

# Example usage: transcribe one local file with the default checkpoint.
# Any exception is caught and reported as a message instead of a traceback.
if __name__ == "__main__":
    audio_path = "/content/call11.mp3"
    try:
        transcription = transcribe_audio(audio_path)
        print(f"Transcription: {transcription}")
    except Exception as e:
        print(f"Error during transcription: {str(e)}")

[(1, 50276), (2, 50360), (3, 50364)]
tensor([[50258, 50276, 50360, 50364,  8485,   228,  3941,   103, 41858, 33926,
          8485,   228,  3941,   103, 41858, 33926,  8485,   228,  3941,   103,
         41858, 33926,  8485,   228,  3941,   103, 41858, 33926,  8485,   228,
          3941,   103, 41858, 33926,  8485,   228,  3941,   103, 41858, 33926,
          8485,   228,  3941,   103, 41858, 33926]], device='cuda:0')
Transcription: <|startoftranscript|><|hi|><|transcribe|><|notimestamps|> आपको आपको आपको आपको आपको आपको आपको
