In [2]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, Trainer, TrainingArguments
import soundfile as sf
import torch
from datasets import load_dataset

model_name = "ai4bharat/indicwav2vec_v1_bengali"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

Downloading tokenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/940 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


Downloading config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

In [4]:
def read_audio(file_path):
    speech, _ = sf.read(file_path)
    return speech

def prepare_dataset(batch):
    batch["speech"] = read_audio(batch["path"])
    batch["input_values"] = tokenizer(batch["speech"], return_tensors="pt", padding="longest").input_values[0]
    return batch

data_dir = "/kaggle/input/bengaliai-speech/"

data_files = {
    "train": data_dir + "train.csv",
    "validation": data_dir + "validation.csv"
}
dataset = load_dataset("csv", data_files=data_files)
dataset = dataset.map(prepare_dataset)

training_args = TrainingArguments(
  output_dir="./results",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=1,
  save_steps=400,
  eval_steps=400,
  logging_steps=400
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"]
)

trainer.train()

def test_model(file_path):
    speech = read_audio(file_path)
    input_values = tokenizer(speech, return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])
    print("Recognized Speech:", transcription)

model.save_pretrained("./fine_tuned_model")

FileNotFoundError: Unable to find '/kaggle/input/bengaliai-speech/validation.csv' at /kaggle/working