In [None]:
from transformers import BertTokenizer, SpeechT5Processor
import torch
# Hold out 20% of the examples for evaluation; the rest is used for training.
# NOTE(review): `dataset` is rebound to the split result here, so re-running
# this cell operates on the split object rather than the original dataset.
dataset = dataset.train_test_split(test_size=0.2)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

# Text side: tokenizer loaded from the "bert-base-uncased" checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Audio side: processor loaded from the "microsoft/speecht5_asr" checkpoint.
speech_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")


def preprocess_data(example, *, max_length=128, sampling_rate=16000):
    """Turn one raw example into tokenized text and speech features.

    The text is tokenized with the module-level ``bert_tokenizer`` and the
    waveform is run through the module-level ``speech_processor``. Both are
    asked for PyTorch tensors, and the leading batch axis they add for a
    single example is stripped before returning.

    Args:
        example: Mapping with a ``"text"`` entry and an ``"audio"`` entry whose
            ``"array"`` holds the raw waveform.
        max_length: Token length the text is padded/truncated to.
        sampling_rate: Sampling rate (Hz) declared to the speech processor.
            NOTE(review): the waveform is assumed to already be at this rate;
            confirm against the dataset's audio column (resample if not).

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and
        ``"speech_features"`` tensors.
    """
    text_inputs = bert_tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # The processor picks its input from the `audio=` / `text=` keywords;
    # a positional waveform is not recognised as audio input, so it must be
    # passed by keyword.
    audio_features = speech_processor(
        audio=example["audio"]["array"],
        sampling_rate=sampling_rate,
        return_tensors="pt",
    )

    # squeeze(0) drops only the batch axis. A bare squeeze() would also
    # collapse any other axis that happens to have length 1.
    return {
        "input_ids": text_inputs["input_ids"].squeeze(0),
        "attention_mask": text_inputs["attention_mask"].squeeze(0),
        "speech_features": audio_features.input_values.squeeze(0),
    }


# Apply the preprocessing to each split, dropping the raw "text" and "audio"
# columns once their derived features have been produced.
train_dataset = train_dataset.map(
    preprocess_data,
    remove_columns=["text", "audio"],
)
eval_dataset = eval_dataset.map(
    preprocess_data,
    remove_columns=["text", "audio"],
)
