In [None]:
import os
from glob import glob

import pandas as pd
import whisper
from tqdm import tqdm

# Loading files

In [None]:
# Directory holding the input recordings; a relative path, so it is resolved
# against the notebook's working directory.
DATASET_PATH='../data/audios/'
# glob() does not guarantee any particular ordering (it depends on the file
# system), so sort the matches to keep the processing order -- and therefore
# the row order of the exported CSV -- reproducible between runs and machines.
DATASET_FILES = sorted(glob(DATASET_PATH + '*.mp3'))

# Using Whisper

In [None]:
# Load the "medium" Whisper checkpoint once; the transcription cell below reuses it for every file.
model = whisper.load_model(name="medium")

In [None]:
# Transcribe every audio file in DATASET_FILES and export one CSV row per file
# with its ID, transcription and detected language.
OUTPUT_COLUMNS = ['audio_id', 'transcription', 'detected_language']
final_dataset = []

for audio_file in tqdm(DATASET_FILES):
    # Derive the ID from the file name itself (directory and extension removed).
    # Unlike chained str.replace calls, this does not depend on the path
    # separator glob returned and never removes a '.mp3' that appears in the
    # middle of a file name.
    audio_id = os.path.splitext(os.path.basename(audio_file))[0]

    # Decode the file once and hand the waveform to transcribe(), which accepts
    # an array as well as a path, so the audio is not decoded a second time.
    audio = whisper.load_audio(audio_file)

    # With no `language` option, transcribe() detects the language from the
    # start of the audio and reports it under "language" in its result, so a
    # separate detect_language pass (and a hand-built mel spectrogram whose
    # size must match the loaded checkpoint) is unnecessary.
    result = model.transcribe(audio)

    final_dataset.append({
        'audio_id': audio_id,
        'transcription': result["text"].strip(),
        'detected_language': result["language"],
    })

# Passing the columns explicitly keeps the CSV header present even when no
# audio files were found and the record list is empty.
final_dataset_df = pd.DataFrame(final_dataset, columns=OUTPUT_COLUMNS)
final_dataset_df.to_csv('transcripted_audios.csv', index=False)