In [None]:
import tarfile

# Unpack the downloaded archive into mp3s/.
with tarfile.open("common_voice_17.gz", "r:gz") as tar:
    # The "data" extraction filter (PEP 706) refuses members that would be
    # written outside the destination directory (absolute paths, "..", links
    # pointing elsewhere). Interpreters that predate the feature do not accept
    # the `filter` argument, so fall back to the legacy call there.
    if hasattr(tarfile, "data_filter"):
        tar.extractall("mp3s/", filter="data")
    else:
        tar.extractall("mp3s/")

In [None]:
import os
import pandas as pd
import torch
import torchaudio
import soundfile as sf
from transformers import AutoProcessor, WhisperForConditionalGeneration

# Fine-tuned Whisper checkpoint on the Hugging Face Hub; the processor and the
# model weights are both fetched from the same repository.
checkpoint = "erdiyalcin/whisper-large-v3-turkish-test1"
processor = AutoProcessor.from_pretrained(checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

def transcribe_audio(file_path):
    """Transcribe a single audio file with the module-level Whisper model.

    The file is read with ``soundfile``, downmixed to mono when it has more
    than one channel, resampled to 16 kHz when necessary, and then passed
    through the module-level ``processor`` and ``model``.

    Args:
        file_path: Path to an audio file readable by ``soundfile``.

    Returns:
        The decoded transcription of the audio, with special tokens removed.
    """
    target_rate = 16000

    # Load the audio file
    audio_input, sampling_rate = sf.read(file_path)

    # soundfile returns a (frames, channels) array for multi-channel files.
    # Average the channels so that resampling runs along the time axis and
    # the processor receives a single one-dimensional waveform.
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)

    # Resample to the rate that is reported to the processor below
    if sampling_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
        audio_input = resampler(torch.tensor(audio_input, dtype=torch.float32)).numpy()
        sampling_rate = target_rate

    # Process the audio
    # https://huggingface.co/docs/transformers/main_classes/pipelines
    # https://huggingface.co/docs/transformers/main/en/model_doc/whisper#transformers.WhisperForConditionalGeneration
    inputs = processor(audio_input, return_tensors="pt", sampling_rate=sampling_rate)
    input_features = inputs.input_features

    # Whisper's generate() names its audio argument `input_features`; the
    # generic `inputs` keyword is deprecated for this model.
    generated_ids = model.generate(input_features=input_features)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription

# Paths
# https://commonvoice.mozilla.org/en/datasets (Turkish, 17.0), extracted to mp3s/cv-corpus-17.0-2024-03-15
audio_folder = "mp3s/cv-corpus-17.0-2024-03-15/tr/clips/"
test_file_path = "mp3s/cv-corpus-17.0-2024-03-15/tr/test.tsv"
save_to = "transcription_results_larger_than_50_tr_250_finetuned_whisper.csv"

# Load test data
test_data = pd.read_csv(test_file_path, sep="\t")

# Keep only sentences that are longer than 50 characters
test_data = test_data[test_data["sentence"].str.len() > 50]

# Randomly pick 250 samples if there are more than 250 samples
# (fixed seed so the same subset is selected on every run)
if test_data.shape[0] > 250:
    test_data = test_data.sample(n=250, random_state=42)

# a list to store the results
transcriptions = []

# Transcribe each audio file and store the results
for audio_file, expected_transcription in zip(test_data["path"], test_data["sentence"]):
    audio_path = os.path.join(audio_folder, audio_file)

    # Skip clips that are listed in the TSV but missing on disk
    if not os.path.exists(audio_path):
        print(f"File {audio_file} not found.")
        continue

    transcription = transcribe_audio(audio_path)
    transcriptions.append({
        "file": audio_file,
        "expected_transcription": expected_transcription,
        "transcription": transcription,
    })

# Create a DataFrame from the results and save it to a CSV file.
# The columns are listed explicitly so that the CSV still has a header row
# when no clip could be transcribed.
result_columns = ["file", "expected_transcription", "transcription"]
results_df = pd.DataFrame(transcriptions, columns=result_columns)
results_df.to_csv(save_to, index=False)
print(results_df)