In [None]:
import tarfile

# Unpack the Common Voice archive into mp3s/.
with tarfile.open("common_voice_17.gz", "r:gz") as tar:
    # Archive members may carry absolute paths or ".." components that would
    # be written outside the target directory. The "data" extraction filter
    # rejects such members; interpreters that predate extraction filters do
    # not expose tarfile.data_filter, so fall back to the plain call there.
    if hasattr(tarfile, "data_filter"):
        tar.extractall("mp3s/", filter="data")
    else:
        tar.extractall("mp3s/")

In [None]:
import whisper
import os
import pandas as pd

# Load the Whisper "large-v3" model once at module level; transcribe_audio()
# below reuses this single instance for every file.
model = whisper.load_model("large-v3")

def transcribe_audio(file_path, language: str = "tr"):
    """Transcribe one audio file with the module-level Whisper ``model``.

    Args:
        file_path: Path of the audio file handed to ``model.transcribe``.
        language: Language code forwarded to ``model.transcribe``. Defaults
            to ``"tr"``, the value that was previously hard-coded, so
            existing one-argument calls behave exactly as before.

    Returns:
        The ``"text"`` entry of the result returned by ``model.transcribe``.
    """
    result = model.transcribe(file_path, language=language)
    return result["text"]

# Transcribe a sample of the Common Voice v17 Turkish test split and store the
# expected/actual transcription pairs in a CSV file.
audio_folder = "mp3s/cv-corpus-17.0-2024-03-15/tr/clips/"
test_file_path = "mp3s/cv-corpus-17.0-2024-03-15/tr/test.tsv"
save_to = "transcription_results_larger_than_50_tr.csv"

# Only sentences strictly longer than this many characters are evaluated.
MIN_SENTENCE_LENGTH = 50
# Upper bound on the number of clips transcribed in this run.
SAMPLE_SIZE = 50
# Fixed seed so that every run draws the same clips.
RANDOM_SEED = 42
# Column layout of the results file. Passed explicitly so the CSV still gets
# a header row when not a single clip could be transcribed.
RESULT_COLUMNS = ["file", "expected_transcription", "transcription"]

test_data = pd.read_csv(test_file_path, sep="\t")

# Keep only the rows whose sentence is longer than MIN_SENTENCE_LENGTH.
test_data = test_data[test_data["sentence"].str.len() > MIN_SENTENCE_LENGTH]

# Randomly draw SAMPLE_SIZE rows when more are available; a smaller set is
# used in full.
if len(test_data) > SAMPLE_SIZE:
    test_data = test_data.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

# One dict per successfully transcribed clip.
transcriptions = []

# Only the "path" and "sentence" columns are needed, so iterate over those
# two directly instead of materialising a Series per row with iterrows().
for audio_file, expected_transcription in zip(
    test_data["path"], test_data["sentence"]
):
    audio_path = os.path.join(audio_folder, audio_file)

    # Clips listed in the TSV but missing on disk are reported and skipped.
    if not os.path.exists(audio_path):
        print(f"File {audio_file} not found.")
        continue

    transcriptions.append({
        "file": audio_file,
        "expected_transcription": expected_transcription,
        "transcription": transcribe_audio(audio_path),
    })

results_df = pd.DataFrame(transcriptions, columns=RESULT_COLUMNS)
results_df.to_csv(save_to, index=False)
print(results_df)
