In [None]:
import pandas as pd
import Levenshtein as lev
from jiwer import wer
import string
import re

def preprocess_text(text):
    """Normalise a transcript before computing error metrics.

    The ellipsis character and every ASCII punctuation mark are dropped,
    and the result is lower-cased. The Turkish capitals "İ" and "I" are
    mapped explicitly to "i" and "ı" first, because the generic
    ``str.lower`` would not produce those Turkish lowercase forms.

    Args:
        text: The transcript string to normalise.

    Returns:
        The normalised, lower-cased string.
    """
    # One table handles everything: two character mappings plus the set of
    # characters to delete (ASCII punctuation and the ellipsis).
    normalisation_table = str.maketrans("İI", "iı", string.punctuation + "…")
    return text.translate(normalisation_table).lower()

def remove_first_char(text, *, strip_first=False):
    """Optionally drop the first character of a transcription.

    Args:
        text: The transcription string.
        strip_first: When True, return ``text`` without its first character
            (use this for output of the stock Whisper model). When False,
            the default, return ``text`` unchanged (use this for output of
            the fine-tuned model).

    Returns:
        ``text`` unchanged, or ``text[1:]`` when ``strip_first`` is True.
        Empty input is returned as-is in both modes.
    """
    # NOTE(review): presumably stock Whisper output starts with a stray
    # leading character (e.g. a space) -- confirm against real model output.
    if strip_first and text:
        return text[1:]
    return text

def levenshtein_distance(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    Delegates to ``distance`` from the ``Levenshtein`` package (imported in
    this file as ``lev``).
    """
    edit_distance = lev.distance(s1, s2)
    return edit_distance

# Convert a non-negative integer to its Turkish cardinal spelling.
# The number is split into groups of three digits; each group is spelled
# out and followed by its scale word (bin, milyon, milyar).
def number_to_turkish_text(number):
    """Spell out a non-negative integer in Turkish words.

    Args:
        number: An ``int`` or a string of decimal digits. Leading zeros in a
            string are ignored ("007" is read as 7).

    Returns:
        The Turkish cardinal as space-separated words with no leading or
        trailing whitespace, e.g. ``1923 -> "bin dokuz yüz yirmi üç"``.
        Zero yields ``"sıfır"``. Input that is not purely decimal digits
        (signs, decimals, empty string) and values of 10**12 or more are
        returned unchanged as ``str(number)``.
    """
    units = ["", "bir", "iki", "üç", "dört", "beş", "altı", "yedi", "sekiz", "dokuz"]
    tens = ["", "on", "yirmi", "otuz", "kırk", "elli", "altmış", "yetmiş", "seksen", "doksan"]
    # Scale word for each three-digit group, least significant group first.
    scales = ["", "bin", "milyon", "milyar"]

    digits = str(number)
    # isdecimal() is False for "" and for anything int() could not parse
    # digit-by-digit (signs, decimal points, letters).
    if not digits.isdecimal():
        return digits

    value = int(digits)
    if value == 0:
        return "sıfır"
    if value >= 1000 ** len(scales):
        # Larger than the biggest scale word we know; leave as digits.
        return digits

    # Split into three-digit groups, least significant first.
    groups = []
    while value:
        value, group = divmod(value, 1000)
        groups.append(group)

    words = []
    for index in range(len(groups) - 1, -1, -1):
        group = groups[index]
        if group == 0:
            continue

        hundreds, remainder = divmod(group, 100)
        ten, unit = divmod(remainder, 10)

        group_words = []
        if hundreds:
            # Turkish says "yüz" for 100, not "bir yüz".
            if hundreds > 1:
                group_words.append(units[hundreds])
            group_words.append("yüz")
        if ten:
            group_words.append(tens[ten])
        if unit:
            group_words.append(units[unit])

        # Likewise 1000 is "bin", never "bir bin" (but "bir milyon" is kept).
        if index == 1 and group == 1:
            group_words = []

        words.extend(group_words)
        if scales[index]:
            words.append(scales[index])

    return " ".join(words)

def convert_numbers_to_text(text):
    """Replace digit sequences in ``text`` with Turkish words.

    Two passes are made over the text:

    1. ``%N`` becomes ``"yüzde <N in words>"``.
    2. Every remaining digit run, together with any ``.``, ``,`` or ``'``
       characters directly after it, is replaced. A run followed by exactly
       one ``.`` is read as an ordinal ("3." -> "üçüncü"); otherwise the
       cardinal is emitted and the trailing punctuation is kept.

    Each match is substituted in place, so a replacement can never land on a
    different occurrence of the same digits elsewhere in the text.

    Args:
        text: The string to convert.

    Returns:
        The text with numbers spelled out. Cardinal spelling is delegated to
        ``number_to_turkish_text``.
    """
    # Single-digit ordinals, looked up directly by the digit.
    ordinal_suffixes = {
        '0': 'sıfırıncı', '1': 'birinci', '2': 'ikinci', '3': 'üçüncü',
        '4': 'dördüncü', '5': 'beşinci', '6': 'altıncı', '7': 'yedinci',
        '8': 'sekizinci', '9': 'dokuzuncu'
    }
    # Ordinal form of every word a spelled-out cardinal can end with; only
    # the last word of a number takes the ordinal suffix, and the suffix
    # follows vowel harmony, so it cannot be a single fixed string.
    ordinal_words = {
        "sıfır": "sıfırıncı", "bir": "birinci", "iki": "ikinci",
        "üç": "üçüncü", "dört": "dördüncü", "beş": "beşinci",
        "altı": "altıncı", "yedi": "yedinci", "sekiz": "sekizinci",
        "dokuz": "dokuzuncu", "on": "onuncu", "yirmi": "yirminci",
        "otuz": "otuzuncu", "kırk": "kırkıncı", "elli": "ellinci",
        "altmış": "altmışıncı", "yetmiş": "yetmişinci",
        "seksen": "sekseninci", "doksan": "doksanıncı",
        "yüz": "yüzüncü", "bin": "bininci", "milyon": "milyonuncu",
        "milyar": "milyarıncı",
    }

    def replace_percentage(match):
        # "%25" -> "yüzde yirmi beş"
        converted_number = number_to_turkish_text(int(match.group(1)))
        return f"yüzde {converted_number}"

    def replace_number(match):
        number, punct = match.groups()
        converted_number = number_to_turkish_text(number)
        if punct != '.':
            return converted_number + punct

        # NOTE(review): a number that merely ends a sentence ("... 5.") is
        # also read as an ordinal here; the two cases are indistinguishable
        # at this level.
        if number in ordinal_suffixes:
            return ordinal_suffixes[number]
        words = converted_number.split()
        if words and words[-1] in ordinal_words:
            words[-1] = ordinal_words[words[-1]]
            return " ".join(words)
        # The number could not be spelled out (still digits); keep the
        # historical generic suffix.
        return converted_number + "ıncı"

    text = re.sub(r"%(\d+)", replace_percentage, text)
    return re.sub(r"(\d+)([.,']*)", replace_number, text)

def handle_special_apostrophes(text):
    """Strip apostrophes, fixing up known irregular suffixed forms first.

    Some words change their stem when a suffix is attached ("dört'ü" is
    written "dördü"), so those are rewritten before every remaining
    apostrophe is removed.

    Args:
        text: The string to clean.

    Returns:
        The string without any ``'`` characters.
    """
    irregular_forms = (
        ("dört'ü", "dördü"),
    )
    cleaned = text
    for apostrophed, merged in irregular_forms:
        cleaned = cleaned.replace(apostrophed, merged)
    # Everything else simply loses its apostrophe.
    return cleaned.replace("'", "")

# Read the raw transcription results.
df = pd.read_csv('transcription_results_larger_than_50_tr_250_finetuned_whisper.csv')

# The reference text only needs the common normalisation.
df['expected_transcription'] = df['expected_transcription'].apply(preprocess_text)

# The model output goes through the full clean-up chain, in this order:
# spell out numbers, resolve apostrophes, normalise, then optionally drop
# the first character.
df['transcription'] = (
    df['transcription']
    .apply(convert_numbers_to_text)
    .apply(handle_special_apostrophes)
    .apply(preprocess_text)
    .apply(remove_first_char)
)

# Per-row metrics comparing the reference with the cleaned model output.
df['levenshtein_distance'] = df.apply(
    lambda record: levenshtein_distance(
        record['expected_transcription'], record['transcription']
    ),
    axis=1,
)
df['word_error_rate'] = df.apply(
    lambda record: wer(record['expected_transcription'], record['transcription']),
    axis=1,
)

# Persist the frame with the metric columns added, then show it.
df.to_csv('transcription_results_with_metrics_50_tr_md2.csv', index=False)
print(df)

In [None]:
# Mean word error rate over all rows of df.
df["word_error_rate"].mean()

In [None]:
# Mean Levenshtein distance over all rows of df.
df["levenshtein_distance"].mean()

In [None]:
# Mean length (via len) of the preprocessed expected transcriptions; gives
# scale to the mean Levenshtein distance.
df['expected_transcription'].apply(len).mean()

In [None]:
# Ratio of the mean expected-transcription length to the mean
# model-transcription length (both measured with len).
df['expected_transcription'].apply(len).mean() / df['transcription'].apply(len).mean()