In [27]:
import pandas as pd
import numpy as np
from Levenshtein import distance as levenshtein_distance
from difflib import SequenceMatcher

# Load the transcription comparison table from the CSV in the working directory.
df = pd.read_csv('transcription_comparison.csv')

# Preview the first rows as plain text.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

def levenshtein_similarity(a, b):
    """Return a length-normalised similarity derived from the edit distance.

    The Levenshtein distance between ``a`` and ``b`` is divided by the length
    of the longer input and subtracted from 1, so a distance of zero yields
    1.0.

    Args:
        a: First sequence (anything supporting ``len``).
        b: Second sequence (anything supporting ``len``).

    Returns:
        The similarity score as a float; 1.0 when both inputs are empty.
    """
    longest = max(len(a), len(b))
    if longest:
        return 1 - levenshtein_distance(a, b) / longest
    # Two empty inputs have nothing to differ on; treat them as identical and
    # avoid dividing by zero.
    return 1.0

def sequence_matching_ratio(a, b):
    """Return difflib's similarity ratio between ``a`` and ``b``.

    Args:
        a: First sequence to compare.
        b: Second sequence to compare.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the two inputs, computed
        with no junk filter.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# Score every candidate column against the 'ground_truth' column and collect
# one summary record (average of each metric) per method.
similarity_data = []

# Pick the candidates by name rather than by position, so the comparison stays
# correct even when 'ground_truth' is not the first column of the CSV.
candidate_columns = [name for name in df.columns if name != 'ground_truth']

for column in candidate_columns:
    # Pair each reference value with the candidate value on the same row; a
    # single pass over the zipped columns replaces two row-wise df.apply calls.
    value_pairs = list(zip(df['ground_truth'], df[column]))

    levenshtein_similarities = pd.Series(
        [levenshtein_similarity(reference, candidate) for reference, candidate in value_pairs],
        index=df.index,
        dtype=float,
    )
    sequence_similarities = pd.Series(
        [sequence_matching_ratio(reference, candidate) for reference, candidate in value_pairs],
        index=df.index,
        dtype=float,
    )

    avg_levenshtein = levenshtein_similarities.mean()
    avg_sequence = sequence_similarities.mean()

    similarity_data.append({
        'Method': column,
        'Avg_Levenshtein_Similarity': avg_levenshtein,
        'Avg_Sequence_Matching': avg_sequence
    })

# One row per method, with the two averaged similarity scores as columns.
similarities_df = pd.DataFrame(similarity_data)


  ground_truth transcribus_result     gpt-4o gpt-4o_with_transkribus
0    distincta          distincta  distincta               distincta
1      maculis            maculis    maculis                 maculis
2            ,                  .          .                       ,
3      virtute               unum       unum                 virtute
4           et          crudelior  crudelior                      et
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ground_truth             255 non-null    object
 1   transcribus_result       255 non-null    object
 2   gpt-4o                   255 non-null    object
 3   gpt-4o_with_transkribus  255 non-null    object
dtypes: object(4)
memory usage: 8.1+ KB
None


In [28]:
# Show the per-method average similarity table as plain text.
print(similarities_df)

                    Method  Avg_Levenshtein_Similarity  Avg_Sequence_Matching
0       transcribus_result                    0.317625               0.363472
1                   gpt-4o                    0.317625               0.363472
2  gpt-4o_with_transkribus                    0.694865               0.714168


In [30]:
# Persist the summary table to CSV; index=False leaves out the row-index column.
similarities_df.to_csv('similarity_data.csv', index=False)