In [5]:
import Levenshtein
import os

# Directories holding the ground-truth texts and the predicted texts.
# Files are paired by identical filename across the two directories.
gt = "../data/gt"
pred = "../data/pred"

# sorted(): os.listdir returns entries in arbitrary order, and the report
# lines below do not carry the filename, so a stable order is the only way
# to tell which result belongs to which file.
for filename in sorted(os.listdir(gt)):
    # Skip anything that is not a text file. The reporting code must live
    # under this guard: otherwise a non-.txt entry would re-report the
    # previous file's texts (or raise NameError if it came first).
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(pred, filename), encoding="utf-8") as f_pred, open(os.path.join(gt, filename), encoding="utf-8") as f_gt:
        pred_text = f_pred.read()
        gt_text = f_gt.read()

    print("-" * 40)
    # Edit distance between the predicted text and the ground truth.
    distance = Levenshtein.distance(pred_text, gt_text)
    print(f"Levenshtein distance: {distance}")

    # Normalized similarity as returned by Levenshtein.ratio.
    similarity = Levenshtein.ratio(pred_text, gt_text)
    print(f"Similarity ratio: {similarity:.2f}")


----------------------------------------
Levenshtein distance: 57
Similarity ratio: 0.82
----------------------------------------
Levenshtein distance: 85
Similarity ratio: 0.68
----------------------------------------
Levenshtein distance: 83
Similarity ratio: 0.77
----------------------------------------
Levenshtein distance: 121
Similarity ratio: 0.72
----------------------------------------
Levenshtein distance: 73
Similarity ratio: 0.78
----------------------------------------
Levenshtein distance: 86
Similarity ratio: 0.60
----------------------------------------
Levenshtein distance: 75
Similarity ratio: 0.82
----------------------------------------
Levenshtein distance: 194
Similarity ratio: 0.59
----------------------------------------
Levenshtein distance: 104
Similarity ratio: 0.77


In [33]:
import json
def card_json_to_text(file_path):
    """Render a card JSON file as a plain-text block.

    Args:
        file_path: Path to a UTF-8 encoded JSON file containing an object.

    Returns:
        A string laid out as: name, blank line, description, blank line,
        "<index> <rarity>", then footer. Any of the keys 'name',
        'description', 'index', 'rarity' or 'footer' missing from the JSON
        object is rendered as an empty string.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        card = json.load(handle)

    # Look up every field once, defaulting absent keys to the empty string.
    wanted = ('name', 'description', 'index', 'rarity', 'footer')
    fields = {key: card.get(key, '') for key in wanted}

    header = f"{fields['name']}\n\n{fields['description']}"
    trailer = f"{fields['index']} {fields['rarity']}\n{fields['footer']}"
    return f"{header}\n\n{trailer}"


# Check that each card JSON in the ground-truth folder renders to the same
# text as the .txt file sharing its stem (e.g. "5.json" vs "5.txt").
data_path = "../data/gt/"
# sorted(): os.listdir order is arbitrary; sorting keeps the report stable.
json_files = sorted(f for f in os.listdir(data_path) if f.endswith('.json'))
txt_files = sorted(f for f in os.listdir(data_path) if f.endswith('.txt'))

# Pair files by stem rather than by position: zipping two independently
# listed sequences compares unrelated files as soon as one side has an
# extra/missing entry or the listing orders differ.
txt_available = set(txt_files)
for json_file in json_files:
    txt_file = os.path.splitext(json_file)[0] + ".txt"

    print("-" * 40)
    if txt_file not in txt_available:
        print(f"No matching .txt file for {json_file}")
        continue

    card_text = card_json_to_text(os.path.join(data_path, json_file))
    with open(os.path.join(data_path, txt_file), "r", encoding="utf-8") as f:
        file_text = f.read()

    # Leading/trailing whitespace is ignored on both sides of the comparison.
    print(f"File match {txt_file}: {card_text.strip() == file_text.strip()}")

# Surface .txt files that have no JSON source instead of silently dropping them.
json_stems = {os.path.splitext(name)[0] for name in json_files}
for txt_file in txt_files:
    if os.path.splitext(txt_file)[0] not in json_stems:
        print("-" * 40)
        print(f"No matching .json file for {txt_file}")

----------------------------------------
File match 0.txt: True
----------------------------------------
File match 1.txt: True
----------------------------------------
File match 2.txt: True
----------------------------------------
File match 3.txt: True
----------------------------------------
File match 4.txt: True
----------------------------------------
File match 5.txt: False
----------------------------------------
File match 6.txt: True
----------------------------------------
File match 7.txt: True
----------------------------------------
File match 8.txt: True


In [19]:
# For every predicted text, compare it against every ground-truth text and
# report the closest one (smallest Levenshtein distance) plus the full table.
# Relies on `gt` and `pred` (directory paths) defined in an earlier cell.

# Read each ground-truth file once up front; the set of files does not depend
# on the prediction being scored, so re-reading it per prediction is wasted I/O.
# sorted(): makes both the table order and the tie-break below deterministic.
gt_texts = []
for gt_filename in sorted(os.listdir(gt)):
    if gt_filename.endswith(".txt"):
        with open(os.path.join(gt, gt_filename), encoding="utf-8") as f_gt:
            gt_texts.append((gt_filename, f_gt.read()))

for pred_filename in sorted(os.listdir(pred)):
    if not pred_filename.endswith(".txt"):
        continue

    with open(os.path.join(pred, pred_filename), encoding="utf-8") as f_pred:
        pred_text = f_pred.read()

    all_distances = []
    best_match = None
    best_distance = float('inf')
    best_similarity = 0.0

    for gt_filename, gt_text in gt_texts:
        distance = Levenshtein.distance(pred_text, gt_text)
        similarity = Levenshtein.ratio(pred_text, gt_text)
        all_distances.append((gt_filename, distance, similarity))
        # Strict '<' keeps the first candidate on ties, i.e. the
        # lexicographically smallest filename given the sorted order.
        if distance < best_distance:
            best_distance = distance
            best_similarity = similarity
            best_match = gt_filename

    print("-" * 40)
    print(f"Best match for {pred_filename} is {best_match} with distance {best_distance} and similarity {best_similarity:.2f}")
    print("All distances and similarities:")
    for gt_filename, distance, similarity in all_distances:
        print(f"{gt_filename}: Distance = {distance}, Similarity = {similarity:.2f}")



----------------------------------------
Best match for 0.txt is 0.txt with distance 57 and similarity 0.82
All distances and similarities:
0.txt: Distance = 57, Similarity = 0.82
1.txt: Distance = 57, Similarity = 0.82
2.txt: Distance = 168, Similarity = 0.46
3.txt: Distance = 208, Similarity = 0.42
4.txt: Distance = 190, Similarity = 0.47
5.txt: Distance = 136, Similarity = 0.48
6.txt: Distance = 190, Similarity = 0.47
7.txt: Distance = 198, Similarity = 0.43
8.txt: Distance = 220, Similarity = 0.44
----------------------------------------
Best match for 1.txt is 0.txt with distance 85 and similarity 0.68
All distances and similarities:
0.txt: Distance = 85, Similarity = 0.68
1.txt: Distance = 85, Similarity = 0.68
2.txt: Distance = 185, Similarity = 0.38
3.txt: Distance = 209, Similarity = 0.36
4.txt: Distance = 211, Similarity = 0.35
5.txt: Distance = 124, Similarity = 0.35
6.txt: Distance = 214, Similarity = 0.35
7.txt: Distance = 196, Similarity = 0.35
8.txt: Distance = 236, Simi