In [None]:
import datetime
from collections import Counter
from typing import List, Dict, Tuple
from rapidfuzz import fuzz
import re
from pandas.tseries.api import guess_datetime_format
log = False

# Punctuation, quote and bracket characters stripped from names before matching.
unwanted_chars = r'[,.;:\-()\[\]{}\'\"`´]'


def clean_names(name):
    """Normalise a name string for fuzzy comparison.

    The steps are applied in order: drop any ``[...]`` group together with
    its content, drop every character matched by ``unwanted_chars``,
    collapse whitespace runs to a single space, trim, and lower-case.

    Returns the cleaned string. If cleaning raises (for example because
    ``name`` is not a string) the error is printed and ``None`` is returned.
    """
    substitutions = (
        (r'\[.*?\]', ''),      # bracketed groups such as "[DE]"
        (unwanted_chars, ''),  # stray punctuation
        (r'\s+', ' '),         # runs of whitespace
    )
    try:
        cleaned = name
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip().lower()
    except Exception as e:
        print(f"Error cleaning name: {name}. Error: {e}")
    return None
        

def _grade_date(field: str, expected_date, actual_text) -> Tuple[str, float]:
    """Grade one predicted date string against the expected date.

    Args:
        field: Result key, "Application_Date" or "Publication_Date"; used to
            build the verdict and error texts.
        expected_date: Reference date (``date`` or ``datetime``) or ``None``.
        actual_text: Predicted date string or ``None``.

    Returns:
        ``(verdict, points)`` where points is 1 for an exact match (or when
        both sides have no date), 0.5 when only the year matches, else 0.
    """
    # Imported locally under a private alias: at module level the name
    # ``datetime`` may be bound to the module (``import datetime``) rather
    # than to the class, which made ``datetime.strptime`` unreliable.
    from datetime import datetime as _datetime

    invalid = f"Invalid {field.replace('_', ' ')}"

    # A prediction whose format cannot be guessed counts as "no date".
    date_format = guess_datetime_format(actual_text) if isinstance(actual_text, str) else None
    if date_format is None:
        if expected_date is None:
            # NOTE: this label is used for both date fields; it is kept
            # unchanged so that previously aggregated results stay comparable.
            return "Application correctly unidentified", 1
        return invalid, 0
    if expected_date is None:
        return invalid, 0

    try:
        actual_date = _datetime.strptime(actual_text, date_format).date()
    except ValueError as e:
        # The guessed format did not actually parse the string.
        print(f"Invalid {field} in actual: {actual_text}. Error: {e}")
        return invalid, 0

    # A ``datetime`` never compares equal to a ``date``; reduce it to its
    # calendar date so exact matches are recognised.
    if isinstance(expected_date, _datetime):
        expected_date = expected_date.date()

    if expected_date == actual_date:
        return "DateCorrect", 1
    if expected_date.year == actual_date.year:
        return "YearCorrect", 0.5
    return f"Off by {abs(expected_date.year - actual_date.year)} Year", 0


def _grade_names(expected_names, actual_names, similarity, in_other_field,
                 correct_label, empty_label, other_field_label):
    """Grade a predicted name list against the expected one.

    An expected name counts as matched when its best ``similarity`` score
    against any predicted name is at least 80.

    Returns:
        ``(verdict, score)``: score is 1 when nothing is expected or when all
        expected names match and both lists have the same length; otherwise
        the fraction of expected names matched. ``in_other_field`` is only
        called when nothing matched and only selects the verdict text.
    """
    if not expected_names:
        return empty_label, 1

    matched = sum(
        1
        for name in expected_names
        if max((similarity(name, candidate) for candidate in actual_names), default=0) >= 80
    )
    if matched == len(expected_names) == len(actual_names):
        return correct_label, 1.0
    if matched > 0:
        return "At least one correct", matched / len(expected_names)
    if in_other_field():
        return other_field_label, 0
    return "No Match", 0


def compare_json(expected: Dict, actual: Dict, total_score) -> Tuple[Dict, Dict]:
    """Compare one predicted metadata record against its reference record.

    Args:
        expected: Reference record with keys "Title", "Application_Date",
            "Publication_Date", "Inventors" and "Applicants".
        actual: Predicted record with the same keys; a list is accepted and
            its first element is used. Missing keys are treated as ``None``
            (or as an empty list for the name fields). Not modified.
        total_score: Running per-field score dict; updated in place.

    Returns:
        ``(comparison, total_score)`` where ``comparison`` maps each field to
        a verdict ("Title_Correct" holds a bool when both titles exist).
    """
    comparison = {}

    if isinstance(actual, list):  # fix for when actual is a list
        actual = actual[0] if actual else {}

    # --- Title: fuzzy match, 1 point when both agree or both are missing ---
    expected_title = expected["Title"]
    actual_title = actual.get("Title")
    if expected_title is None and actual_title is None:
        total_score["Title"] += 1
        comparison["Title"] = "Title correctly unidentified"
    elif expected_title is None or actual_title is None:
        comparison["Title"] = "Invalid Title"
    else:
        if log:
            print(f"Expected Title: {expected_title}")
            print(f"Actual Title: {actual_title}")
        comparison["Title_Correct"] = (
            fuzz.token_set_ratio(expected_title.lower(), actual_title.lower()) >= 80
        )
        if comparison["Title_Correct"]:
            total_score["Title"] += 1

    # --- Dates: both fields follow the same grading rules ---
    for field in ("Application_Date", "Publication_Date"):
        verdict, points = _grade_date(field, expected[field], actual.get(field))
        comparison[field] = verdict
        total_score[field] += points
        if log:
            print(f"{field}: expected {expected[field]}, got {actual.get(field)} -> {verdict}")

    # --- Inventors: None entries dropped, names cleaned, token-set match ---
    inventors_expected = [clean_names(inv) for inv in expected["Inventors"] if inv is not None]
    inventors_actual = [
        clean_names(inv) for inv in (actual.get("Inventors") or []) if inv is not None
    ]
    comparison["Inventors"], local_inv_score = _grade_names(
        inventors_expected,
        inventors_actual,
        similarity=fuzz.token_set_ratio,
        in_other_field=lambda: any(
            fuzz.token_set_ratio(inv, appl) > 80
            for inv in inventors_actual
            for appl in expected["Applicants"]
        ),
        correct_label="Inventor correct",
        empty_label="No Inventors",
        other_field_label="Inventor not in inventors but in Applicants",
    )
    total_score["Inventors"] += local_inv_score

    # --- Applicants: near-duplicates in the expected list are collapsed
    # first (token-set ratio >= 60) so one entity is not counted twice ---
    applicants_expected = []
    for app in [clean_names(app) for app in expected["Applicants"]]:
        if not any(fuzz.token_set_ratio(app, kept) >= 60 for kept in applicants_expected):
            applicants_expected.append(app)
    applicants_actual = [clean_names(app) for app in (actual.get("Applicants") or [])]
    if log:
        print(f"Expected: {applicants_expected}")
        print(f"Predicted: {applicants_actual}")

    comparison["Applicants"], local_app_score = _grade_names(
        applicants_expected,
        applicants_actual,
        # Applicants also accept a partial (substring-like) match.
        similarity=lambda a, b: max(fuzz.token_set_ratio(a, b), fuzz.partial_ratio(a, b)),
        in_other_field=lambda: any(
            fuzz.ratio(appl, inv) > 80
            for appl in applicants_actual
            for inv in expected["Inventors"]
        ),
        correct_label="Applicant correct",
        empty_label="No Applicants",
        other_field_label="Applicant not in applicants but in Inventors",
    )
    total_score["Applicants"] += local_app_score
    if log:
        print(f"Applicant score: {local_app_score}")

    return comparison, total_score


def aggregate_comparisons(expected_actual_pairs: List[Tuple[Dict, Dict]], total_score):
    """Run compare_json over every (expected, actual) pair and tally verdicts.

    Returns a tuple of a plain dict mapping ``(field, verdict)`` to the
    number of times it occurred, and the score object handed back by the
    last compare_json call (the input ``total_score`` if there are no pairs).
    """
    tally = Counter()
    for truth, prediction in expected_actual_pairs:
        verdicts, total_score = compare_json(truth, prediction, total_score)
        # Each (field, verdict) item is counted once per record.
        tally.update(verdicts.items())
    return dict(tally), total_score


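Metric (score added per document and field):
Title: 1 if the titles match (or both are missing), 0 otherwise
Application Date: 1 if the date is correct (or both are missing), 0.5 if only the year is correct, 0 otherwise
Publication Date: 1 if the date is correct (or both are missing), 0.5 if only the year is correct, 0 otherwise
Inventors: 1 if all match (or none are expected); if only some match, the fraction of expected inventors matched; 0 otherwise
Applicants: 1 if all match (or none are expected); if only some match, the fraction of expected applicants matched; 0 otherwise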

In [None]:
from pymongo import MongoClient
from bson import ObjectId
from datetime import datetime
# MongoDB client for the server at localhost, port 29012.
client = MongoClient("localhost", 29012)
# Database and collection that gen_truth reads the reference records from.
db = client["test-database"]
collection_json = db["collection-txt2"]
def gen_truth(id):
    """Generate the expected JSON record from the MongoDB collection.

    Args:
        id: String form of the document's ``_id``; converted with ``ObjectId``.

    Returns:
        Dict with keys "Title", "Application_Date", "Publication_Date",
        "Applicants" and "Inventors". Dates are ``date`` objects (or ``None``).
        "Applicants" holds the stored applicants and inventors merged and
        de-duplicated; "Inventors" is always left empty.

    Raises:
        LookupError: If no document with this ``_id`` exists.
    """
    # Imported locally so the function does not depend on whether the global
    # name ``datetime`` currently refers to the module or to the class.
    from datetime import datetime as _datetime

    projection = {
        "_id": 0,
        "Title": 1,
        "C_Application Date": 1,
        "C_Publication Date": 1,
        "Applicant": 1,
        "Inventor": 1,
    }
    item = collection_json.find_one({"_id": ObjectId(id)}, projection)
    print(item)
    if item is None:
        # find_one returns None when nothing matches; fail with a clear message.
        raise LookupError(f"No document with _id {id} found in the collection")

    def _as_date(value):
        """Reduce an ISO date string or a datetime to a plain date."""
        if isinstance(value, str):
            return _datetime.strptime(value, '%Y-%m-%d').date()
        if isinstance(value, _datetime):
            # A datetime never compares equal to the date parsed from a
            # prediction, so keep only the calendar date.
            return value.date()
        return value

    # Applicants and inventors are merged into one list of names.
    names = []
    for key in ("Applicant", "Inventor"):
        names.extend(item.get(key) or [])

    # Drop near-duplicates (token-sort ratio above 70), keeping the first
    # occurrence of each name.
    unique_named = []
    for name in names:
        if not any(fuzz.token_sort_ratio(name, existing_name) > 70 for existing_name in unique_named):
            unique_named.append(name)

    return {
        "Title": item.get("Title"),
        "Application_Date": _as_date(item.get("C_Application Date")),
        "Publication_Date": _as_date(item.get("C_Publication Date")),
        "Applicants": unique_named,
        "Inventors": [],
    }

In [None]:
# Using global truth
import os
import json
import json_repair
import dateparser
# (expected, actual) pairs collected by the directory walk below.
json_data = []
model= 'spacy_de' #select the model you want to use, e.g., 'spacy_de', 'spacy_en', 'spacy_fr', 'gemma'...
# '/' in a model name is replaced with '-'; the name is used to build the
# directory path walked below.
model = model.replace('/', '-')
# Running score per metadata field; passed to aggregate_comparisons.
total_score = {
    "Title": 0,
    "Application_Date": 0,
    "Publication_Date": 0,
    "Inventors": 0,
    "Applicants": 0
}

def parse_date_with_language(date_str, language):
    """Parse ``date_str`` with dateparser, restricted to a single language.

    ``None`` input returns ``None`` immediately, before the language is
    checked. Otherwise ``language`` must be 'fr', 'de' or 'en', else a
    ValueError is raised. Any exception raised while parsing is printed and
    ``None`` is returned; on success the value from ``dateparser.parse`` is
    returned unchanged.
    """
    if date_str is None:
        return None

    supported_languages = ['fr', 'de', 'en']
    if language not in supported_languages:
        raise ValueError(f"Unsupported language: '{language}'. Use one of {supported_languages}.")

    try:
        return dateparser.parse(date_str, languages=[language])
    except Exception as e:
        print(f"Error parsing date: {e}")
    return None


# Walk the comparison directory of the selected model, turn every usable
# prediction file into an (expected, actual) pair, then score all pairs.

# Initialised before the walk so the counter covers every directory and is
# defined even when the directory does not exist.
valid = 0
for dirpath, dirnames, filenames in os.walk(f'/scratch/students/ndillenb/metadata/processing/llm/json_compare/{model}_json_compare'):
    for filename in filenames:
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(dirpath, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            # json_repair.loads(file.read()) can be used here instead if the
            # files may contain invalid JSON.
            data = json.load(file)

        # Decode the prediction before touching any of its fields.
        predicted = data['predicted']
        if predicted is None:
            continue
        if isinstance(predicted, str) and predicted.startswith('{'):
            actual = json.loads(predicted)
        elif isinstance(predicted, dict):
            actual = predicted
        else:
            print(f"Unexpected format in predicted data: {predicted}")
            continue
        print(actual.get('Publication_Date'))

        if 'spacy' in model.lower():
            # spaCy predictions carry free-text dates; normalise them to ISO
            # strings. NOTE(review): the language is fixed to 'de' for every
            # spaCy model - confirm this is intended for 'spacy_en'/'spacy_fr'.
            for field in ('Application_Date', 'Publication_Date'):
                parsed = parse_date_with_language(actual.get(field), 'de')
                actual[field] = parsed.strftime('%Y-%m-%d') if parsed is not None else None

        if 'title' in actual:
            actual['Title'] = actual.pop('title')

        # All predicted entities are merged into "Applicants", mirroring the
        # merged name list that gen_truth produces for the reference record.
        names = []
        for key in ('KeyEntity', 'Applicants', 'Inventors'):
            names.extend(actual.get(key) or [])
        unique_named = []
        for name in names:
            if not any(fuzz.token_sort_ratio(name, existing_name) > 70 for existing_name in unique_named):
                unique_named.append(name)
        actual["Applicants"] = unique_named
        actual["Inventors"] = []

        # The document id is the third '_'-separated part of the file name,
        # without its extension.
        doc_id = filename.split('_')[2].split('.')[0]
        expected = gen_truth(doc_id)
        json_data.append((expected, actual))
        valid += 1
print(f"Valid: {valid}")
aggregate_comparisons(json_data, total_score)
