In [None]:
import os
from pymongo import UpdateOne
from tqdm import tqdm
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import re
# Connect to the MongoDB server on localhost:29012 and bind the collections this notebook reads.
client = MongoClient("localhost", 29012)
db = client["test-database"]
# Queried further down for DE documents and their Title / Applicant / Inventor / date fields.
collection_txt = db["collection-txt2"]
# Queried by get_text() for the per-page 'OCR' text of a publication.
collection_CNN = db["CNN_DE"]

def get_text(publication_number):
    """Collect the OCR text of all pages stored for one publication.

    The compact number is first rewritten to the dotted form used as the
    lookup key: first two characters, middle part, last character
    (e.g. "DE2C" becomes "DE.2.C").

    Returns:
        None when no page matches, otherwise a dict with the keys
        "Publication Number", "pages" (list of page values) and "OCR"
        (the page texts joined with single spaces).
    """
    prefix = publication_number[:2]
    serial = publication_number[2:-1]
    suffix = publication_number[-1:]
    publication_number = f"{prefix}.{serial}.{suffix}"

    # Keep only the pages of this publication, then fold them into one record.
    match_stage = {"$match": {"Publication Number": publication_number}}
    group_stage = {
        "$group": {
            "_id": "$Publication Number",
            "pages": {"$push": "$page"},
            "text": {"$push": "$OCR"},
        }
    }
    # NOTE(review): there is no $sort stage, so the page order inside the
    # pushed arrays is whatever order the server returns the documents in.
    result_db = list(collection_CNN.aggregate([match_stage, group_stage]))
    if not result_db:
        return None

    grouped = result_db[0]
    return {
        "Publication Number": grouped["_id"],
        "pages": grouped["pages"],
        "OCR": ' '.join(grouped["text"]),
    }

def generate_query(item):
    """Return the record's 'OCR' text with newlines turned into spaces and outer whitespace removed."""
    single_line = ' '.join(item['OCR'].split('\n'))
    return single_line.strip()

# Smoke test of the lookup: "DE2C" is rewritten to "DE.2.C"; prints None when no page matches.
print(get_text("DE2C"))

## Train spaCy

In [None]:
def overlap(first, second):
    """Tell whether two (start, end) ranges overlap.

    When ``first`` starts earlier, it overlaps if it ends after ``second``
    starts; otherwise it overlaps if it starts before ``second`` ends.
    """
    (a_start, a_end), (b_start, b_end) = first, second
    return a_end > b_start if a_start < b_start else a_start < b_end
def is_overlapping(first, second):
    """Return True when the (start, end) ranges share at least one position.

    The ranges intersect exactly when the later of the two starts lies
    strictly before the earlier of the two ends.
    """
    (a_start, a_end), (b_start, b_end) = first, second
    latest_start = max(a_start, b_start)
    earliest_end = min(a_end, b_end)
    return latest_start < earliest_end


In [None]:
def custom_ratio(s1, s2, *, processor=None, score_cutoff=None):
    """Token-sort ratio of ``s1`` and ``s2`` with a penalty for differing lengths.

    0.1 is subtracted from the rapidfuzz score for every character of length
    difference between the two raw strings; the result never drops below 0.
    The length difference is printed on each call.
    """
    base_score = fuzz.token_sort_ratio(s1, s2, processor=processor, score_cutoff=score_cutoff)
    difference = abs(len(s1) - len(s2))
    print(f"Length difference: {difference}")
    penalised = base_score - (difference * 0.1)
    if penalised > 0:
        return penalised
    return 0

In [None]:
def reduce_string(string, name, start):
    """Shrink a candidate window to the run of tokens that best matches ``name``.

    Non-alphanumeric characters are stripped from both ends, then leading
    tokens and afterwards trailing tokens are dropped greedily for as long as
    the token-sort ratio against ``name`` does not decrease.

    Args:
        string: Candidate window cut out of the searched text.
        name: Reference string the window is compared with.
        start: Offset of ``string`` inside the searched text.

    Returns:
        ``(reduced, new_start)``: the kept tokens joined by single spaces and
        the offset of the first kept token in the searched text. When the
        window holds no alphanumeric character, ``('', start)`` is returned
        with ``start`` advanced past the stripped characters.
    """
    # Strip leading junk, moving the offset along with it. Guarding on
    # `string` avoids an IndexError for empty / punctuation-only windows.
    while string and not string[0].isalnum():
        string = string[1:]
        start += 1
    if not string:
        return '', start
    # Safe without a guard: string[0] is alphanumeric at this point.
    while not string[-1].isalnum():
        string = string[:-1]

    tokens = string.split()
    # Real character offset of every token inside `string`, so the returned
    # start stays correct even when tokens are separated by several spaces.
    offsets = []
    cursor = 0
    for token in tokens:
        cursor = string.index(token, cursor)
        offsets.append(cursor)
        cursor += len(token)

    max_score = fuzz.token_sort_ratio(string, name)

    # Drop leading tokens while the score does not get worse.
    i = 0
    while i < len(tokens):
        candidate = fuzz.token_sort_ratio(' '.join(tokens[i:]), name)
        if candidate < max_score:
            break
        max_score = candidate
        i += 1
    # The loop stops one past the last acceptable index; step back to it.
    if i > 0:
        i -= 1
    # No extra "+1" here: when nothing is dropped (i == 0) the offset is unchanged.
    start += offsets[i]

    # Same procedure from the right-hand side.
    j = len(tokens)
    while j > 0:
        candidate = fuzz.token_sort_ratio(' '.join(tokens[i:j]), name)
        if candidate < max_score:
            break
        max_score = candidate
        j -= 1
    if j < len(tokens):
        j += 1

    return ' '.join(tokens[i:j]), start

In [None]:
from rapidfuzz import process, fuzz, utils
import re
def find_fuzzy_matches(text, name_list, label, entities, threshold=70):
    """Locate fuzzy occurrences of each name in ``text`` and record them as entities.

    For every name, a window of ``len(name) + 10`` characters is taken at each
    character offset of the lower-cased text and the 20 best windows (by
    rapidfuzz token-sort ratio) are kept. Each window is trimmed with
    ``reduce_string``, penalised for length difference, filtered by
    ``threshold``, snapped towards word boundaries and finally re-scored
    against the original-case text.

    Args:
        text: Text to search.
        name_list: Iterable of strings to look for.
        label: Label stored with every entity found by this call.
        entities: List of ``(start, end, label, score)`` tuples collected so
            far. It is modified in place.
        threshold: Minimum penalised window score for a candidate to be considered.

    Returns:
        The same ``entities`` list that was passed in.
    """
    text_lower = text.lower()
    # NOTE(review): words_in_text is never read in this function.
    words_in_text = text.split()
    for name in name_list:
        name_lower = name.lower()

        # Use fuzzy matching to find the closest substring(s):
        # one candidate window per character offset of the text.
        matches = process.extract(
            name_lower,
            [text_lower[i:i+len(name_lower)+10] for i in range(len(text_lower))],
            scorer=fuzz.token_sort_ratio,
            processor=utils.default_process,
            limit=20
        )
        matches_updated = []
        for match in matches:
            # match[0] is the window, match[1] its score; match[2] is passed on
            # as the window's start offset in the text.
            new_match, start = reduce_string(match[0], name_lower, match[2])
            # Penalise candidates whose length differs from the name (0.2 per character).
            difference = abs(len(name) - len(new_match))
            ratio = match[1] - (difference * 0.2)
            matches_updated.append((new_match, ratio, start))
        matches = matches_updated
        for match_str, score, match_start in matches:
            if score < threshold:
                continue
            actual_end = match_start + len(match_str)
            actual_start = match_start
            # Adjust the start: step right over non-alphanumeric characters,
            # step left while the preceding character is alphanumeric.
            while actual_start > 0 and actual_start<actual_end and actual_start < len(text_lower):
                if not text[actual_start].isalnum():
                    actual_start+=1
                elif text[actual_start-1].isalnum():
                    actual_start -= 1
                elif text[actual_start].isalnum() and not text[actual_start-1].isalnum():
                    break
                else:
                    break
            # Adjust the end: step left over non-alphanumeric characters,
            # step right while the following character is alphanumeric.
            while actual_end > 0 and actual_end>actual_start and actual_end < len(text_lower):
                if not text[actual_end].isalnum():
                    actual_end-=1
                elif actual_end+1 < len(text_lower) and text[actual_end+1].isalnum():
                    actual_end += 1
                elif text[actual_end].isalnum() and actual_end+1 < len(text_lower) and not text[actual_end+1].isalnum():
                    actual_end += 1
                    break
                else:
                    actual_end += 1
                    break
            # Re-score the adjusted span against the original-case name.
            # NOTE(review): this new score is not compared with `threshold` again.
            score = fuzz.token_sort_ratio(text[actual_start:actual_end], name)
            # Prevent overlap: the candidate must beat every entity it overlaps
            # (comparison is by score, whatever the labels or lengths are).
            should_add = True  # NOTE(review): assigned but never read
            overlapping = []
            actual_pos = (actual_start, actual_end)
            for already_found in entities:
                already_found_pos = (already_found[0], already_found[1])
                if is_overlapping(already_found_pos, actual_pos):
                    overlapping.append(already_found)
            # With no overlap, all() over the empty list is True and the candidate is added.
            if all(score > o[3] for o in overlapping):
                # Remove all overlapping lower-score entities
                for o in overlapping:
                    entities.remove(o)
                entities.append((actual_start, actual_end, label, score))
            else:
                # Do not add the new one
                pass
    return entities

In [None]:
# English month names (as produced by strftime('%B')) mapped to their German spelling.
month_translation = dict(
    January="Januar",
    February="Februar",
    March="März",
    April="April",
    May="Mai",
    June="Juni",
    July="Juli",
    August="August",
    September="September",
    October="Oktober",
    November="November",
    December="Dezember",
)

In [None]:
def date_variations(date):
    """List the textual spellings under which ``date`` may appear in a document.

    Args:
        date: A ``datetime`` (anything with ``strftime``) or a string in
            ``YYYY-MM-DD`` form.

    Returns:
        A list of unique strings: first the date rendered with each format
        below, in that order, followed by copies in which an English month
        name from ``month_translation`` is replaced by its German spelling.

    Raises:
        ValueError: if a string argument is not in ``YYYY-MM-DD`` form.
    """
    if isinstance(date, str):
        date = datetime.strptime(date, '%Y-%m-%d')

    formats = (
        '%Y-%m-%d', '%d-%m-%Y', '%m-%d-%Y',
        '%d %B %Y', '%B %d, %Y', '%B %d %Y',
        '%d %b %Y', '%b %d, %Y', '%b %d %Y',
        '%d %B %y', '%B %d, %y', '%B %d %y',
        '%d %b %y', '%b %d, %y', '%b %d %y',
        '%d %m %Y', '%m %d %Y', '%d %m %y', '%m %d %y',
    )

    variations = []
    seen = set()

    def _add(candidate):
        # Keep first occurrences only, preserving insertion order.
        if candidate not in seen:
            seen.add(candidate)
            variations.append(candidate)

    for fmt in formats:
        _add(date.strftime(fmt))

    # Iterate over a snapshot: appending to the list being iterated never
    # terminates for months spelled the same in both languages (April,
    # August, September, November), because the unchanged string is
    # re-appended and then visited again.
    for english in list(variations):
        for month, german in month_translation.items():
            if month in english:
                _add(english.replace(month, german))
    return variations

In [None]:
from tqdm import tqdm
import random
# Build fuzzy-labelled training examples from a random sample of DE documents.
# Only documents carrying every metadata field used as a label source are considered.
query = {'Country': "DE", 'Title': {'$exists': True}, 'C_Application Date': {'$exists': True}, 'C_Publication Date': {'$exists': True}, 'Applicant': {'$exists': True}, 'Inventor': {'$exists': True}}
training_data = []
total_count = collection_txt.count_documents(query)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample at the number of matching documents.
random_indexes = random.sample(range(total_count), min(3000, total_count))
count_skipped = 0
for index in tqdm(random_indexes):
    item = collection_txt.find(query).skip(index).limit(1)[0]
    find_ocr = get_text(item['Publication Number'])
    if find_ocr is None:
        # No OCR pages stored for this publication: nothing to label.
        print(f"Publication Number {item['Publication Number']} not found")
        count_skipped += 1
        continue
    text = generate_query(find_ocr)
    entities = []
    if 'Applicant' in item and item['Applicant'] is not None and item['Applicant'] != []:
        entities = find_fuzzy_matches(text, item['Applicant'], "APPLICANT", entities)
    # NOTE(review): inventors are labelled "APPLICANT" as well. The prediction
    # cell only knows the APPLICANT label, so this may be deliberate — confirm.
    if 'Inventor' in item and item['Inventor'] is not None and item['Inventor'] != []:
        entities = find_fuzzy_matches(text, item['Inventor'], "APPLICANT", entities)
    if item['Title'] is not None:
        entities = find_fuzzy_matches(text, [item['Title']], "TITLE", entities)
    if item['C_Application Date'] is not None:
        application_date = date_variations(item['C_Application Date'])
        entities = find_fuzzy_matches(text, application_date, "APPLICATION_DATE", entities)
    if item['C_Publication Date'] is not None:
        publication_date = date_variations(item['C_Publication Date'])
        entities = find_fuzzy_matches(text, publication_date, "PUBLICATION_DATE", entities)
    training_data.append((text, {'entities': entities}))
print(f"Skipped {count_skipped} items")

In [None]:
# Sanity check: report every ordered pair of distinct entities in one example
# that carry the same label and whose character ranges overlap.
for element in training_data:
    entities = element[1]['entities']
    for item1 in entities:
        for item2 in entities:
            if item1 == item2:
                continue
            if item1[2] != item2[2]:
                continue
            if not is_overlapping((item1[0], item1[1]), (item2[0], item2[1])):
                continue
            print(f"Found overlapping entities: {item1} and {item2} with text {element[0][item1[0]:item1[1]]} and {element[0][item2[0]:item2[1]]}")

In [None]:
import random

# Shuffle the examples in place, then split them 70 % / 30 % into train and dev.
random.shuffle(training_data)
split_at = int(len(training_data) * 0.7)
train_data, dev_data = training_data[:split_at], training_data[split_at:]

In [None]:
import spacy
from tqdm import tqdm
from spacy.tokens import DocBin

def _write_docbin(nlp, samples, output_path):
    """Turn ``(text, {'entities': [...]})`` samples into spaCy Docs and save them.

    Each entity is a ``(start, end, label, score)`` tuple. Entities whose
    character range ``doc.char_span`` cannot turn into a span are reported and
    skipped. Texts left without any entity are not written. A failure on one
    sample is printed and the remaining samples are still processed.

    Returns:
        The DocBin that was written to ``output_path``.
    """
    doc_bin = DocBin()
    for text, annot in tqdm(samples):
        try:
            doc = nlp.make_doc(text)
            ents = []
            for start, end, label, _score in annot["entities"]:
                span = doc.char_span(start, end, label=label)
                if span is None:
                    # Clamp the left context at 0: a negative slice start would
                    # count from the end of the text and print the wrong context.
                    print(f"Skipping: {text[max(0, start-10):start]}${text[start:end]}${text[end:end+10]} — invalid span")
                else:
                    ents.append(span)
            if len(ents) != 0:
                doc.ents = ents
                doc_bin.add(doc)
        except Exception as e:
            print(f"Error processing text {e}")
    doc_bin.to_disk(output_path)
    return doc_bin


# Blank German pipeline, used here only to turn raw text into Doc objects.
nlp = spacy.blank("de")

# The containers are deliberately not named `db`, so the MongoDB database
# handle bound to that name at the top of the notebook stays usable.
_write_docbin(nlp, train_data, "./train_de2.spacy")
_write_docbin(nlp, dev_data, "./dev_de2.spacy")

Train the model from the command line

In [None]:
# Write the base NER config under the file name the training command reads (config_de.cfg).
!python -m spacy init config config_de.cfg --lang de --pipeline ner


In [None]:
# --gpu-id expects one integer device id (-1 for CPU); "0,1!" is not a valid value.
!python -m spacy train config_de.cfg --output ./output_de2 --paths.train ./train_de2.spacy --paths.dev ./dev_de2.spacy --gpu-id 0

In [None]:
import spacy
import json
import random
from rapidfuzz import process, fuzz, utils
# Run the trained NER model on a random sample of DE documents and store one
# JSON file with the predicted metadata per document.
predict_dir = "/scratch/students/ndillenb/metadata/processing/llm/json_compare/spacy_de2_json_compare/"
# Make sure the output directory exists before the first file is written.
os.makedirs(predict_dir, exist_ok=True)
nlp = spacy.load("./output_de2/model-best")
query = {'Country': "DE"}
total_count = collection_txt.count_documents(query)
# Cap the sample size: random.sample raises ValueError if fewer than 500 documents match.
random_indexes = random.sample(range(total_count), min(500, total_count))
for index in tqdm(random_indexes):
    item = collection_txt.find(query).skip(index).limit(1)[0]
    find_ocr = get_text(item['Publication Number'])
    if find_ocr is None:
        print(f"Publication Number {item['Publication Number']} not found")
        continue
    text = generate_query(find_ocr)
    doc = nlp(text)
    predicted = {
        "Title":None,
        "Application_Date":None,
        "Publication_Date":None,
        "Applicants":[],
    }
    for ent in doc.ents:
        if ent.label_ == "TITLE":
            # Fill the pre-declared "Title" key; writing to "title" left
            # "Title" at None and added a second, differently-cased key.
            predicted["Title"] = ent.text
        elif ent.label_ == "APPLICATION_DATE":
            predicted["Application_Date"] = ent.text
        elif ent.label_ == "PUBLICATION_DATE":
            predicted["Publication_Date"] = ent.text
        elif ent.label_ == "APPLICANT":
            # Skip names that are near-duplicates (token-sort ratio >= 70) of one already kept.
            if not any(fuzz.token_sort_ratio(ent.text.lower(), name.lower()) >= 70 for name in predicted["Applicants"]):
                predicted["Applicants"].append(ent.text)
        print(f"Entity: {ent.text}, Label: {ent.label_}")
    with open(os.path.join(predict_dir, f"json_llm_{item['_id']}.json"), 'w') as f:
        json.dump({'predicted':predicted}, f)
    print('---')