In [None]:
import os
from pymongo import UpdateOne
from tqdm import tqdm
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import re
# Handles for the local MongoDB instance; `collection_json` is the collection
# queried by the cells below.
client = MongoClient(host="localhost", port=29012)
db = client["test-database"]
collection_json = db["collection-json"]
def generate_query(item):
    """Return the record's OCR text flattened onto a single line.

    Every newline in ``item['OCR']`` becomes one space, and the result is
    stripped of leading and trailing whitespace.
    """
    single_line = ' '.join(item['OCR'].split('\n'))
    return single_line.strip()


## Train spaCy

In [None]:
def overlap(first, second):
    """Tell whether two ``(start, end)`` ranges overlap.

    If ``first`` starts strictly before ``second``, the ranges overlap when
    ``first`` ends after ``second`` starts; otherwise they overlap when
    ``first`` starts before ``second`` ends.
    """
    first_start, first_end = first
    second_start, second_end = second
    return first_end > second_start if first_start < second_start else first_start < second_end

def is_overlapping(first, second):
    """Tell whether two ``(start, end)`` ranges share at least one position.

    The ranges overlap when the later of the two starts lies strictly before
    the earlier of the two ends.
    """
    first_start, first_end = first
    second_start, second_end = second
    latest_start = max(first_start, second_start)
    earliest_end = min(first_end, second_end)
    return latest_start < earliest_end


In [None]:
def custom_ratio(s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """Token-sort similarity with a penalty for differing string lengths.

    The score of ``fuzz.token_sort_ratio`` is reduced by 0.1 points for every
    character of difference between ``len(s1)`` and ``len(s2)``.

    Args:
        s1: First string.
        s2: Second string.
        processor: Forwarded unchanged to ``fuzz.token_sort_ratio``.
        score_cutoff: Forwarded to ``fuzz.token_sort_ratio`` and also applied
            to the penalised score, so a result below the cutoff is reported
            as 0.

    Returns:
        The penalised score, or 0 when it is not positive or falls below
        ``score_cutoff``.
    """
    ratio = fuzz.token_sort_ratio(s1, s2, processor=processor, score_cutoff=score_cutoff)
    difference = abs(len(s1) - len(s2))
    penalised = ratio - (difference * 0.1)
    if penalised <= 0:
        return 0
    # The inner scorer only saw the un-penalised score; re-check the cutoff so
    # a score pushed below it by the length penalty is not returned.
    if score_cutoff is not None and penalised < score_cutoff:
        return 0
    return penalised

In [None]:
from rapidfuzz import fuzz
def reduce_string(string, name, start):
    """Trim a matched text window down to the tokens that best match ``name``.

    Leading and trailing non-alphanumeric characters are stripped first. Whole
    tokens are then dropped from the left, and afterwards from the right, for
    as long as ``fuzz.token_sort_ratio`` against ``name`` does not decrease.

    Args:
        string: Candidate window of text.
        name: Name the window is compared against.
        start: Offset of ``string[0]`` inside the text the window was cut from.

    Returns:
        ``(reduced, new_start)``: the kept tokens joined by single spaces, and
        the offset of the first kept token in the enclosing text. A window
        without any alphanumeric character yields ``('', start + len(string))``.
    """
    # Strip non-alphanumeric edges with explicit bounds so an empty or
    # punctuation-only window cannot raise IndexError.
    lead = 0
    while lead < len(string) and not string[lead].isalnum():
        lead += 1
    trail = len(string)
    while trail > lead and not string[trail - 1].isalnum():
        trail -= 1
    string = string[lead:trail]
    start += lead
    if not string:
        return '', start

    tokens = string.split()
    # Record where each token really begins so the returned offset stays right
    # even when tokens are separated by more than one whitespace character.
    offsets = []
    cursor = 0
    for token in tokens:
        cursor = string.index(token, cursor)
        offsets.append(cursor)
        cursor += len(token)

    max_score = fuzz.token_sort_ratio(string, name)

    # Drop leading tokens while the score does not get worse.
    i = 0
    while i < len(tokens):
        candidate_score = fuzz.token_sort_ratio(' '.join(tokens[i:]), name)
        if candidate_score < max_score:
            break
        max_score = candidate_score
        i += 1
    if i > 0:
        i -= 1  # step back to the last suffix that kept the score
    # offsets[0] is 0, so nothing is added when no leading token was dropped.
    start += offsets[i]

    # Drop trailing tokens while the score does not get worse; never go past
    # the first kept token so at least one token is always returned.
    j = len(tokens)
    while j > i:
        candidate_score = fuzz.token_sort_ratio(' '.join(tokens[i:j]), name)
        if candidate_score < max_score:
            break
        max_score = candidate_score
        j -= 1
    if j < len(tokens):
        j += 1  # step back to the last prefix that kept the score

    return ' '.join(tokens[i:j]), start

In [None]:
# Testing window matching with fuzzy
# `process` and `utils` are only imported by a later cell, so import them here;
# otherwise running the notebook top to bottom fails with a NameError.
from rapidfuzz import process, fuzz, utils
import re

text_lower = "The inventor is Capable of many things, Mark James Henry, is the inventor and he has been doing a lot about this issue. Mark Henry is really pationate about this whole thing. You know him? Mark HENRY?"
name = "mark henry"

# Score every window of len(name) + 10 characters against the name and keep
# the 10 best windows.
matches = process.extract(
    name,
    [text_lower[i:i+len(name)+10] for i in range(len(text_lower))],
    scorer=fuzz.token_sort_ratio,
    processor=utils.default_process,
    limit=10
)

# Trim each window to its best-matching tokens and subtract 0.1 points per
# character of length difference to the name.
matches_updated = []
for match in matches:
    print(match)
    new_match, start = reduce_string(match[0], name, match[2])
    difference = abs(len(name) - len(new_match))
    ratio = match[1] - (difference * 0.1)
    matches_updated.append((new_match, ratio, start))
print(matches_updated)

In [None]:
from rapidfuzz import process, fuzz, utils
import re
def find_fuzzy_matches(text, name_list, label, entities, threshold=70):
    """Find fuzzy occurrences of known names in ``text`` and record them as entities.

    For every name, character windows of ``len(name) + 10`` characters taken
    from the lower-cased text are ranked with ``fuzz.token_sort_ratio``. The 20
    best windows are trimmed with ``reduce_string``, penalised by 0.2 points
    per character of length difference to the name, and discarded when the
    result is below ``threshold``. Each surviving span is snapped to word
    boundaries in ``text``, re-scored, and added to ``entities`` only if it
    scores strictly higher than every already-recorded entity it overlaps;
    those lower-scoring entities are removed.

    Args:
        text: Text to search.
        name_list: Iterable of names to look for.
        label: Label stored as the third element of each recorded entity.
        entities: List of ``(start, end, label, score)`` tuples; mutated in place.
        threshold: Minimum penalised window score for a candidate to be kept.

    Returns:
        The same ``entities`` list that was passed in.
    """
    text_lower = text.lower()
    # Bound every index into `text` by the length of `text` itself:
    # str.lower() can lengthen a string for some characters, so
    # len(text_lower) is not a safe bound for indexing `text`.
    text_len = len(text)
    for name in name_list:
        name_lower = name.lower()
        window_width = len(name_lower) + 10

        # Use fuzzy matching to find the closest substring(s)
        matches = process.extract(
            name_lower,
            [text_lower[i:i + window_width] for i in range(len(text_lower))],
            scorer=fuzz.token_sort_ratio,
            processor=utils.default_process,
            limit=20
        )
        candidates = []
        for window, window_score, window_start in matches:
            # A window without any letter or digit cannot contain a name, and
            # trimming it would leave nothing to index into.
            if not any(ch.isalnum() for ch in window):
                continue
            new_match, start = reduce_string(window, name_lower, window_start)
            difference = abs(len(name) - len(new_match))
            candidates.append((new_match, window_score - (difference * 0.2), start))

        for match_str, score, match_start in candidates:
            if score < threshold:
                continue
            actual_end = match_start + len(match_str)
            actual_start = match_start
            # Snap the start to a word boundary: skip forward over
            # non-alphanumerics, walk back while still inside a word.
            while actual_start > 0 and actual_start < actual_end and actual_start < text_len:
                if not text[actual_start].isalnum():
                    actual_start += 1
                elif text[actual_start - 1].isalnum():
                    actual_start -= 1
                else:
                    break
            # Snap the (exclusive) end to just after the last character of a word.
            while actual_end > 0 and actual_end > actual_start and actual_end < text_len:
                if not text[actual_end].isalnum():
                    actual_end -= 1
                elif actual_end + 1 < text_len and text[actual_end + 1].isalnum():
                    actual_end += 1
                else:
                    actual_end += 1
                    break
            # Re-score the snapped span with the same normalisation used for
            # the window search, so a casing difference alone does not make a
            # correct span lose the overlap comparison below.
            score = fuzz.token_sort_ratio(
                text[actual_start:actual_end], name, processor=utils.default_process
            )
            # Prevent overlap: keep the new span only if it beats every
            # already-recorded entity it overlaps.
            actual_pos = (actual_start, actual_end)
            overlapping = [
                already_found
                for already_found in entities
                if is_overlapping((already_found[0], already_found[1]), actual_pos)
            ]
            if all(score > o[3] for o in overlapping):
                # Remove all overlapping lower-score entities
                for o in overlapping:
                    entities.remove(o)
                entities.append((actual_start, actual_end, label, score))
    return entities

In [None]:
def date_variations(date):
    """Render a date in the textual formats it may take in a document.

    Args:
        date: Either a ``'%Y-%m-%d'`` string or an object with ``strftime``
            (e.g. ``datetime``).

    Returns:
        A list of formatted strings in a fixed order, without duplicates.
        Duplicates would otherwise appear whenever two formats render the same
        text, e.g. when day and month are equal.

    Raises:
        ValueError: If ``date`` is a string that is not in ``'%Y-%m-%d'`` format.
    """
    if isinstance(date, str):
        date = datetime.strptime(date, '%Y-%m-%d')
    formats = (
        # numeric, dash-separated
        '%Y-%m-%d', '%d-%m-%Y', '%m-%d-%Y',
        # full / abbreviated month name, 4-digit year
        '%d %B %Y', '%B %d, %Y', '%B %d %Y',
        '%d %b %Y', '%b %d, %Y', '%b %d %Y',
        # full / abbreviated month name, 2-digit year
        '%d %B %y', '%B %d, %y', '%B %d %y',
        '%d %b %y', '%b %d, %y', '%b %d %y',
        # numeric, space-separated
        '%d %m %Y', '%m %d %Y', '%d %m %y', '%m %d %y',
    )
    # dict.fromkeys removes repeated strings while keeping first-seen order.
    return list(dict.fromkeys(date.strftime(fmt) for fmt in formats))

In [None]:
# Peek at one matching record to see how the two date fields are stored and
# which textual variants `date_variations` produces for them.
# find_one fetches a single document instead of loading up to 5000 just to
# look at the first.
item = collection_json.find_one({'Country':"US", 'OCR': {'$exists': True}, 'Title': {'$exists': True}, 'C_Application Date': {'$exists': True}, 'C_Publication Date': {'$exists': True}})
if item is not None:
    for date_field in ('C_Application Date', 'C_Publication Date'):
        print(item[date_field])
        print(type(item[date_field]))
        print(date_variations(item[date_field]))

In [None]:
from tqdm import tqdm
# Build the training pairs: (flattened OCR text, {'entities': [...]}).
# NOTE(review): every entity list is created empty and nothing in this cell
# fills it (e.g. with find_fuzzy_matches) — confirm the labelling step was not
# lost, since the DocBin export skips documents without entities.
training_data = []
matching_records = collection_json.find({'Country':"US", 'OCR': {'$exists': True}, 'Title': {'$exists': True}, 'C_Application Date': {'$exists': True}, 'C_Publication Date': {'$exists': True}}).limit(5000)
for item in tqdm(list(matching_records)):
    text = generate_query(item)
    entities = []
    annotations = {'entities': entities}
    training_data.append((text, annotations))

In [None]:
# Sanity check: print every ordered pair of distinct entities in a document
# that carry the same label and whose character ranges overlap.
for element in training_data:
    entities = element[1]['entities']
    for item1 in entities:
        for item2 in entities:
            if item1 == item2:
                continue
            if item1[2] != item2[2]:
                continue
            if not is_overlapping((item1[0], item1[1]), (item2[0], item2[1])):
                continue
            print(f"Found overlapping entities: {item1} and {item2} with text {element[0][item1[0]:item1[1]]} and {element[0][item2[0]:item2[1]]}")

In [None]:
import random

# Fixed seed so the train/dev split is identical on every run.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.7

# Shuffle a copy with a dedicated generator: re-running this cell then yields
# the same split instead of reshuffling `training_data` again in place.
shuffled_data = list(training_data)
random.Random(SPLIT_SEED).shuffle(shuffled_data)

split_index = int(len(shuffled_data) * TRAIN_FRACTION)
train_data = shuffled_data[:split_index]
dev_data = shuffled_data[split_index:]

In [None]:
import spacy
from tqdm import tqdm
from spacy.tokens import DocBin

def _write_docbin(nlp, examples, output_path):
    """Convert ``(text, {"entities": [...]})`` pairs into a DocBin saved at ``output_path``.

    Each entity is a ``(start, end, label, score)`` tuple of character offsets.
    Entities for which ``doc.char_span`` returns ``None`` are reported and
    skipped; documents left without any entity are not written. Any exception
    raised while processing a document is printed and that document is skipped.

    Args:
        nlp: spaCy pipeline used to tokenise the texts.
        examples: Iterable of ``(text, annotations)`` pairs.
        output_path: Destination file for the serialised DocBin.

    Returns:
        The populated ``DocBin``.
    """
    doc_bin = DocBin()
    for text, annot in tqdm(examples):
        try:
            doc = nlp.make_doc(text)
            ents = []
            for start, end, label, _score in annot["entities"]:
                span = doc.char_span(start, end, label=label)
                if span is None:
                    # Clamp the left context: a negative slice start would
                    # count from the end of the text and show the wrong context.
                    context_start = max(0, start - 10)
                    print(f"Skipping: {text[context_start:start]}${text[start:end]}${text[end:end+10]} — invalid span")
                else:
                    ents.append(span)
            if ents:
                doc.ents = ents
                doc_bin.add(doc)
        except Exception as e:
            print(f"Error processing text {e}")
    doc_bin.to_disk(output_path)
    return doc_bin


nlp = spacy.blank("en")  # start with a blank English model

# The DocBins are deliberately not stored in `db`: that name holds the MongoDB
# database handle created at the top of the notebook.
_write_docbin(nlp, train_data, "./train4.spacy")
_write_docbin(nlp, dev_data, "./dev4.spacy")

Training in the command line

In [None]:
# Generate a spaCy training config file (config.cfg) for language "en" with a pipeline containing only the "ner" component.
!python -m spacy init config config.cfg --lang en --pipeline ner

In [None]:
# Train with config.cfg: reads ./train4.spacy and ./dev4.spacy, writes its output under ./output4, and requests GPU 0 (--gpu-id 0).
!python -m spacy train config.cfg --output ./output4 --paths.train ./train4.spacy --paths.dev ./dev4.spacy --gpu-id 0

Load and execute the model

In [None]:
import spacy
import json
from rapidfuzz import process, fuzz, utils
# Run the trained NER model over up to 1000 records and write one JSON file
# per record with the predicted metadata.
# NOTE(review): both paths below are absolute and machine-specific; consider
# moving them to a configurable base directory.
predict_dir = "/scratch/students/ndillenb/metadata/processing/llm/json_compare/spacy_en_json_compare/"
nlp = spacy.load("/scratch/students/ndillenb/metadata/processing/ocr_extract/spacy_files/output4/model-best")
for item in tqdm(list(collection_json.find({'Country':"US", 'OCR': {'$exists': True}, 'Title': {'$exists': True}, 'C_Application Date': {'$exists': True}, 'C_Publication Date': {'$exists': True}}).limit(1000))):
    text = generate_query(item)
    doc = nlp(text)
    predicted = {
        "Title":None,
        "Application_Date":None,
        "Publication_Date":None,
        "Applicants":[],
    }

    for ent in doc.ents:
        if ent.label_ == "TITLE":
            # Write to the "Title" key declared above; the previous lowercase
            # "title" left "Title" as None and added a stray extra key.
            predicted["Title"] = ent.text
        elif ent.label_ == "APPLICATION_DATE":
            predicted["Application_Date"] = ent.text
        elif ent.label_ == "PUBLICATION_DATE":
            predicted["Publication_Date"] = ent.text
        elif ent.label_ == "APPLICANT":
            # Keep an applicant only if no near-duplicate (score >= 70) is
            # already recorded.
            if not any(fuzz.token_sort_ratio(ent.text.lower(), name.lower()) >= 70 for name in predicted["Applicants"]):
                predicted["Applicants"].append(ent.text)
        print(f"Entity: {ent.text}, Label: {ent.label_}")
    with open(os.path.join(predict_dir, f"json_llm_{item['_id']}.json"), 'w') as f:
        json.dump({'predicted':predicted}, f)
    print('---')

In [None]:
import spacy
from rapidfuzz import process, fuzz, utils
# Check, for up to 100 records, whether each known applicant/inventor name is
# found among the entities predicted by the model.
# Load the model from ./output4, the directory the training command in this
# notebook writes to (the previous ./output path is not produced here).
nlp = spacy.load("./output4/model-best")
for item in tqdm(list(collection_json.find({'Country':"US", 'OCR': {'$exists': True}, 'Title': {'$exists': True}, 'C_Application Date': {'$exists': True}, 'C_Publication Date': {'$exists': True}}).limit(100))):
    text = generate_query(item)

    # Collect the known names. The query does not require these fields to
    # exist, so use .get() and treat a missing, None or empty value as no names.
    # NOTE(review): assumes each field holds a list of name strings — confirm.
    names = []
    for field in ('Applicant', 'Inventor'):
        names.extend(item.get(field) or [])

    # Drop near-duplicate names, comparing case-insensitively like the entity
    # matching below.
    unique_names = []
    for name in names:
        if not any(fuzz.token_sort_ratio(name.lower(), kept.lower()) > 70 for kept in unique_names):
            unique_names.append(name)
    names = unique_names
    print(names)

    doc = nlp(text)
    names_matches = [0] * len(names)
    for ent in doc.ents:
        ent_lower = ent.text.lower()
        for idx, known_name in enumerate(names):
            if fuzz.token_sort_ratio(ent_lower, known_name.lower()) > 70:
                print(f"Found a match: {ent.text} with {known_name}")
                names_matches[idx] += 1
    for known_name, match_count in zip(names, names_matches):
        if match_count == 0:
            print(f"❌ Did not find a match for {known_name}")
        else:
            print(f"✔️ Found a match for {known_name} with {match_count}")