# fuzzy-search
Fuzzy search modules for finding lists of words and phrases in low-quality OCR (optical character recognition) and HTR (handwritten text recognition) text.

## Usage

In [1]:
from fuzzy_search import FuzzyPhraseSearcher
from fuzzy_search import PhraseModel

#from fuzzy_search.fuzzy_phrase_searcher import FuzzyPhraseSearcher
#from fuzzy_search.fuzzy_phrase_model import PhraseModel

# Use higher matching thresholds for higher quality OCR/HTR (higher precision, recall should be good anyway).
# Use lower matching thresholds for lower quality OCR/HTR (higher recall, as that's the main problem).

config = {
    # similarity thresholds a candidate string has to reach to count as a match
    "char_match_threshold": 0.6,
    "ngram_threshold": 0.5,
    "levenshtein_threshold": 0.6,
    # upper/lowercase is treated as a meaningful signal
    "ignorecase": False,
    "max_length_variance": 3,
    # skipgram settings
    "ngram_size": 2,
    "skip_size": 2,
}

# create a list of domain phrases
domain_phrases = [
    # terms for the chair and attendants of a meeting
    "PRAESIDE",
    "PRAESENTIBUS",
    # some weekdays in Latin
    "Veneris",
    "Mercurii",
    # some date phrase where any date in January 1725 should match
    "den .. Januarii 1725",
]

# build the phrase model from the domain phrases
phrase_model = PhraseModel(phrases=domain_phrases)
# initialize a new searcher instance with the config and the phrase model built above
# (previously the raw `domain_phrases` list was passed here and `phrase_model` was never used)
fuzzy_searcher = FuzzyPhraseSearcher(config=config, phrase_model=phrase_model)

# take some example texts: meetings of the Dutch States General in January 1725
# The strings are kept verbatim, recognition errors included (e.g. "Veucris", "PR&ASIDE"):
# the match strings and offsets printed by the search cells depend on these exact characters.
text1 = "ie Veucris den 5. Januaris 1725. PR&ASIDE, Den Heere Bentinck. PRASENTIEBUS, De Heeren Jan Welderen , van Dam, Torck , met een extraordinaris Gedeputeerde uyt de Provincie van Gelderlandt. Van Maasdam , vanden Boeizelaar , Raadtpenfionaris van Hoornbeeck , met een extraordinaris Gedeputeerde uyt de Provincie van Hollandt ende Welt-Vrieslandt. Velters, Ockere , Noey; van Hoorn , met een extraordinaris Gedeputeerde uyt de Provincie van Zeelandt. Van Renswoude , van Voor{t. Van Schwartzenbergh, vander Waayen, Vegilin Van I{elmuden. Van Iddekinge ‚ van Tamminga."

# second example: another meeting from the same month
text2 = "Mercuri: den 10. Jangarii, 1725. ia PRESIDE, Den Heere an Iddekinge. PRA&SENTIBUS, De Heeren /an Welderen , van Dam, van Wynbergen, Torck, met een extraordinaris Gedeputeerde uyt de Provincie van Gelderland. Van Maasdam , Raadtpenfionaris van Hoorn=beeck. Velters, Ockerfe, Noey. Taats van Amerongen, van Renswoude. Vander Waasen , Vegilin, ’ Bentinck, van I(elmaden. Van Tamminga."



The `find_matches` method returns match objects:

In [2]:
# Search the first example text and show each match on its own line.
text1_matches = fuzzy_searcher.find_matches(text1)
for phrase_match in text1_matches:
    print(phrase_match)


PhraseMatch(phrase: "Veneris", variant: "Veneris", string: "Veucris", offset: 3, ignorecase: False, levenshtein_similarity: 0.7142857142857143)
PhraseMatch(phrase: "den .. Januarii 1725", variant: "den .. Januarii 1725", string: "den 5. Januaris 1725", offset: 11, ignorecase: False, levenshtein_similarity: 0.9)
PhraseMatch(phrase: "PRAESIDE", variant: "PRAESIDE", string: "PR&ASIDE", offset: 33, ignorecase: False, levenshtein_similarity: 0.75)
PhraseMatch(phrase: "PRAESENTIBUS", variant: "PRAESENTIBUS", string: "PRASENTIEBUS", offset: 63, ignorecase: False, levenshtein_similarity: 0.8333333333333334)



Printing the matches directly yields the following output:

In [7]:
# Print the matches for the first example text directly;
# print() shows the PhraseMatch(...) text form of each match object.
for found in fuzzy_searcher.find_matches(text1):
    print(found)


PhraseMatch(phrase: "Veneris", variant: "Veneris", string: "Veucris", offset: 3, ignorecase: False, levenshtein_similarity: 0.7142857142857143)
PhraseMatch(phrase: "den .. Januarii 1725", variant: "den .. Januarii 1725", string: "den 5. Januaris 1725", offset: 11, ignorecase: False, levenshtein_similarity: 0.9)
PhraseMatch(phrase: "PRAESIDE", variant: "PRAESIDE", string: "PR&ASIDE", offset: 33, ignorecase: False, levenshtein_similarity: 0.75)
PhraseMatch(phrase: "PRAESENTIBUS", variant: "PRAESENTIBUS", string: "PRASENTIEBUS", offset: 63, ignorecase: False, levenshtein_similarity: 0.8333333333333334)


Alternatively, each match object can generate a JSON representation of the match containing all information:


In [5]:
# Search the first example text and print the JSON representation of every match.
for found in fuzzy_searcher.find_matches(text1):
    match_json = found.json()
    print(match_json)


{'type': 'PhraseMatch', 'phrase': 'Veneris', 'variant': 'Veneris', 'string': 'Veucris', 'offset': 3, 'label': None, 'ignorecase': False, 'text_id': None, 'match_scores': {'char_match': 0.7142857142857143, 'ngram_match': 0.625, 'levenshtein_similarity': 0.7142857142857143}}
{'type': 'PhraseMatch', 'phrase': 'den .. Januarii 1725', 'variant': 'den .. Januarii 1725', 'string': 'den 5. Januaris 1725', 'offset': 11, 'label': None, 'ignorecase': False, 'text_id': None, 'match_scores': {'char_match': 0.9, 'ngram_match': 0.8095238095238095, 'levenshtein_similarity': 0.9}}
{'type': 'PhraseMatch', 'phrase': 'PRAESIDE', 'variant': 'PRAESIDE', 'string': 'PR&ASIDE', 'offset': 33, 'label': None, 'ignorecase': False, 'text_id': None, 'match_scores': {'char_match': 0.875, 'ngram_match': 0.6666666666666666, 'levenshtein_similarity': 0.75}}
{'type': 'PhraseMatch', 'phrase': 'PRAESENTIBUS', 'variant': 'PRAESENTIBUS', 'string': 'PRASENTIEBUS', 'offset': 63, 'label': None, 'ignorecase': False, 'text_id': N

Running the searcher on the second text:

In [6]:
# Same as for the first text, now for the second example text:
# print the JSON representation of each match.
text2_matches = fuzzy_searcher.find_matches(text2)
for found in text2_matches:
    print(found.json())


{'type': 'PhraseMatch', 'phrase': 'Mercurii', 'variant': 'Mercurii', 'string': 'Mercuri', 'offset': 0, 'label': None, 'ignorecase': False, 'text_id': None, 'match_scores': {'char_match': 0.875, 'ngram_match': 0.8888888888888888, 'levenshtein_similarity': 0.875}}
{'type': 'PhraseMatch', 'phrase': 'den .. Januarii 1725', 'variant': 'den .. Januarii 1725', 'string': 'den 10. Jangarii, 1725', 'offset': 9, 'label': None, 'ignorecase': False, 'text_id': None, 'match_scores': {'char_match': 0.9, 'ngram_match': 0.7619047619047619, 'levenshtein_similarity': 0.8181818181818181}}
{'type': 'PhraseMatch', 'phrase': 'PRAESIDE', 'variant': 'PRAESIDE', 'string': 'PRESIDE', 'offset': 36, 'label': None, 'ignorecase': False, 'text_id': None, 'match_scores': {'char_match': 0.875, 'ngram_match': 0.7777777777777778, 'levenshtein_similarity': 0.875}}
{'type': 'PhraseMatch', 'phrase': 'PRAESENTIBUS', 'variant': 'PRAESENTIBUS', 'string': 'PRA&SENTIBUS', 'offset': 69, 'label': None, 'ignorecase': False, 'text_i

Match objects can also generate Web Annotation representations:

In [8]:
import json

# Pass the second example text together with an identifier;
# the identifier shows up as the target source in the printed annotation.
text2_with_id = dict(
    text=text2,
    id="urn:republic:3783_0076:page=151:para=4",
)
matches = fuzzy_searcher.find_matches(text2_with_id)

# Render the first match as a Web Annotation and pretty-print it.
first_match_anno = matches[0].as_web_anno()
print(json.dumps(first_match_anno, indent=2))

{
  "@context": "http://www.w3.org/ns/anno.jsonld",
  "id": "f08a7d9a-5b17-4338-b08d-3cafc8276c15",
  "type": "Annotation",
  "motivation": "classifying",
  "created": "2023-04-18T14:10:28.582779",
  "generator": {
    "id": "https://github.com/marijnkoolen/fuzzy-search",
    "type": "Software",
    "name": "fuzzy-search"
  },
  "target": {
    "source": "urn:republic:3783_0076:page=151:para=4",
    "selector": {
      "type": "TextPositionSelector",
      "start": 0,
      "end": 7
    }
  },
  "body": [
    {
      "type": "TextualBody",
      "purpose": "tagging",
      "format": "text",
      "value": "Mercurii"
    },
    {
      "type": "TextualBody",
      "purpose": "highlighting",
      "format": "text",
      "value": "Mercuri"
    },
    {
      "type": "TextualBody",
      "purpose": "correcting",
      "format": "text",
      "value": "Mercurii"
    }
  ]
}


In [1]:
# (Re)load IPython's autoreload extension and switch it to mode 2.
# NOTE(review): presumably used so that edits to the fuzzy_search modules are picked up
# without restarting the kernel -- confirm this is still wanted in the published notebook.
%reload_ext autoreload
%autoreload 2

In [2]:
from fuzzy_search.fuzzy_phrase_searcher import FuzzyPhraseSearcher

# Create a searcher without passing any configuration, so defaults apply.
fuzzy_searcher = FuzzyPhraseSearcher()
# Tell the searcher which phrases to look for.
fuzzy_searcher.index_phrases([
    'Makelaars',
    'Tabak',
    'Koffie',
])

# Example sentence containing OCR mistakes.
text = 'De Makelaets sullen verkopen twee balen Tobacco en Javaansche Koffy.'
# Last expression of the cell: the fuzzy matches found in the sentence.
fuzzy_searcher.find_matches(text)


[PhraseMatch(phrase: "Makelaars", variant: "Makelaars",string: "Makelaets", offset: 3),
 PhraseMatch(phrase: "Koffie", variant: "Koffie",string: "Koffy", offset: 62)]

In [3]:
# Searcher configuration; every option is explained next to its value.
config = dict(
    # these thresholds work when there are few OCR errors
    char_match_threshold=0.8,
    ngram_threshold=0.6,
    levenshtein_threshold=0.8,
    # is upper/lowercase a meaningful signal?
    ignorecase=False,
    # should matches follow word boundaries?
    use_word_boundaries=False,
    # for phrases that have variant phrasings
    include_variants=False,
    # avoid matching with similar but different phrases
    filter_distractors=False,
    # the matching string may differ in length from the phrase
    max_length_variance=3,
    # higher ngram size allows fewer character differences
    ngram_size=3,
    # fewer skips is much faster but less exhaustive
    skip_size=1,
)

# init searcher, overriding some defaults
fuzzy_searcher = FuzzyPhraseSearcher(config)


In [4]:
from fuzzy_search.fuzzy_phrase_searcher import FuzzyPhraseSearcher

# Create a searcher with variant matching switched on.
fuzzy_searcher = FuzzyPhraseSearcher({'include_variants': True})

# Phrases to register; an entry may list optional variants.
phrases = [
    {'phrase': 'Makelaars'},
    {'phrase': 'Tabak', 'variants': ['Tobacco']},
    {'phrase': 'Koffie'},
]
fuzzy_searcher.index_phrase_model(phrases)

# Example sentence containing OCR mistakes.
text = 'De Makelaets sullen verkopen twee balen Tobacco en Javaansche Koffy.'
# Last expression of the cell: all fuzzy matches, now including the variant "Tobacco".
fuzzy_searcher.find_matches(text)


[PhraseMatch(phrase: "Makelaars", variant: "Makelaars",string: "Makelaets", offset: 3),
 PhraseMatch(phrase: "Tabak", variant: "Tobacco",string: "Tobacco", offset: 40),
 PhraseMatch(phrase: "Koffie", variant: "Koffie",string: "Koffy", offset: 62)]

In [5]:
from fuzzy_search.fuzzy_phrase_model import PhraseModel

# Build a PhraseModel from the `phrases` list; that list is not defined in this cell,
# so an earlier cell that assigns `phrases` has to be run first.
phrase_model = PhraseModel(phrases)



In [6]:
# Example sentence with a phrase ("Metselaers") that resembles a registered phrase
# but is a different word.
text = 'De Metselaers sullen verkopen twee zaken cement.'
# Last expression of the cell: the fuzzy matches for this sentence.
fuzzy_searcher.find_matches(text)


[PhraseMatch(phrase: "Makelaars", variant: "Makelaars",string: "Metselaers", offset: 3)]

In [7]:
# Register the phrase together with a distractor: a similar phrase that should not match.
phrases = [
    {'phrase': 'Makelaars', 'distractors': ['Metselaars']},
]
fuzzy_searcher.index_phrase_model(phrases)

# Example sentence containing a word close to the distractor.
text = 'De Metselaers sullen verkopen twee zaken cement.'
# Last expression of the cell: search again, this time with distractor filtering enabled.
fuzzy_searcher.find_matches(text, filter_distractors=True)


[]

In [8]:
# Searcher with both variant matching and distractor filtering enabled.
fuzzy_searcher = FuzzyPhraseSearcher({'include_variants': True, 'filter_distractors': True})

# Phrases with labels; a label can be a single string or a list of strings.
phrases = [
    {
        'phrase': 'Makelaars',
        'label': ['person_role', 'auction_broker'],
        'distractors': ['Metselaars'],
    },
    {
        'phrase': 'Tabak',
        'label': 'auction_good',
        'variants': ['Tobacco'],
    },
    {
        'phrase': 'Koffie',
        'label': 'auction_good',
    },
]
fuzzy_searcher.index_phrase_model(phrases)

# Two example sentences with OCR mistakes, joined into one text.
text = (
    'De Makelaets sullen verkopen twee balen Tobacco en Javaansche Koffy. '
    'De Metselaers sullen verkopen twee zaken cement.'
)

# Print offset, matched string, phrase and label list for every match.
matches = fuzzy_searcher.find_matches(text)
for match in matches:
    print(
        f"{match.offset: >4}\t{match.string: <20}\t{match.phrase.phrase_string: <20}",
        match.label_list,
    )


   3	Makelaets           	Makelaars            ['person_role', 'auction_broker']
  40	Tobacco             	Tabak                ['auction_good']
  62	Koffy               	Koffie               ['auction_good']


In [11]:
# English-language auction notice, assembled from adjacent string literals.
# NOTE(review): the next code cell reassigns `text` before any search uses this value.
text = (
    "Auction op Prime Tobaccos. The Executors of the late JOHN BENNETT,"
    " Tobacco Merchant,will Sell by AUCTION, at HALL'S Sale Room,"
    " Commercial Buildings, Cork, TUESDAY the 14th October."
)


In [12]:
from fuzzysearch import find_near_matches

# Same phrases as in the variants example; only the 'phrase' value is searched below.
phrases = [
    {'phrase': 'Makelaars'},
    {'phrase': 'Tabak', 'variants': ['Tobacco']},
    {'phrase': 'Koffie'},
]

# Example sentence containing OCR mistakes.
text = 'De Makelaets sullen verkopen twee balen Tobacco en Javaansche Koffy.'

# Run find_near_matches once per phrase, allowing a Levenshtein distance of at most 2.
for entry in phrases:
    matches = find_near_matches(entry['phrase'], text, max_l_dist=2)
    print(matches)

[Match(start=3, end=12, dist=2, matched='Makelaets')]
[Match(start=40, end=45, dist=2, matched='Tobac')]
[Match(start=62, end=68, dist=2, matched='Koffy.')]


In [13]:
# Text from Delpher newspaper archive
# The literal is kept verbatim, including line breaks and trailing spaces,
# because the match offsets printed below depend on its exact characters.
text = """n 't Volck inSpanje en Portugacl ten tijdn van de Slag van Almauza , 
tc geven! W i|l de Intikeoingcn in deExchequer van dc 600000 Ponden, toegeftaert 
door middel van Lijfrenten te veikoopcn, door de Alakei&ers by na gecompletecrt 
zijn (die fy Wc'er mei groot propje vetkoopen) werden al itilcke lotekeinngtn te 
niet geraaeckt door etnCl»uful,die bjr dc Lijfremeo-Bil, 
dpwclcke nu ftact te pafleren,gevoegt is }"""

# Search for 'Makelaars' with find_near_matches, allowing a Levenshtein distance of up to 5.
print(find_near_matches('Makelaars', text, max_l_dist=5))

[Match(start=203, end=211, dist=4, matched='akei&ers')]


In [18]:
# Search the current `text` for a single phrase, using a searcher with default settings.
fuzzy_searcher = FuzzyPhraseSearcher()
fuzzy_searcher.index_phrases(['Makelaars'])

# Print the JSON representation of every match.
matches = fuzzy_searcher.find_matches(text)
for found in matches:
    print(found.json())

{'type': 'PhraseMatch', 'phrase': 'Makelaars', 'variant': 'Makelaars', 'string': 'Alakei&ers', 'offset': 201, 'label': None, 'text_id': None, 'match_scores': {'char_match': 0.6666666666666666, 'ngram_match': 0.5, 'levenshtein_similarity': 0.5}}


In [34]:
from fuzzy_search.fuzzy_phrase_model import PhraseModel
from fuzzy_search.fuzzy_template_searcher import FuzzyTemplateSearcher, FuzzyTemplate

# Labelled phrases; the labels are what the template in the next cell refers to.
phrases = [
    {
        'phrase': 'Makelaars',
        'label': ['person_role', 'auction_broker'],
        'distractors': ['Metselaars'],
    },
    {
        'phrase': 'Tabak',
        'label': 'auction_good',
        'variants': ['Tobacco'],
    },
    {
        'phrase': 'Koffie',
        'label': 'auction_good',
    },
]

# Wrap the phrases in a PhraseModel.
phrase_model = PhraseModel(phrases)


In [35]:
# A flat template: a list of phrase labels.
template = ['auction_broker', 'auction_good']

# Combine phrase model and template, then build a template searcher on top of them.
fuzzy_template = FuzzyTemplate(phrase_model, template)
searcher_config = {'include_variants': True, 'filter_distractors': True}
template_searcher = FuzzyTemplateSearcher(fuzzy_template, searcher_config)

# Two example sentences with OCR mistakes, joined into one text.
text = (
    'De Makelaets sullen verkopen twee balen Tobacco en Javaansche Koffy. '
    'De Metselaers sullen verkopen twee zaken cement.'
)

# First find the phrase matches, then group them into template matches.
phrase_matches = template_searcher.find_matches(text)
template_matches = template_searcher.find_template_matches(phrase_matches)

# Per template element, list phrase, matched string and offset of its phrase matches.
for template_match in template_matches:
    for element_match in template_match.element_matches:
        print('Template element:', element_match['label'])
        for phrase_match in element_match['phrase_matches']:
            print(f'\t{phrase_match.phrase.phrase_string: <15}{phrase_match.string: <15}{phrase_match.offset: >4}')

Template element: auction_broker
	Makelaars      Makelaets         3
Template element: auction_good
	Tabak          Tobacco          40
	Koffie         Koffy            62


In [None]:
# Nested template: a top-level group containing two sub-groups,
# each with leaf elements that carry a label, a required flag and a cardinality.
# NOTE(review): both sub-groups are labelled 'auction_event' -- confirm that is intended.
template = {
    'label': 'auction',
    'ordered': True,
    'type': 'group',
    'elements': [
        # first sub-group: ordered
        {
            'label': 'auction_event',
            'ordered': True,
            'type': 'group',
            'elements': [
                {'label': 'auction_broker', 'required': True, 'cardinality': 'single'},
                {'label': 'auction_location', 'required': True, 'cardinality': 'single'},
                {'label': 'auction_date', 'required': False, 'cardinality': 'single'},
            ],
        },
        # second sub-group: unordered
        {
            'label': 'auction_event',
            'ordered': False,
            'type': 'group',
            'elements': [
                {'label': 'auction_unit', 'required': False, 'cardinality': 'multi'},
                {'label': 'auction_good', 'required': True, 'cardinality': 'multi'},
            ],
        },
    ],
}


In [3]:
from fuzzy_search.tokenization.token import Tokenizer
from fuzzy_search.tokenization.string import text2skipgrams
from fuzzy_search.tokenization.string import SkipGram

tokenizer = Tokenizer()

# Tokenize the first example text and inspect the skipgram matches of each token.
# Relies on `text1`, `fuzzy_searcher` and `config` from earlier cells.
doc1 = tokenizer.tokenize(text1)
for token in doc1:
    # echo tokens whose `n` attribute occurs in the phrase model's token_in_phrase
    if token.n in fuzzy_searcher.phrase_model.token_in_phrase:
        print(token)
    token_string = token.normalised_string
    skip_matches = fuzzy_searcher.find_skipgram_token_matches({'text': token_string})
    # count the skipgrams generated for this token
    skips = text2skipgrams(token_string, ngram_size=config['ngram_size'], skip_size=config['skip_size'])
    num_skips = sum(1 for _ in skips)
    print(token, token.char_index, len(token_string), num_skips)
    for phrase in skip_matches.match_offsets:
        # copy each matched skipgram with its offset shifted by the token's char_index
        doc_skips = [
            SkipGram(skipgram.string, skipgram.offset + token.char_index, skipgram.length)
            for skipgram in skip_matches.match_skipgrams[phrase]
        ]
        print('\t', phrase.phrase_string)
        print('\t\t', skip_matches.match_set[phrase])
        print('\t\t', skip_matches.match_offsets[phrase])
        print('\t\t', [(doc_skip.string, doc_skip.offset, doc_skip.length) for doc_skip in doc_skips])
        

'ie' 0 2 1
'Veucris' 3 7 15
'den' 11 3 3
'5.' 15 2 1
'Januaris' 18 8 18
'1725.' 27 5 9
'PR&ASIDE,' 33 9 21
'Den' 43 3 3
'Heere' 47 5 9
'Bentinck.' 53 9 21
'PRASENTIEBUS,' 63 13 33
'De' 77 2 1
'Heeren' 80 6 12
'Jan' 87 3 3
'Welderen' 91 8 18
',' 100 1 0
'van' 102 3 3
'Dam,' 106 4 6
'Torck' 111 5 9
',' 117 1 0
'met' 119 3 3
'een' 123 3 3
'extraordinaris' 127 14 36
'Gedeputeerde' 142 12 30
'uyt' 155 3 3
'de' 159 2 1
'Provincie' 162 9 21
'van' 172 3 3
'Gelderlandt.' 176 12 30
'Van' 189 3 3
'Maasdam' 193 7 15
',' 201 1 0
'vanden' 203 6 12
'Boeizelaar' 210 10 24
',' 221 1 0
'Raadtpenfionaris' 223 16 42
'van' 240 3 3
'Hoornbeeck' 244 10 24
',' 255 1 0
'met' 257 3 3
'een' 261 3 3
'extraordinaris' 265 14 36
'Gedeputeerde' 280 12 30
'uyt' 293 3 3
'de' 297 2 1
'Provincie' 300 9 21
'van' 310 3 3
'Hollandt' 314 8 18
'ende' 323 4 6
'Welt-Vrieslandt.' 328 16 42
'Velters,' 345 8 18
'Ockere' 354 6 12
',' 361 1 0
'Noey;' 363 5 9
'van' 369 3 3
'Hoorn' 373 5 9
',' 379 1 0
'met' 381 3 3
'een' 385 3 3
'extr

In [None]:
# ngram and skip sizes as plain variables (same values as in the first config dict)
ngram_size, skip_size = 2, 2
