In [71]:
# (Re)load IPython's autoreload extension, then select mode 2 so that imported
# modules are reloaded before each cell runs and local source edits are picked up.
%reload_ext autoreload
%autoreload 2

In [150]:
from fuzzy_search.fuzzy_phrase_searcher import FuzzyPhraseSearcher
from fuzzy_search.fuzzy_phrase_model import PhraseModel

# higher matching thresholds for higher quality OCR/HTR (higher precision, recall should be good anyway)
# lower matching thresholds for lower quality OCR/HTR (higher recall, as that's the main problem)

# Matching settings that are handed to FuzzyPhraseSearcher below: three
# similarity thresholds, case handling, allowed length variance and the
# n-gram / skip sizes.
config = dict(
    char_match_threshold=0.6,
    ngram_threshold=0.5,
    levenshtein_threshold=0.6,
    ignorecase=False,
    max_length_variance=3,
    ngram_size=2,
    skip_size=2,
)

# Initialize a new searcher instance; the `config` dict defined above is
# passed in as its only argument.
fuzzy_searcher = FuzzyPhraseSearcher(config)

# Domain phrases to search for, grouped by kind and then combined into one list.

# terms for the chair and attendants of a meeting
meeting_role_terms = ["PRAESIDE", "PRAESENTIBUS"]

# some weekdays in Latin
latin_weekdays = ["Veneris", "Mercurii"]

# some date phrase where any date in January 1725 should match
january_1725_dates = ["den .. Januarii 1725"]

domain_phrases = meeting_role_terms + latin_weekdays + january_1725_dates

# Wrap the domain phrases in a PhraseModel so they can be indexed by the searcher.
phrase_model = PhraseModel(phrases=domain_phrases)

# Register the phrase model with the searcher; the find_matches() calls in the
# cells below report matches against these phrases.
fuzzy_searcher.index_phrase_model(phrase_model)

# Take some example texts: meetings of the Dutch States General in January 1725.
# The misspellings and stray symbols (e.g. "Veucris", "PR&ASIDE") are the noisy
# input the fuzzy search is demonstrated on, so the strings must be kept exactly as they are.
text1 = "ie Veucris den 5. Januaris 1725. PR&ASIDE, Den Heere Bentinck. PRASENTIEBUS, De Heeren Jan Welderen , van Dam, Torck , met een extraordinaris Gedeputeerde uyt de Provincie van Gelderlandt. Van Maasdam , vanden Boeizelaar , Raadtpenfionaris van Hoornbeeck , met een extraordinaris Gedeputeerde uyt de Provincie van Hollandt ende Welt-Vrieslandt. Velters, Ockere , Noey; van Hoorn , met een extraordinaris Gedeputeerde uyt de Provincie van Zeelandt. Van Renswoude , van Voor{t. Van Schwartzenbergh, vander Waayen, Vegilin Van I{elmuden. Van Iddekinge ‚ van Tamminga."

text2 = "Mercuri: den 10. Jangarii, 1725. ia PRESIDE, Den Heere an Iddekinge. PRA&SENTIBUS, De Heeren /an Welderen , van Dam, van Wynbergen, Torck, met een extraordinaris Gedeputeerde uyt de Provincie van Gelderland. Van Maasdam , Raadtpenfionaris van Hoorn=beeck. Velters, Ockerfe, Noey. Taats van Amerongen, van Renswoude. Vander Waasen , Vegilin, ’ Bentinck, van I(elmaden. Van Tamminga."



In [157]:
# Search the first example text and print the json() record of every match
# (phrase, matched string, offset and match scores).
text1_matches = fuzzy_searcher.find_matches(text1)
for match in text1_matches:
    print(match.json())


{'phrase': 'Veneris', 'variant': 'Veneris', 'string': 'Veucris', 'offset': 3, 'match_scores': {'char_match': 0.7142857142857143, 'ngram_match': 0.625, 'levenshtein_similarity': 0.7142857142857143}}
{'phrase': 'den .. Januarii 1725', 'variant': 'den .. Januarii 1725', 'string': 'den 5. Januaris 1725.', 'offset': 11, 'match_scores': {'char_match': 0.95, 'ngram_match': 0.7619047619047619, 'levenshtein_similarity': 0.8571428571428572}}
{'phrase': 'PRAESIDE', 'variant': 'PRAESIDE', 'string': 'PR&ASIDE,', 'offset': 33, 'match_scores': {'char_match': 0.875, 'ngram_match': 0.5555555555555556, 'levenshtein_similarity': 0.6666666666666667}}
{'phrase': 'PRAESENTIBUS', 'variant': 'PRAESENTIBUS', 'string': 'PRASENTIEBUS,', 'offset': 63, 'match_scores': {'char_match': 1.0, 'ngram_match': 0.6923076923076923, 'levenshtein_similarity': 0.7692307692307692}}


In [159]:
# Search the second example text and print the json() record of every match
# (phrase, matched string, offset and match scores).
text2_matches = fuzzy_searcher.find_matches(text2)
for match in text2_matches:
    print(match.json())


{'phrase': 'Mercurii', 'variant': 'Mercurii', 'string': 'Mercuri:', 'offset': 0, 'match_scores': {'char_match': 0.875, 'ngram_match': 0.7777777777777778, 'levenshtein_similarity': 0.875}}
{'phrase': 'den .. Januarii 1725', 'variant': 'den .. Januarii 1725', 'string': 'den 10. Jangarii, 1725.', 'offset': 9, 'match_scores': {'char_match': 0.95, 'ngram_match': 0.7142857142857143, 'levenshtein_similarity': 0.782608695652174}}
{'phrase': 'PRAESIDE', 'variant': 'PRAESIDE', 'string': 'PRESIDE,', 'offset': 36, 'match_scores': {'char_match': 0.875, 'ngram_match': 0.6666666666666666, 'levenshtein_similarity': 0.75}}
{'phrase': 'PRAESENTIBUS', 'variant': 'PRAESENTIBUS', 'string': 'PRA&SENTIBUS,', 'offset': 69, 'match_scores': {'char_match': 0.9166666666666666, 'ngram_match': 0.7692307692307693, 'levenshtein_similarity': 0.8461538461538461}}
