# Text matching

Now I will write functions to [fuzzy match](https://www.datacamp.com/community/tutorials/fuzzy-string-python) the titles grabbed using OCR against the cleaned-up titles from the dataset.

In [1]:
# text matching
import Levenshtein as lev
import fuzzywuzzy as fuzz
from fuzzywuzzy import fuzz


In [2]:
# Noisy OCR output for seven titles: misspellings, stray fragments and
# scrambled word order are kept exactly as scraped.
scraped_titles = [
    'gg ll  ns bill wi ea aimle',
    'drow diaz junot',
    'rule evle hl  history o world gombrich zoe the',
    'immortality milan kundera i',
    'great gatcrby itzgeral d the',
    'narcissus an d goldmund hermann he  hesse fsg fall',
    'lhe albert camus',
]

# Manually entered reference titles; entry k is the intended match for
# scraped_titles[k].
target_corpus = [
    'billy collins aimless love',
    'junot diaz drown',
    'gombrich a little history of the world',
    'milan kundera immortality',
    'f scott fitzgerald the great gatsby',
    'hermann hesse narcissus and goldmund',
    'the fall albert camus',
]

In [3]:
# Test fuzzy matching of the scraped titles against the manually entered
# actual book titles, scoring every (scraped, target) pair with four
# fuzzywuzzy metrics so they can be compared side by side.
#
# match_dict maps each scraped title to a list holding one single-key dict per
# target title:
#     {target_index: (ratio, partial_ratio, token_sort_ratio, token_set_ratio)}

match_dict = {}

for scraped in scraped_titles:
    # Lowercase once per scraped title instead of once per metric call.
    scraped_lower = scraped.lower()
    pair_scores = []

    # The index is called `target_idx`, not `id`: this loop runs at the top
    # level of the notebook, so a loop variable named `id` would overwrite the
    # builtin id() for every cell executed afterwards.
    for target_idx, target in enumerate(target_corpus):
        target_lower = target.lower()

        ratio = fuzz.ratio(scraped_lower, target_lower)
        partial_ratio = fuzz.partial_ratio(scraped_lower, target_lower)

        # The token-based metrics receive the unmodified strings, exactly as
        # in the original version of this cell.
        token_sort_ratio = fuzz.token_sort_ratio(scraped, target)
        token_set_ratio = fuzz.token_set_ratio(scraped, target)

        pair_scores.append(
            {target_idx: (ratio, partial_ratio, token_sort_ratio, token_set_ratio)}
        )

    match_dict[scraped] = pair_scores


In [4]:
# Show the collected scores: each scraped title maps to a list of
# {target index: (ratio, partial ratio, token sort ratio, token set ratio)}.
match_dict

{'gg ll  ns bill wi ea aimle': [{0: (46, 46, 63, 63)},
  {1: (24, 31, 24, 24)},
  {2: (34, 35, 35, 35)},
  {3: (35, 36, 40, 40)},
  {4: (36, 35, 33, 33)},
  {5: (26, 31, 33, 33)},
  {6: (38, 39, 39, 39)}],
 'drow diaz junot': [{0: (24, 27, 24, 24)},
  {1: (52, 53, 97, 97)},
  {2: (23, 33, 26, 26)},
  {3: (35, 40, 35, 35)},
  {4: (24, 27, 28, 28)},
  {5: (27, 33, 24, 24)},
  {6: (22, 27, 28, 28)}],
 'rule evle hl  history o world gombrich zoe the': [{0: (31, 35, 37, 37)},
  {1: (23, 31, 23, 23)},
  {2: (48, 53, 75, 81)},
  {3: (28, 32, 29, 29)},
  {4: (32, 37, 35, 38)},
  {5: (32, 36, 32, 32)},
  {6: (27, 33, 36, 36)}],
 'immortality milan kundera i': [{0: (34, 35, 34, 34)},
  {1: (37, 38, 33, 33)},
  {2: (34, 37, 37, 37)},
  {3: (58, 75, 96, 100)},
  {4: (35, 41, 35, 35)},
  {5: (32, 34, 35, 35)},
  {6: (33, 33, 29, 29)}],
 'great gatcrby itzgeral d the': [{0: (30, 31, 26, 26)},
  {1: (27, 31, 27, 27)},
  {2: (42, 46, 39, 45)},
  {3: (34, 36, 34, 34)},
  {4: (51, 57, 60, 60)},
  {5: (2

Based on this test, I will try using the token set ratio with a minimum confidence threshold somewhere between 60% and 80%. I can tune that further when I am matching against the actual dataset.