# Text matching

Now I will write functions to [fuzzy match](https://www.datacamp.com/community/tutorials/fuzzy-string-python) titles grabbed using OCR to cleaned up titles from the dataset.

In [57]:
# text matching
import Levenshtein as lev
import fuzzywuzzy as fuzz
from fuzzywuzzy import fuzz

import pandas as pd
import re
import string

import pickle

## Importing and preprocessing books data

In [58]:
!ls data

[34mBX-CSV-Dump[m[m           rated_books_clean.csv users_clean.csv
[34mBX-SQL-Dump[m[m           rated_books_clean.pkl users_clean.pkl
BX-SQL-Dump.zip       ratings_clean.csv     vt.csv
ISBNs.csv             ratings_clean.pkl     vt_10.csv
all_books_clean.csv   sigma.csv             vt_100.csv
all_books_clean.pkl   u.csv                 vt_250.csv
rated_books_clean     user_item_mat.csv     vt_500.csv


In [60]:
with open('data/all_books_clean.pkl', 'rb') as f:
    books_all = pickle.load(f)
    
with open('data/rated_books_clean.pkl', 'rb') as f:
    books_rated = pickle.load(f)

In [61]:
# same preprocessing function from OCR step

def clean_string(text):
    # clean string to remove non-ASCII text
    text = "".join([c if ord(c) < 128 else "" for c in text])

    # standard cleaning
    text = text.lower().strip()
    
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…–]', '', text)
    text = re.sub('\n', '', text)
    
    return text

### Books all

In [62]:
books_all.columns

Index(['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher',
       'imageUrlS', 'imageUrlM', 'imageUrlL'],
      dtype='object')

In [63]:
books_all['title_author'] = (books_all.bookTitle + ' ' + books_all.bookAuthor)

In [64]:
books_all.title_author = books_all.title_author.astype(str)

In [65]:
books_all.title_author = books_all.title_author.map(lambda x: clean_string(x))

In [66]:
books_all.title_author

0                      classical mythology mark p o morford
1                         clara callan richard bruce wright
2                          decision in normandy carlo deste
3         flu the story of the great influenza pandemic ...
4                       the mummies of urumchi e j w barber
                                ...                        
271355             theres a bat in bunk five paula danziger
271356                   from one to one hundred teri sloat
271357    lily dale  the true story of the town that tal...
271358                       republic worlds classics plato
271359    a guided tour of rene descartes meditations on...
Name: title_author, Length: 266664, dtype: object

In [85]:
# pickling df with changes
pd.to_pickle(books_all, 'data/all_books_clean.pkl')

### Books rated

In [87]:
books_rated['title_author'] = (books_rated.bookTitle + ' ' + books_rated.bookAuthor)

In [88]:
books_rated.title_author = books_rated.title_author.astype(str)

In [89]:
books_rated.title_author = books_rated.title_author.map(lambda x: clean_string(x))

In [90]:
books_rated.title_author

1                         clara callan richard bruce wright
3         flu the story of the great influenza pandemic ...
5                             the kitchen gods wife amy tan
18                               the testament john grisham
19         beloved plume contemporary fiction toni morrison
                                ...                        
172571       the moon is a harsh mistress robert a heinlein
173803    the princess diaries the princess diaries vol ...
183917    ferne ufer der  band der groen highland saga d...
184473    fear and loathing in las vegas and other ameri...
186565    diversity alliance star wars young jedi knight...
Name: title_author, Length: 10077, dtype: object

In [92]:
# pickling df with changes
pd.to_pickle(books_rated, 'data/rated_books_clean.pkl')

## Fuzzy matching
### Sample data

In [73]:
scraped_titles = ['saa iia bia ie   bn on s i ecrencaiadili ie      si ey si   f im sli lak sa fale less aimle',
 'drow t diaz junot d',
 'alittle hist e history of poworld gombrich a i  of ti  the w',
 'aimmortality milan k a kundera im',
 'e great g igaisby f itzgeral dew the g',
 'narcissus and nd goldmund  hermann hes n hesse',
 'fall the  albert camus']

target_corpus = pd.Series(['billy collins aimless love',
                          'junot diaz drown',
                          'gombrich a little history of the world',
                          'milan kundera immortality',
                          'f scott fitzgerald the great gatsby',
                          'hermann hesse narcissus and goldmund',
                          'the fall albert camus'])

In [74]:
# testing fuzzy matching on scraped titles vs manually entered actual book titles

match_dict = {}

for i, Str1 in enumerate(scraped_titles):
    match_dict[Str1] = []
    for id, Str2 in enumerate(target_corpus):
        Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
        #print('Ratio: ', Str1, Str2, Ratio)

        Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
        #print('Partial Ratio: ', Str1, Str2, Partial_Ratio)

        Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
        #print('Token Sort Ratio: ', Str1, Str2, Token_Sort_Ratio)

        Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)
        #print('Token Set Ratio: ', Str1, Str2, Token_Set_Ratio)
        
        match_dict[Str1].append({id: (Ratio, Partial_Ratio, Token_Sort_Ratio, Token_Set_Ratio)})


In [75]:
match_dict

{'saa iia bia ie   bn on s i ecrencaiadili ie      si ey si   f im sli lak sa fale less aimle': [{0: (29,
    50,
    31,
    33)},
  {1: (13, 44, 14, 15)},
  {2: (25, 37, 28, 30)},
  {3: (24, 36, 28, 30)},
  {4: (25, 40, 26, 29)},
  {5: (27, 33, 32, 34)},
  {6: (21, 48, 27, 29)}],
 'drow t diaz junot d': [{0: (22, 26, 27, 27)},
  {1: (51, 56, 86, 86)},
  {2: (35, 37, 28, 28)},
  {3: (32, 37, 32, 32)},
  {4: (30, 37, 37, 37)},
  {5: (33, 32, 25, 25)},
  {6: (30, 26, 30, 30)}],
 'alittle hist e history of poworld gombrich a i  of ti  the w': [{0: (30,
    38,
    31,
    32)},
  {1: (21, 31, 19, 20)},
  {2: (49, 63, 62, 82)},
  {3: (28, 40, 29, 30)},
  {4: (34, 37, 34, 38)},
  {5: (29, 39, 28, 29)},
  {6: (27, 33, 35, 37)}],
 'aimmortality milan k a kundera im': [{0: (34, 35, 37, 37)},
  {1: (33, 38, 29, 29)},
  {2: (34, 36, 39, 39)},
  {3: (55, 65, 86, 86)},
  {4: (32, 33, 32, 32)},
  {5: (35, 36, 38, 38)},
  {6: (37, 38, 30, 30)}],
 'e great g igaisby f itzgeral dew the g': [{0: (28, 

Based off of this test, I will try using token set ratio with a minimum confidence threshold somewhere between 60-80%. I can tweak that more when I am matching to the actual dataset. 

## Testing on dataset

In [76]:
def best_match(input_str, df, min_confidence=75):
    '''
    gets best fuzzy match of text pulled from OCR to book/author in the book crossing dataset
    
    output:
        1. prints a string with string of title/author as they appear in df
        2. returns ISBN of book
        
    -----------------------
    
    input_str: 
        str, title/author pulled from OCR, should be 'books_all' df
        
    df: 
        series, all cleaned and preprocessed title/authors from dataset (books_all.title_author)
        
    min_confidence:
        int, min fuzzy match ratio for fuzz.token_set_ratio. default set to 70%.
    
    
    '''
        
    match_dict = {}
    
    for book in df.title_author:
        match_ratio = fuzz.token_set_ratio(input_str, book)
        match_dict[match_ratio] = book

    best_match_ratio = max(match_dict.keys())
    best_match_title = match_dict[best_match_ratio]
    best_match_index = df[df.title_author == best_match_title].index[0]
    best_match_isbn = df.iloc[best_match_index].ISBN
    
    if best_match_ratio > min_confidence:
        print("Closest match was '{}' at {}% confidence".format(best_match_title, best_match_ratio))

        return best_match_isbn
    
    else:
        print("No match! Closest match was '{}' at {}% confidence".format(best_match_title, best_match_ratio))



In [77]:
best_match(scraped_titles[1], books_all)

Closest match was 'drown junot diaz' at 86% confidence


'0747235465'

In [78]:
# testing that ISBN is correct

books_all[books_all.ISBN == '1573220418']

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL,title_author
71836,1573220418,Drown,Junot Diaz,1996,Riverhead Books,http://images.amazon.com/images/P/1573220418.0...,http://images.amazon.com/images/P/1573220418.0...,http://images.amazon.com/images/P/1573220418.0...,drown junot diaz


In [79]:
best_match(scraped_titles[3], books_all)

Closest match was 'immortality milan kundera' at 86% confidence


'0553213180'

In [80]:
# testing that ISBN is correct

books_all[books_all.ISBN == '0060974486']

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL,title_author
9573,60974486,Immortality,Milan Kundera,1992,Perennial,http://images.amazon.com/images/P/0060974486.0...,http://images.amazon.com/images/P/0060974486.0...,http://images.amazon.com/images/P/0060974486.0...,immortality milan kundera


In [81]:
best_match(scraped_titles[0], books_all)

No match! Closest match was 'a child shall lead them lessons about hope from children with cancer diane m komp' at 50% confidence


In [82]:
for title in scraped_titles:
    print(title,  '>>>', best_match(title, books_all), '\n')

No match! Closest match was 'a child shall lead them lessons about hope from children with cancer diane m komp' at 50% confidence
saa iia bia ie   bn on s i ecrencaiadili ie      si ey si   f im sli lak sa fale less aimle >>> None 

Closest match was 'drown junot diaz' at 86% confidence
drow t diaz junot d >>> 0747235465 

No match! Closest match was 'the story of art e h gombrich' at 74% confidence
alittle hist e history of poworld gombrich a i  of ti  the w >>> None 

Closest match was 'immortality milan kundera' at 86% confidence
aimmortality milan k a kundera im >>> 0553213180 

No match! Closest match was 'the great brain at the academy john fitzgerald' at 64% confidence
e great g igaisby f itzgeral dew the g >>> None 

Closest match was 'narcissus and goldmund hermann hesse' at 100% confidence
narcissus and nd goldmund  hermann hes n hesse >>> 0671721720 

Closest match was 'the fall  albert camus' at 100% confidence
fall the  albert camus >>> 0385489587 



In [83]:
[best_match(title, books_all) for title in scraped_titles]

No match! Closest match was 'a child shall lead them lessons about hope from children with cancer diane m komp' at 50% confidence
Closest match was 'drown junot diaz' at 86% confidence
No match! Closest match was 'the story of art e h gombrich' at 74% confidence
Closest match was 'immortality milan kundera' at 86% confidence
No match! Closest match was 'the great brain at the academy john fitzgerald' at 64% confidence
Closest match was 'narcissus and goldmund hermann hesse' at 100% confidence
Closest match was 'the fall  albert camus' at 100% confidence


[None, '0747235465', None, '0553213180', None, '0671721720', '0385489587']

In [84]:
books_all[books_all.ISBN == '0373217315']

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL,title_author
70569,373217315,Baby Be Mine (2 Novels in 1),Anne Marie Winston,2002,Silhouette,http://images.amazon.com/images/P/0373217315.0...,http://images.amazon.com/images/P/0373217315.0...,http://images.amazon.com/images/P/0373217315.0...,baby be mine novels in anne marie winston
