In [47]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import numpy as np
import difflib
import jellyfish

In [48]:
def get_closest_match(x, list_strings):
  """Return the candidate most similar to ``x`` by Jaro-Winkler score.

  Args:
    x: The string to look up.
    list_strings: Iterable of candidate strings compared against ``x``.

  Returns:
    The first candidate with the highest score, or ``None`` when
    ``list_strings`` is empty or no candidate scores above 0.
  """
  # Newer jellyfish releases expose the scorer as ``jaro_winkler_similarity``
  # and no longer ship the old ``jaro_winkler`` name, while old releases only
  # have the old one. Use whichever exists so the notebook runs on both.
  score_fn = getattr(jellyfish, "jaro_winkler_similarity", None)
  if score_fn is None:
    score_fn = jellyfish.jaro_winkler

  best_match = None
  highest_jw = 0

  for current_string in list_strings:
    current_score = score_fn(x, current_string)

    # Strict ">" keeps the earliest candidate when scores tie.
    if current_score > highest_jw:
      highest_jw = current_score
      best_match = current_string

  return best_match

In [49]:
# Plain similarity ratio of two strings that differ only by a trailing "!"
# (recorded result: 97).
fuzz.ratio("this is a test", "this is a test!")

97

In [50]:
# The two strings contain the same words in a different order
# (recorded token_sort_ratio: 100).
fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")

100

In [51]:
# The second string repeats the word "fuzzy" but otherwise has the same words
# (recorded token_set_ratio: 100).
fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")

100

In [52]:
# The first string appears verbatim inside the second
# (recorded partial_ratio: 100).
fuzz.partial_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")

100

In [53]:
query = "Barack Obama"
choices = [
    "Barack H Obama",
    "Barack H. Obama",
    "B. Obama",
]

# Rank the choices against the query, best score first (default limit is 5).
# This is not the cell's last expression, so the notebook does not display it.
process.extract(query, choices)
# [('Barack H Obama', 95), ('Barack H. Obama', 95), ('B. Obama', 85)]

# Keep only the single best-scoring choice; this is the value the cell shows.
process.extractOne(query, choices)
# ('Barack H Obama', 95)

('Barack H Obama', 95)

In [54]:
from pyphonetics import Soundex
# Shared Soundex encoder instance used by the following cells.
soundex = Soundex()

In [55]:
# Soundex code for a single name (recorded result: 'R163').
soundex.phonetics('Rupert')

'R163'

In [56]:
# Ask whether two differently spelled names sound alike (recorded result: True).
soundex.sounds_like('Robert', 'Rupert')

True

In [57]:
from pyphonetics import Metaphone
# Metaphone encoding of a longer word (recorded result: 'TSKRMNXN').
metaphone = Metaphone()
metaphone.phonetics('discrimination')

'TSKRMNXN'

In [58]:
from pyphonetics import RefinedSoundex
# Refined Soundex distance between two names using the library's default
# metric (recorded result: 0).
rs = RefinedSoundex()
rs.distance('Rupert', 'Robert')

0

In [59]:
# Distance between two words with the metric set explicitly to 'hamming'
# (recorded result: 2).
rs.distance('assign', 'assist', metric='hamming')

2

In [60]:
# Reference frame: numbers indexed by their correctly spelled English names.
df1 = pd.DataFrame(
    {'number': [1, 2, 3, 4, 5]},
    index=['one', 'two', 'three', 'four', 'five'],
)
df1

Unnamed: 0,number
one,1
two,2
three,3
four,4
five,5


In [61]:
# Frame to be aligned: letters indexed by names, two of which ('too', 'fours')
# do not match the spelling used in df1.
df2 = pd.DataFrame(
    {'letter': ['a', 'b', 'c', 'd', 'e']},
    index=['one', 'too', 'three', 'fours', 'five'],
)
df2

Unnamed: 0,letter
one,a
too,b
three,c
fours,d
five,e


In [62]:
# Replace each df2 label with its closest df1 label so both frames share an
# index. get_close_matches returns an empty list when no candidate clears its
# similarity cutoff, so fall back to the original label instead of letting
# `[0]` raise IndexError. Only the best match is needed, hence n=1.
df2.index = df2.index.map(
    lambda x: (difflib.get_close_matches(x, df1.index, n=1) or [x])[0]
)
df2

Unnamed: 0,letter
one,a
two,b
three,c
four,d
five,e


In [63]:
# Column-based variant: both frames carry the key in a 'name' column.
df1 = pd.DataFrame([[1,'one'],[2,'two'],[3,'three'],[4,'four'],[5,'five']], columns=['number', 'name'])
df2 = pd.DataFrame([['a','one'],['b','too'],['c','three'],['d','fours'],['e','five']], columns=['letter', 'name'])

# Replace each df2 name with the closest df1 name. When no candidate clears the
# difflib cutoff the match list is empty, so keep the original value instead
# of letting `[0]` raise IndexError. Only the best match is needed, hence n=1.
df2['name'] = df2['name'].apply(
    lambda x: (difflib.get_close_matches(x, df1['name'], n=1) or [x])[0]
)
# Name the join key explicitly instead of relying on the shared-column default.
df1.merge(df2, on='name')

Unnamed: 0,number,name,letter
0,1,one,a
1,2,two,b
2,3,three,c
3,4,four,d
4,5,five,e


In [65]:
# Rebuild the two index-keyed frames; df2's labels include the misspellings
# 'too' and 'fours'.
df1 = pd.DataFrame(
    {'number': [1, 2, 3, 4, 5]},
    index=['one', 'two', 'three', 'four', 'five'],
)
df2 = pd.DataFrame(
    {'letter': ['a', 'b', 'c', 'd', 'e']},
    index=['one', 'too', 'three', 'fours', 'five'],
)

# Replace each df2 label with the best-scoring df1 label from get_closest_match.
df2.index = [get_closest_match(label, df1.index) for label in df2.index]

df1.join(df2)

Unnamed: 0,number,letter
one,1,a
two,2,b
three,3,c
four,4,d
five,5,e
