In [14]:
!pip install fuzzywuzzy



In [15]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz, process

In [16]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
base_names = pd.read_csv('base_names.csv')
name_variations = pd.read_csv('name_variations.csv')

In [18]:
base_names.head()

Unnamed: 0,Base_Name_ID,Base_Name
0,1,John Smith
1,2,Jennifer Brown
2,3,Michael O'Connor
3,4,Maria Garcia
4,5,Robert Lee


In [19]:
name_variations.head()

Unnamed: 0,Variation,Matches_With_Base_Name
0,Thomas King,Thomas King
1,ThomasKing,Thomas King
2,Maria Garcia,Maria Garcia
3,MaryLewis,Mary Lewis
4,Nancy W.,Nancy Wright


In [20]:
base_names.shape, name_variations.shape

((20, 2), (100, 2))

#### Preprocessing both Base Names and Name Variations

In [21]:
def preprocessing(name):
  name = name.lower()
  name = re.sub(r'\s+', ' ', name).strip()
  return name

In [22]:
base_names['cleaned_name'] = base_names['Base_Name'].apply(preprocessing)
name_variations['cleaned_name'] = name_variations['Variation'].apply(preprocessing)

#### Fuzzy Matching Names

In [25]:
def fuzzy_match_variations(variations, base_names, threshold = 80):
  matched_names = []

  for variation in variations:
    best_match, score, _ = process.extractOne(variation, base_names, scorer = fuzz.token_sort_ratio)

    if score >= threshold:
      matched_names.append((variation, best_match, score))
    else:
      matched_names.append((variation, None, score))

  return matched_names

In [30]:
matches = fuzzy_match_variations(name_variations['cleaned_name'], base_names['cleaned_name'], threshold = 70)

In [31]:
matched_df = pd.DataFrame(matches, columns = ['cleaned_name', 'best_match', 'similarity_score'])

In [32]:
results = name_variations.merge(matched_df, on = 'cleaned_name', how = 'left')

In [33]:
results = results.merge(base_names[['cleaned_name', 'Base_Name']],
                        left_on = 'best_match', right_on = 'cleaned_name',
                        how = 'left', suffixes = ('_variation', '_base'))

In [34]:
results = results[['Variation', 'Base_Name', 'similarity_score']]

#### Matched Names

In [35]:
results

Unnamed: 0,Variation,Base_Name,similarity_score
0,Thomas King,Thomas King,100
1,Thomas King,Thomas King,100
2,Thomas King,Thomas King,100
3,Thomas King,Thomas King,100
4,ThomasKing,,57
...,...,...,...
221,Paul Allen.,Paul Allen,100
222,Paul Allen.,Paul Allen,100
223,Paul Allen,Paul Allen,100
224,Paul Allen,Paul Allen,100


#### Variation and Base Name with Similarity Score above 80

In [36]:
res = results[results['similarity_score'] >= 80]
res

Unnamed: 0,Variation,Base_Name,similarity_score
0,Thomas King,Thomas King,100
1,Thomas King,Thomas King,100
2,Thomas King,Thomas King,100
3,Thomas King,Thomas King,100
5,Maria Garcia,Maria Garcia,100
...,...,...,...
221,Paul Allen.,Paul Allen,100
222,Paul Allen.,Paul Allen,100
223,Paul Allen,Paul Allen,100
224,Paul Allen,Paul Allen,100
