# Rule-based Name Matching Algorithm

In [None]:
from rapidfuzz import fuzz

# Module-level accumulators. rule_base_name_matching() appends one entry per
# ordered record pair to y_pred (prediction) and y_truth (ground truth).
y_pred, y_truth = [], []
# Placeholders for error analysis; nothing in the visible code fills them.
false_positive, miss_match = [], []

def check_first_letters(str1, str2):
    """Return True when both strings use exactly the same set of word initials.

    Each string is split on whitespace and the lower-cased first character of
    every word is collected into a set, so word order and repeated initials
    do not matter. The two sets must be equal; a subset is not enough.
    """
    initials_a = {token[0].lower() for token in str1.split()}
    initials_b = {token[0].lower() for token in str2.split()}
    return initials_a == initials_b

def check_belong(f1,f2,l1,l2,n1,n2):
    """Tell whether two records are close enough to count as near-duplicates.

    Args:
        f1, f2: first names of the two records.
        l1, l2: last names of the two records.
        n1, n2: full names of the two records.

    Returns:
        True when the full names score at least 95 on ``fuzz.token_ratio`` or
        100 on ``fuzz.partial_ratio``; otherwise True only when both the first
        names and the last names score at least 95 on either of those metrics.
    """
    # The full names alone can settle the question.
    if fuzz.token_ratio(n1, n2) >= 95 or fuzz.partial_ratio(n1, n2) >= 100:
        return True

    # Otherwise both name parts have to agree independently.
    first_names_agree = (
        fuzz.token_ratio(f1, f2) >= 95 or fuzz.partial_ratio(f1, f2) >= 95
    )
    if not first_names_agree:
        return False
    return fuzz.token_ratio(l1, l2) >= 95 or fuzz.partial_ratio(l1, l2) >= 95

def compare_first_last(f1,f2,l1,l2,n1,n2):
    """Check that full, last and first names are all loosely similar.

    Args:
        f1, f2: first names of the two records.
        l1, l2: last names of the two records.
        n1, n2: full names of the two records.

    Returns:
        True only when all three comparisons pass, checked in this order:
        full names (ratio >= 60 or token_set_ratio >= 70), last names and
        first names (each: ratio >= 70, token_set_ratio >= 70 or
        partial_ratio >= 81).
    """
    full_names_close = (
        fuzz.ratio(n1, n2) >= 60 or fuzz.token_set_ratio(n1, n2) >= 70
    )
    if not full_names_close:
        return False

    last_names_close = (
        fuzz.ratio(l1, l2) >= 70
        or fuzz.token_set_ratio(l1, l2) >= 70
        or fuzz.partial_ratio(l1, l2) >= 81
    )
    if not last_names_close:
        return False

    return (
        fuzz.ratio(f1, f2) >= 70
        or fuzz.token_set_ratio(f1, f2) >= 70
        or fuzz.partial_ratio(f1, f2) >= 81
    )

def _names_match(f1, f2, l1, l2, n1, n2, token_ratio, ratio, partial_ratio):
    """Apply the matching rules to one ordered pair of records."""
    if "." in n1 and "." in n2:
        # Both full names contain a period: only near-identical strings pass.
        return (
            fuzz.ratio(n1, n2) >= 95
            or fuzz.token_set_ratio(n1, n2) >= 95
            or fuzz.partial_ratio(n1, n2) >= 97
        )

    if fuzz.ratio(n1, n2) >= ratio:
        # Similar as plain strings: accept outright when the token-set score
        # is above its threshold, otherwise double-check the name parts.
        if fuzz.token_set_ratio(n1, n2) > token_ratio:
            return True
        return compare_first_last(f1, f2, l1, l2, n1, n2)

    if (fuzz.token_set_ratio(n1, n2) >= token_ratio
            or fuzz.partial_ratio(n1, n2) >= partial_ratio):
        return True

    # Last resort: identical word initials plus loosely similar name parts.
    if check_first_letters(n1, n2):
        return compare_first_last(f1, f2, l1, l2, n1, n2)
    return False


def rule_base_name_matching(df, token_ratio, ratio, partial_ratio):
    """Compare every ordered pair of rows and record which rows match.

    For each ordered pair ``(i, k)`` with ``i != k`` one ground-truth label
    (1 when both rows share the same ``ID``) is appended to the module-level
    ``y_truth`` list and one prediction to the module-level ``y_pred`` list.
    Both lists are emptied at the start of the call, so after it returns they
    describe this DataFrame only; without the reset a second run would be
    scored together with the pairs left over from the previous run.

    When a pair matches and ``check_belong`` also holds, one of the two rows
    is flagged (row ``i`` if its full name is shorter, otherwise row ``k``).
    Flagged rows are never matched again for the rest of the scan, so the
    result depends on row order.

    Args:
        df: DataFrame with the columns ``fullname``, ``firstname``,
            ``lastname``, ``Index`` and ``ID``.
        token_ratio: threshold for ``fuzz.token_set_ratio`` on full names.
        ratio: threshold for ``fuzz.ratio`` on full names.
        partial_ratio: threshold for ``fuzz.partial_ratio`` on full names.

    Returns:
        The same ``df`` object with a new ``GROUP`` column; each cell lists
        the ``Index`` values of the rows matched to that row.
    """
    # Start from a clean slate so that metrics are computed per dataset.
    y_truth.clear()
    y_pred.clear()

    n_rows = len(df)
    group = [[] for _ in range(n_rows)]
    belong = [False] * n_rows

    names = df['fullname'].values
    firstnames = df['firstname'].values
    lastnames = df['lastname'].values
    indexes = df['Index'].values
    ids = df['ID'].values

    for i in range(n_rows):
        for k in range(n_rows):
            if i == k:
                continue

            y_truth.append(1 if ids[i] == ids[k] else 0)

            # Flagged rows are excluded before any similarity is computed.
            is_match = (
                not belong[i]
                and not belong[k]
                and _names_match(
                    firstnames[i], firstnames[k],
                    lastnames[i], lastnames[k],
                    names[i], names[k],
                    token_ratio, ratio, partial_ratio,
                )
            )
            if not is_match:
                y_pred.append(0)
                continue

            y_pred.append(1)
            group[i].append(indexes[k])
            if check_belong(firstnames[i], firstnames[k],
                            lastnames[i], lastnames[k],
                            names[i], names[k]):
                if len(names[i]) < len(names[k]):
                    belong[i] = True
                else:
                    belong[k] = True

    df['GROUP'] = group
    return df


# Graph-based Clustering Algorithm

In [None]:
from collections import defaultdict

def cluster_by_DFS(df2):
    """Label every row with the connected component its ``Index`` belongs to.

    An undirected graph is built from the ``Index`` column (nodes) and the
    ``GROUP`` column (for each row, the nodes it is linked to). Connected
    components are found with an iterative depth-first search and numbered
    0, 1, 2, ... in the order in which their first node is encountered.

    Every row becomes a node even when its ``GROUP`` list is empty and no
    other row refers to it, so unmatched rows receive their own singleton
    cluster label instead of a missing value.

    Args:
        df2: DataFrame with an ``Index`` column and a ``GROUP`` column whose
            cells are iterables of ``Index`` values.

    Returns:
        The same ``df2`` object with a new integer ``Cluster_graph`` column.
    """
    graph = defaultdict(set)
    for person_id, linked_ids in zip(df2["Index"], df2["GROUP"]):
        # Looking the key up registers the node even if it has no links.
        links = graph[person_id]
        for other_id in linked_ids:
            links.add(other_id)
            graph[other_id].add(person_id)

    # node -> component number; doubles as the "visited" set of the search.
    cluster_of = {}
    next_label = 0
    for start in list(graph):
        if start in cluster_of:
            continue
        stack = [start]
        while stack:
            current = stack.pop()
            if current in cluster_of:
                continue
            cluster_of[current] = next_label
            stack.extend(
                neighbour
                for neighbour in graph[current]
                if neighbour not in cluster_of
            )
        next_label += 1

    df2["Cluster_graph"] = [cluster_of[node] for node in df2["Index"]]

    return df2


# Typos Dataset

In [None]:
import re
import pandas as pd 

# Load the typos dataset. The cells and functions in this file read the
# columns 'fullname', 'firstname', 'lastname', 'Index' and 'ID' from it.
df = pd.read_csv('typos_dataset.csv')

# Remove special characters
def remove_special_chars(text):
    """Strip every character that is not a word character, whitespace or '.'.

    Args:
        text: the value to clean. Non-string values (for example the NaN that
            pandas uses for a missing name) are returned unchanged instead of
            raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'[^\w\s.]', '', text)

# Normalise the name columns. Full names are stripped of special characters
# and lower-cased; periods survive because the regex in remove_special_chars
# keeps '.', and rule_base_name_matching() tests for "." in the full name.
df['fullname'] = df['fullname'].apply(remove_special_chars).str.lower()

# First and last names additionally lose their periods. Missing values stay
# missing: the .str methods propagate NaN and na_action='ignore' keeps NaN
# away from remove_special_chars, which would otherwise fail inside re.sub.
for column in ('firstname', 'lastname'):
    df[column] = (
        df[column]
        .str.lower()
        .str.replace('.', '', regex=False)
        .map(remove_special_chars, na_action='ignore')
    )

df

In [None]:
# Similarity thresholds used for the typos dataset.
token_ratio, ratio, partial_ratio = 86.3, 82, 97

# Pairwise matching first, then grouping of the matches into clusters.
df_group_name = rule_base_name_matching(
    df,
    token_ratio=token_ratio,
    ratio=ratio,
    partial_ratio=partial_ratio,
)
df_cluster = cluster_by_DFS(df_group_name)

### Pair-wise evaluation result

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
                
# Score the pairwise predictions gathered in y_pred against y_truth.
precision, recall, f1, accuracy = (
    scorer(y_truth, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)

print(f"Precision: {precision:.2f}")
print(f"Recall   : {recall:.2f}")
print(f"F1 Score : {f1:.2f}")
print(f"Accuracy : {accuracy:.2f}")

### Cluster-level evaluation result

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def get_cluster_evaluation(df):
    """Turn cluster labels into one 0/1 prediction per ordered row pair.

    Pairs are produced in the same order as the nested ``i``/``k`` loops of
    ``rule_base_name_matching`` (all ``k != i`` for each ``i``), so the result
    lines up with the ``y_truth`` list that function fills.

    A row whose label is missing (``None`` or NaN) never shares a cluster
    with any other row; in particular two unlabeled rows are not treated as
    belonging together just because their missing labels compare equal.

    Args:
        df: DataFrame with a ``Cluster_graph`` column.

    Returns:
        A list with ``len(df) * (len(df) - 1)`` integers, 1 when both rows of
        the pair carry the same cluster label and 0 otherwise.
    """
    labels = df['Cluster_graph'].values
    unlabeled = pd.isna(labels)  # computed once, reused for every pair

    pair_predictions = []
    for i, label_i in enumerate(labels):
        for k, label_k in enumerate(labels):
            if i == k:
                continue
            same_cluster = not unlabeled[i] and label_i == label_k
            pair_predictions.append(1 if same_cluster else 0)
    return pair_predictions

# Replace the pairwise predictions with the ones implied by the clusters,
# then score them against the same ground truth.
y_pred = get_cluster_evaluation(df_cluster)

precision = precision_score(y_true=y_truth, y_pred=y_pred)
recall = recall_score(y_true=y_truth, y_pred=y_pred)
f1 = f1_score(y_true=y_truth, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_truth, y_pred=y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall   : {recall:.2f}")
print(f"F1 Score : {f1:.2f}")
print(f"Accuracy : {accuracy:.2f}")

# Phonetic Dataset

In [None]:
import pandas as pd 

# Load the phonetic dataset and expose the row index as an 'Index' column,
# which rule_base_name_matching() and cluster_by_DFS() read.
df = pd.read_csv('phonetic_name_matching_dataset.csv')
df = df.assign(Index=df.index)

In [None]:
# Similarity thresholds used for the phonetic dataset.
token_ratio, ratio, partial_ratio = 92, 92, 97

# Readjust the threshold for first and last names
def compare_first_last(f1,f2,l1,l2,n1,n2):
    """Check that full, last and first names are all loosely similar.

    Variant with the thresholds used for the phonetic dataset.

    Args:
        f1, f2: first names of the two records.
        l1, l2: last names of the two records.
        n1, n2: full names of the two records.

    Returns:
        True only when all three comparisons pass, checked in this order:
        full names (ratio >= 60 or token_set_ratio >= 65), last names and
        first names (each: ratio >= 60, token_set_ratio >= 60 or
        partial_ratio >= 81).
    """
    full_names_close = (
        fuzz.ratio(n1, n2) >= 60 or fuzz.token_set_ratio(n1, n2) >= 65
    )
    if not full_names_close:
        return False

    last_names_close = (
        fuzz.ratio(l1, l2) >= 60
        or fuzz.token_set_ratio(l1, l2) >= 60
        or fuzz.partial_ratio(l1, l2) >= 81
    )
    if not last_names_close:
        return False

    return (
        fuzz.ratio(f1, f2) >= 60
        or fuzz.token_set_ratio(f1, f2) >= 60
        or fuzz.partial_ratio(f1, f2) >= 81
    )

# Pairwise matching first, then grouping of the matches into clusters.
df_group_name = rule_base_name_matching(
    df,
    token_ratio=token_ratio,
    ratio=ratio,
    partial_ratio=partial_ratio,
)
df_cluster = cluster_by_DFS(df_group_name)

### Pair-wise evaluation result

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
                
# Score the pairwise predictions gathered in y_pred against y_truth.
precision, recall, f1, accuracy = (
    scorer(y_truth, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)

print(f"Precision: {precision:.2f}")
print(f"Recall   : {recall:.2f}")
print(f"F1 Score : {f1:.2f}")
print(f"Accuracy : {accuracy:.2f}")

### Cluster-level evaluation result

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def get_cluster_evaluation(df):
    """Turn cluster labels into one 0/1 prediction per ordered row pair.

    Pairs are produced in the same order as the nested ``i``/``k`` loops of
    ``rule_base_name_matching`` (all ``k != i`` for each ``i``), so the result
    lines up with the ``y_truth`` list that function fills.

    A row whose label is missing (``None`` or NaN) never shares a cluster
    with any other row; in particular two unlabeled rows are not treated as
    belonging together just because their missing labels compare equal.

    Args:
        df: DataFrame with a ``Cluster_graph`` column.

    Returns:
        A list with ``len(df) * (len(df) - 1)`` integers, 1 when both rows of
        the pair carry the same cluster label and 0 otherwise.
    """
    labels = df['Cluster_graph'].values
    unlabeled = pd.isna(labels)  # computed once, reused for every pair

    pair_predictions = []
    for i, label_i in enumerate(labels):
        for k, label_k in enumerate(labels):
            if i == k:
                continue
            same_cluster = not unlabeled[i] and label_i == label_k
            pair_predictions.append(1 if same_cluster else 0)
    return pair_predictions

# Replace the pairwise predictions with the ones implied by the clusters,
# then score them against the same ground truth.
y_pred = get_cluster_evaluation(df_cluster)

precision = precision_score(y_true=y_truth, y_pred=y_pred)
recall = recall_score(y_true=y_truth, y_pred=y_pred)
f1 = f1_score(y_true=y_truth, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_truth, y_pred=y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall   : {recall:.2f}")
print(f"F1 Score : {f1:.2f}")
print(f"Accuracy : {accuracy:.2f}")