# Typos Dataset

In [None]:
import pandas as pd 
import re

# Load the typos dataset; the cleaning steps further down read its
# 'fullname', 'firstname' and 'lastname' columns.
df = pd.read_csv('typos_dataset.csv')

# Remove special characters
def remove_special_chars(text):
    """Strip every character that is not a word character, whitespace or '.'.

    Args:
        text: The string to clean. Non-string values (for example the NaN
            that pandas uses for missing cells) are returned unchanged
            instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    # Missing cells reach this function as float NaN when it is applied to a
    # DataFrame column; there is nothing to clean, so hand them back as-is.
    if not isinstance(text, str):
        return text
    return re.sub(r'[^\w\s.]', '', text)

# Normalise the name columns: drop special characters and lower-case.
# `na_action='ignore'` and the `.str` accessor both leave missing cells (NaN)
# untouched, so a row without a name no longer makes the regex helper raise.
df['fullname'] = (
    df['fullname']
    .map(remove_special_chars, na_action='ignore')
    .str.lower()
)

# First and last names get the same treatment, plus removal of '.' (which
# remove_special_chars deliberately keeps).
for column in ('firstname', 'lastname'):
    cleaned = df[column].str.lower()
    cleaned = cleaned.str.replace('.', '', regex=False)
    df[column] = cleaned.map(remove_special_chars, na_action='ignore')

# Phonetic Dataset

In [None]:
import pandas as pd 

# Load the phonetic name-matching dataset. This rebinds `df`, so a frame
# loaded under that name by an earlier cell is no longer reachable through it.
df = pd.read_csv('phonetic_name_matching_dataset.csv')

# Double Metaphone

In [None]:
from metaphone import doublemetaphone

# doublemetaphone returns a (primary, secondary) pair, so encode each name
# once and reuse the pair instead of running the encoder twice per row.
code_pairs = [doublemetaphone(str(name)) for name in df["fullname"]]
df["phonetic_code_1"] = [pair[0] for pair in code_pairs]
df["phonetic_code_2"] = [pair[1] for pair in code_pairs]

# Combine primary and secondary phonetic codes for clustering
# (filter(None, ...) skips an empty code so it adds nothing to the key).
df["phonetic_code"] = ["".join(filter(None, pair[:2])) for pair in code_pairs]

# Number the distinct combined codes in order of first appearance and use
# that number as the cluster label of every row sharing the code.
unique_phonetic_codes = df["phonetic_code"].unique()
phonetic_code_to_num = {code: idx for idx, code in enumerate(unique_phonetic_codes)}
df["Cluster_graph"] = df["phonetic_code"].map(phonetic_code_to_num)

# Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def get_cluster_evaluation(df):
    """Build pairwise labels for scoring a clustering against known IDs.

    Every ordered pair of distinct rows ``(i, k)`` contributes one entry to
    each returned list, in row-major order (``i`` outer, ``k`` inner).

    Args:
        df: DataFrame with a 'Cluster_graph' column (predicted cluster label)
            and an 'ID' column (reference identity).

    Returns:
        A ``(y_pred, y_truth)`` tuple of equally long lists of 0/1 ints:
        ``y_pred`` is 1 where both rows share a 'Cluster_graph' value,
        ``y_truth`` is 1 where both rows share an 'ID' value. Both lists have
        ``n * (n - 1)`` entries for ``n`` rows (empty for 0 or 1 rows).
    """
    import numpy as np  # local import so the function works on its own

    groups = np.asarray(df['Cluster_graph'])
    ids = np.asarray(df['ID'])

    # True everywhere except on the diagonal, i.e. for each pair with i != k.
    distinct_rows = ~np.eye(len(df), dtype=bool)

    # Broadcasting compares every row against every other row in one step;
    # the mask then flattens the off-diagonal cells in row-major order, which
    # is the order a nested i/k loop would visit them.
    same_cluster = groups[:, None] == groups[None, :]
    same_id = ids[:, None] == ids[None, :]

    y_pred = same_cluster[distinct_rows].astype(int).tolist()
    y_truth = same_id[distinct_rows].astype(int).tolist()
    return y_pred, y_truth

# Pairwise labels: predicted same-cluster vs. reference same-ID.
y_pred, y_truth = get_cluster_evaluation(df)

# Score the pairwise predictions; every metric takes (truth, prediction).
precision, recall, f1, accuracy = (
    metric(y_truth, y_pred)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

# Report each score rounded to two decimals.
print(f"Precision: {precision:.2f}")
print(f"Recall   : {recall:.2f}")
print(f"F1 Score : {f1:.2f}")
print(f"Accuracy : {accuracy:.2f}")