# Typos Dataset

In [None]:
import pandas as pd 
import re

# Load the typos dataset. Later cells read its 'fullname', 'firstname',
# 'lastname', 'ID' and 'Index' columns.
df = pd.read_csv('typos_dataset.csv')

# Remove special characters
def remove_special_chars(text):
    """Strip every character that is not a word character, whitespace or '.'.

    Non-string values (for example the NaN pandas stores for a missing cell)
    are returned unchanged instead of raising ``TypeError`` inside ``re.sub``.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'[^\w\s.]', '', text)

# Normalise the three name columns. Missing values (NaN) pass through
# untouched: the `.str` accessors skip them and `map(..., na_action='ignore')`
# never hands them to `remove_special_chars`.
# 'fullname' keeps its dots; 'firstname' and 'lastname' have them removed.
df['fullname'] = df['fullname'].map(remove_special_chars, na_action='ignore').str.lower()
for name_col in ('firstname', 'lastname'):
    df[name_col] = (
        df[name_col]
        .str.lower()
        .str.replace('.', '', regex=False)
        .map(remove_special_chars, na_action='ignore')
    )

# Fixed (hard-coded) list of 152 IDs used to build the training subset
# IDs whose rows make up the training subset (152 entries, order preserved).
training_ids = [
    17, 713, 703, 30, 45, 51, 52, 55, 70, 71,
    85, 95, 97, 128, 134, 137, 144, 148, 163, 185,
    189, 192, 196, 198, 200, 221, 228, 229, 242, 253,
    254, 257, 258, 269, 271, 295, 316, 337, 338, 341,
    352, 362, 363, 364, 369, 373, 385, 388, 400, 405,
    410, 411, 413, 420, 428, 439, 443, 453, 457, 478,
    482, 498, 500, 518, 523, 530, 531, 537, 541, 557,
    563, 564, 579, 580, 584, 592, 606, 609, 641, 659,
    662, 664, 668, 675, 677, 679, 683, 690, 694, 707,
    710, 716, 720, 730, 734, 742, 744, 746, 747, 748,
    759, 765, 768, 769, 774, 783, 784, 787, 788, 810,
    812, 815, 817, 820, 825, 833, 836, 838, 862, 863,
    879, 884, 900, 901, 912, 918, 926, 929, 940, 943,
    949, 951, 961, 962, 974, 980, 996, 1011, 1016, 1017,
    1018, 1034, 1037, 1043, 1044, 1053, 1058, 1065, 1081, 1090,
    1093, 1097,
]

# Keep only the rows whose ID appears in training_ids
is_training_row = df['ID'].isin(training_ids)
subset_df = df[is_training_row]
subset_df

### Features selection

In [None]:
from rapidfuzz import fuzz
import jellyfish
import Levenshtein

pairs = []

fullname_pairs = []
firstname_pairs = []
lastname_pairs = []
labels = []

# Read each column once as an array; doing `.iloc` lookups inside the
# O(n^2) loop below is needlessly slow.
sub_fullnames = subset_df['fullname'].values
sub_firstnames = subset_df['firstname'].values
sub_lastnames = subset_df['lastname'].values
sub_ids = subset_df['ID'].values
n_sub = len(subset_df)

# Every ordered pair (i, k) with i != k is emitted, so both (a, b) and
# (b, a) appear. A pair is labelled 1 when both rows share the same ID.
for i in range(n_sub):
    for k in range(n_sub):
        if i == k:
            continue
        fullname_pairs.append((sub_fullnames[i], sub_fullnames[k]))
        firstname_pairs.append((sub_firstnames[i], sub_firstnames[k]))
        lastname_pairs.append((sub_lastnames[i], sub_lastnames[k]))
        labels.append(1 if sub_ids[i] == sub_ids[k] else 0)
                
def check_first_letters(str1, str2):
    """Tell whether the word initials of two strings are compatible.

    Each string is split on whitespace and the lower-cased first character of
    every word is collected into a set. The result is True when one set
    contains the other, which also covers the case where they are equal.
    """
    initials1 = {token[0].lower() for token in str1.split()}
    initials2 = {token[0].lower() for token in str2.split()}
    return initials1 <= initials2 or initials2 <= initials1

# One feature row per training pair. Columns are grouped fullname_*,
# lastname_*, firstname_* and, within each group, ordered ratio,
# partial_ratio, token_sort_ratio, token_set_ratio, levenshtein, jaro_winkler.
features = []
for full_pair, last_pair, first_pair in zip(fullname_pairs, lastname_pairs, firstname_pairs):
    row = {}
    for prefix, (left, right) in (
        ('fullname', full_pair),
        ('lastname', last_pair),
        ('firstname', first_pair),
    ):
        row[f'{prefix}_ratio'] = fuzz.ratio(left, right)
        row[f'{prefix}_partial_ratio'] = fuzz.partial_ratio(left, right)
        row[f'{prefix}_token_sort_ratio'] = fuzz.token_sort_ratio(left, right)
        row[f'{prefix}_token_set_ratio'] = fuzz.token_set_ratio(left, right)
        row[f'{prefix}_levenshtein'] = Levenshtein.distance(left, right)
        row[f'{prefix}_jaro_winkler'] = jellyfish.jaro_winkler_similarity(left, right)
    features.append(row)

# Feature matrix and matching 0/1 label vector
X = pd.DataFrame(features)
y = pd.Series(labels)


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the labelled pairs for a quick sanity check.
# NOTE(review): X contains both (a, b) and (b, a) for every pair, so mirrored
# pairs can fall on opposite sides of this split - confirm whether the report
# below is meant as an unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Random forest with a fixed seed.
# Alternative model kept for reference:
# xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out pairs and print per-class metrics
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Accumulators filled by the all-rows pairing loop further down this cell
new_pairs_fullname, new_pairs_firstname, new_pairs_lastname = [], [], []
new_labels, indexes = [], []

def check_first_letters(str1, str2):
    """Return True when the initials of one string are contained in the other's.

    The initials of a string are the lower-cased first characters of its
    whitespace-separated words, compared as sets (order and repeats ignored).
    """
    def initials_of(text):
        return {word[0].lower() for word in text.split()}

    first, second = initials_of(str1), initials_of(str2)
    return first.issubset(second) or second.issubset(first)

# Column arrays for the whole dataset
fullnames = df['fullname'].values
firstnames = df['firstname'].values
lastnames = df['lastname'].values
ids = df['ID'].values
index = df['Index'].values
n_rows = len(df)

# Every ordered pair (i, k) with i != k. The 'Index' values of both rows are
# recorded so a prediction can be traced back to its rows; the label is 1
# when both rows share the same ID.
for i in range(n_rows):
    for k in range(n_rows):
        if i == k:
            continue
        new_pairs_fullname.append((fullnames[i], fullnames[k]))
        new_pairs_firstname.append((firstnames[i], firstnames[k]))
        new_pairs_lastname.append((lastnames[i], lastnames[k]))
        indexes.append((index[i], index[k]))
        new_labels.append(1 if ids[i] == ids[k] else 0)

### Pair-wise evaluation

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# One feature row per all-rows pair. Columns are grouped fullname_*,
# lastname_*, firstname_* and, within each group, ordered ratio,
# partial_ratio, token_sort_ratio, token_set_ratio, levenshtein, jaro_winkler.
feature_rows = []
for full_pair, last_pair, first_pair in zip(new_pairs_fullname, new_pairs_lastname, new_pairs_firstname):
    row = {}
    for prefix, (left, right) in (
        ('fullname', full_pair),
        ('lastname', last_pair),
        ('firstname', first_pair),
    ):
        row[f'{prefix}_ratio'] = fuzz.ratio(left, right)
        row[f'{prefix}_partial_ratio'] = fuzz.partial_ratio(left, right)
        row[f'{prefix}_token_sort_ratio'] = fuzz.token_sort_ratio(left, right)
        row[f'{prefix}_token_set_ratio'] = fuzz.token_set_ratio(left, right)
        row[f'{prefix}_levenshtein'] = Levenshtein.distance(left, right)
        row[f'{prefix}_jaro_winkler'] = jellyfish.jaro_winkler_similarity(left, right)
    feature_rows.append(row)

new_features = pd.DataFrame(feature_rows)

# Probability of the positive class (column 1) for every candidate pair
predictions = model.predict_proba(new_features)[:, 1]
best_f1 = 0
best_threshold = None

# Try cut-offs 0.50 .. 0.90 in 0.01 steps and remember the best F1.
# Each printed row is: threshold (percent) : precision | recall | f1
for i in range(50, 91):
    cutoff = i / 100
    prediction = [int(pred > cutoff) for pred in predictions]
    precision, recall, f1, _ = precision_recall_fscore_support(
        new_labels, prediction, average='binary'
    )

    print(f"{i} : {precision:9.2f} | {recall:6.2f} | {f1:4.2f}")
    if f1 > best_f1:
        best_f1, best_threshold = f1, i

print(f"\nBest threshold : {best_threshold} with F1 score: {best_f1:.2f}")

### Cluster-level evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def get_cluster_evaluation(df):
    """Turn cluster assignments into pairwise predicted / true labels.

    For every ordered pair of distinct rows (i, k), the prediction is 1 when
    both rows have the same 'Cluster_graph' value and the truth is 1 when
    both rows have the same 'ID'.

    NOTE(review): nothing in the visible notebook creates 'Cluster_graph';
    presumably it is produced elsewhere - confirm before running.

    Returns:
        (y_pred, y_truth): two lists of 0/1 ints of equal length, in
        row-major (i, k) order.
    """
    clusters = df['Cluster_graph'].values
    true_ids = df['ID'].values
    n_rows = len(df)
    ordered_pairs = [(a, b) for a in range(n_rows) for b in range(n_rows) if a != b]
    y_pred = [1 if clusters[a] == clusters[b] else 0 for a, b in ordered_pairs]
    y_truth = [1 if true_ids[a] == true_ids[b] else 0 for a, b in ordered_pairs]
    return y_pred, y_truth

# For each row, collect the 'Index' values of the rows predicted to match it.
# NOTE(review): the first element of each `indexes` entry is used as a list
# position, which assumes 'Index' values are 0-based row positions - confirm.
# NOTE(review): a fixed 0.50 cut-off is applied here, not `best_threshold`.
group = [[] for _ in range(len(df))]
prediction = [int(pred > 50/100) for pred in predictions]
for pos in range(len(new_labels)):
    if prediction[pos] != 1:
        continue
    source_idx, matched_idx = indexes[pos][0], indexes[pos][1]
    group[source_idx].append(matched_idx)
df['GROUP'] = group


# Pairwise labels derived from the cluster assignment vs. the true IDs
y_pred, y_truth = get_cluster_evaluation(df)

precision = precision_score(y_truth, y_pred)
recall = recall_score(y_truth, y_pred)
f1 = f1_score(y_truth, y_pred)
accuracy = accuracy_score(y_truth, y_pred)

# Labels are padded so the colons line up in the printed report
for metric_label, metric_value in (
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
    ("Accuracy ", accuracy),
):
    print(f"{metric_label}: {metric_value:.2f}")

# Phonetic Dataset

In [None]:
import pandas as pd 

# Load the phonetic dataset; this rebinds `df`, replacing the typos data.
# Later cells read its 'fullname', 'firstname', 'lastname', 'ID' and 'Index'
# columns.
df = pd.read_csv('phonetic_name_matching_dataset.csv')

In [None]:
# IDs whose rows form the training subset for the phonetic model
training_ids = [2, 6, 9, 12, 14]
in_training = df['ID'].isin(training_ids)
subset_df = df[in_training]
subset_df

### Features Selection

In [None]:
from rapidfuzz import fuzz
import jellyfish
import Levenshtein
import os

pairs = []

fullname_pairs = []
firstname_pairs = []
lastname_pairs = []
labels = []

# Read each column once as an array; doing `.iloc` lookups inside the
# O(n^2) loop below is needlessly slow.
sub_fullnames = subset_df['fullname'].values
sub_firstnames = subset_df['firstname'].values
sub_lastnames = subset_df['lastname'].values
sub_ids = subset_df['ID'].values
n_sub = len(subset_df)

# Every ordered pair (i, k) with i != k is emitted, so both (a, b) and
# (b, a) appear. A pair is labelled 1 when both rows share the same ID.
for i in range(n_sub):
    for k in range(n_sub):
        if i == k:
            continue
        fullname_pairs.append((sub_fullnames[i], sub_fullnames[k]))
        firstname_pairs.append((sub_firstnames[i], sub_firstnames[k]))
        lastname_pairs.append((sub_lastnames[i], sub_lastnames[k]))
        labels.append(1 if sub_ids[i] == sub_ids[k] else 0)
                
def check_first_letters(str1, str2):
    """Tell whether the word initials of two strings are compatible.

    Each string is split on whitespace and the lower-cased first character of
    every word is collected into a set. The result is True when one set
    contains the other, which also covers the case where they are equal.
    """
    initials1 = {token[0].lower() for token in str1.split()}
    initials2 = {token[0].lower() for token in str2.split()}
    return initials1 <= initials2 or initials2 <= initials1

# One feature row per training pair:
#   fdist / ldist        Jaro-Winkler similarity of first / last names
#   exact                1 when both first and last names are identical
#   f_start .. l_end     1 when both names are non-empty and share their
#                        first / last character
#   fsoundex / lsoundex  1 when the Soundex codes are equal
features = []
for (first_a, first_b), (last_a, last_b) in zip(firstname_pairs, lastname_pairs):
    both_first = bool(first_a) and bool(first_b)
    both_last = bool(last_a) and bool(last_b)
    features.append({
        'fdist': jellyfish.jaro_winkler_similarity(first_a, first_b),
        'ldist': jellyfish.jaro_winkler_similarity(last_a, last_b),
        'exact': int(first_a == first_b and last_a == last_b),
        'f_start': int(both_first and first_a[0] == first_b[0]),
        'f_end': int(both_first and first_a[-1] == first_b[-1]),
        'l_start': int(both_last and last_a[0] == last_b[0]),
        'l_end': int(both_last and last_a[-1] == last_b[-1]),
        'fsoundex': int(jellyfish.soundex(first_a) == jellyfish.soundex(first_b)),
        'lsoundex': int(jellyfish.soundex(last_a) == jellyfish.soundex(last_b)),
    })

# Feature matrix and matching 0/1 label vector
X = pd.DataFrame(features)
y = pd.Series(labels)


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the labelled pairs for a quick sanity check.
# NOTE(review): X contains both (a, b) and (b, a) for every pair, so mirrored
# pairs can fall on opposite sides of this split - confirm whether the report
# below is meant as an unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Random forest with a fixed seed.
# Alternative model kept for reference:
# xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out pairs and print per-class metrics
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Accumulators filled by the all-rows pairing loop further down this cell
new_pairs_fullname, new_pairs_firstname, new_pairs_lastname = [], [], []
new_labels, indexes = [], []

def check_first_letters(str1, str2):
    """Return True when the initials of one string are contained in the other's.

    The initials of a string are the lower-cased first characters of its
    whitespace-separated words, compared as sets (order and repeats ignored).
    """
    def initials_of(text):
        return {word[0].lower() for word in text.split()}

    first, second = initials_of(str1), initials_of(str2)
    return first.issubset(second) or second.issubset(first)

# Column arrays for the whole dataset
fullnames = df['fullname'].values
firstnames = df['firstname'].values
lastnames = df['lastname'].values
ids = df['ID'].values
index = df['Index'].values
n_rows = len(df)

# Every ordered pair (i, k) with i != k. The 'Index' values of both rows are
# recorded so a prediction can be traced back to its rows; the label is 1
# when both rows share the same ID.
for i in range(n_rows):
    for k in range(n_rows):
        if i == k:
            continue
        new_pairs_fullname.append((fullnames[i], fullnames[k]))
        new_pairs_firstname.append((firstnames[i], firstnames[k]))
        new_pairs_lastname.append((lastnames[i], lastnames[k]))
        indexes.append((index[i], index[k]))
        new_labels.append(1 if ids[i] == ids[k] else 0)

### Pair-wise evaluation

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Evaluation features must come from the all-rows pair lists (new_pairs_*),
# so that row j lines up with new_labels[j] and indexes[j]. Indexing the
# training-subset lists (firstname_pairs / lastname_pairs) here would yield
# misaligned rows or run past the end of those shorter lists.
# Column names and order match the training matrix.
new_features = pd.DataFrame([{
    'fdist': jellyfish.jaro_winkler_similarity(first_a, first_b),
    'ldist': jellyfish.jaro_winkler_similarity(last_a, last_b),

    'exact': int(first_a == first_b and last_a == last_b),

    'f_start': int(bool(first_a) and bool(first_b) and first_a[0] == first_b[0]),
    'f_end': int(bool(first_a) and bool(first_b) and first_a[-1] == first_b[-1]),
    'l_start': int(bool(last_a) and bool(last_b) and last_a[0] == last_b[0]),
    'l_end': int(bool(last_a) and bool(last_b) and last_a[-1] == last_b[-1]),

    'fsoundex': int(jellyfish.soundex(first_a) == jellyfish.soundex(first_b)),
    'lsoundex': int(jellyfish.soundex(last_a) == jellyfish.soundex(last_b)),
} for (first_a, first_b), (last_a, last_b) in zip(new_pairs_firstname, new_pairs_lastname)])

# Probability of the positive class (column 1) for every candidate pair
predictions = model.predict_proba(new_features)[:, 1]
best_f1 = 0
best_threshold = None

# Try cut-offs 0.50 .. 0.90 in 0.01 steps and remember the best F1.
# Each printed row is: threshold (percent) : precision | recall | f1
for i in range(50, 91):
    cutoff = i / 100
    prediction = [int(pred > cutoff) for pred in predictions]
    precision, recall, f1, _ = precision_recall_fscore_support(
        new_labels, prediction, average='binary'
    )

    print(f"{i} : {precision:9.2f} | {recall:6.2f} | {f1:4.2f}")
    if f1 > best_f1:
        best_f1, best_threshold = f1, i

print(f"\nBest threshold : {best_threshold} with F1 score: {best_f1:.2f}")

### Cluster-level evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def get_cluster_evaluation(df):
    """Turn cluster assignments into pairwise predicted / true labels.

    For every ordered pair of distinct rows (i, k), the prediction is 1 when
    both rows have the same 'Cluster_graph' value and the truth is 1 when
    both rows have the same 'ID'.

    NOTE(review): nothing in the visible notebook creates 'Cluster_graph';
    presumably it is produced elsewhere - confirm before running.

    Returns:
        (y_pred, y_truth): two lists of 0/1 ints of equal length, in
        row-major (i, k) order.
    """
    clusters = df['Cluster_graph'].values
    true_ids = df['ID'].values
    n_rows = len(df)
    ordered_pairs = [(a, b) for a in range(n_rows) for b in range(n_rows) if a != b]
    y_pred = [1 if clusters[a] == clusters[b] else 0 for a, b in ordered_pairs]
    y_truth = [1 if true_ids[a] == true_ids[b] else 0 for a, b in ordered_pairs]
    return y_pred, y_truth

# For each row, collect the 'Index' values of the rows predicted to match it.
# NOTE(review): the first element of each `indexes` entry is used as a list
# position, which assumes 'Index' values are 0-based row positions - confirm.
# NOTE(review): a fixed 0.50 cut-off is applied here, not `best_threshold`.
group = [[] for _ in range(len(df))]
prediction = [int(pred > 50/100) for pred in predictions]
for pos in range(len(new_labels)):
    if prediction[pos] != 1:
        continue
    source_idx, matched_idx = indexes[pos][0], indexes[pos][1]
    group[source_idx].append(matched_idx)
df['GROUP'] = group


# Pairwise labels derived from the cluster assignment vs. the true IDs
y_pred, y_truth = get_cluster_evaluation(df)

precision = precision_score(y_truth, y_pred)
recall = recall_score(y_truth, y_pred)
f1 = f1_score(y_truth, y_pred)
accuracy = accuracy_score(y_truth, y_pred)

# Labels are padded so the colons line up in the printed report
for metric_label, metric_value in (
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
    ("Accuracy ", accuracy),
):
    print(f"{metric_label}: {metric_value:.2f}")