In [91]:
import pandas as pd

dataset = pd.read_csv(".\dataset_100_IBAN_prova.csv",index_col=0)
dataset = dataset.drop("Address",axis=1)

print(dataset.columns)
print(len(dataset))

Index(['BIC', 'AccountNumber', 'CTRYbnk', 'Name', 'IsShared', 'Holder'], dtype='object')
657


In [93]:
from itertools import combinations
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import nltk
import string

abbreviation_variations = [
    ["Group", "Grp", "Gr", "Gp"],
    ["Corporation", "Corp", "Corp", "Corp", "Cpn"],
    ["Holdings", "Hldgs", "Hdg", "Hdgs"],
    ["Enterprises", "Ent", "Ents"],
    ["International","Intl", "Int"],
    ["Global", "Glob", "Glb"],
    ["Solutions", "Solns", "Sols"],
    ["Services", "Svcs", "Sv"],
    ["Technologies", "Tech", "Techs"],
    ["Industries", "Inds", "Ind"],
    ["Partners", "Ptnrs", "Pts"],
    ["Systems","Sys", "Syss"],
    ["Worldwide", "WW"],
    ["Ventures", "Vntrs", "Vnts"],
    ["Brothers", "Bros",  "Br"],
    ["Sons", "Sns", "S"],
    ["Company", "Co", "Cpny", "Comp"],
    ["Associates", "Assocs", "Ass"],

    #"Incorporated": 
    ["Inc", "Incorporated"],
    
    #"Limited": 
    ["LLC", "Ltd", "Limited", "Ltée", "LTD", "Ltd", "Ltda", "LTT"],
    
    #"Private Limited": 
    ["Pvt","Public Limited Company",  "plc", "PLCC",  "Public"],
    
    #"Società Italiane
    ["SpA", "Società per Azioni", "Srl", "SAPA", "SAS", "SC", "SA", "SCARL", "SDF", "SU", "SNC", "Scoop"],
]

# nltk.download('punkt')
tokenizer = RegexpTokenizer(r'\w+')

def remove_company_abbreviations(text):
    init = text.split()
    for lista in abbreviation_variations:
        for el in lista:
            if el.lower() in init:
                init = [elem for elem in init if el.lower() != elem] 

    return " ".join(init)

def remove_punctuation(text):
    """ """
    for punct in string.punctuation:
        text = text.replace(punct,"")
    return text
    # return "".join(tokenizer.tokenize(string))

def preprocess_text(text):
    """ """
    removed = text.lower()
    removed = remove_punctuation(removed)
    removed = remove_company_abbreviations(removed)
    # print("PRIMA:\t" + text + "\t\t\t -> " + removed.upper().strip())
    return removed.upper().strip()



for i in range(len(dataset)):    
    if isinstance(dataset.loc[i, "Name"],str):
        dataset.loc[i, "Name"] = preprocess_text(dataset.loc[i, "Name"])
    
dataset = dataset.drop_duplicates()

In [94]:
import jaro
import numpy as np
from Levenshtein import ratio
from scipy.spatial import distance
from sklearn.cluster import AgglomerativeClustering


def computeAvgDistanceInNames(group, iban, d):
    """ """
    couples = list(combinations(group["Name"], 2))
    distances = []
    
    if d == "edit":
        for couple in couples:distances.append(ratio(couple[0], couple[1]))
    elif d == "jaro":
        for couple in couples:distances.append(jaro.jaro_winkler_metric(couple[0], couple[1]))
    elif d == "hamming":
        for couple in couples:distances.append(distance.hamming(list(couple[0]), list(couple[1])))
    else:
        raise Exception("Distance not supported")
    
    return sum(distances) / len(distances)


def computeMedoidsAsName(iban, names, d, clusters=None):
    """ """
    if len(names) == 1:
        return names[0]
    
    distances = []
    best = np.inf
    best_name = ""
    for name1 in names:
        for name2 in [el for el in names if el != name1]:
            if d == "edit":
                distances.append(ratio(name1, name2))
            elif d == "jaro":
                distances.append(jaro.jaro_winkler_metric(name1, name2))
            elif d == "hamming":
                distances.append(distance.hamming(list(name1), list(name2)))
            else:
                raise Exception("Distance not supported")
        
        actual = sum(distances) / (len(names) - 1)
        if best > actual:
            best = actual
            best_name = name1

    return best_name


def AgglomerativeC(iban, names, d, cluster_threshold):
    """ """

    # Compute the pairwise distance matrix
    n = len(names)
    distance_matrix = np.zeros((n, n))

    if d == "edit":
        for i in range(n):
            for j in range(i + 1, n):
                dist = ratio(names[i], names[j])
                distance_matrix[i, j] = 1 -dist
                distance_matrix[j, i] = 1 -dist

    elif d == "jaro":
        for i in range(n):
            for j in range(i + 1, n):
                dist = jaro.jaro_winkler_metric(names[i], names[j])
                distance_matrix[i, j] = dist
                distance_matrix[j, i] = dist        

    # Perform agglomerative clustering
    clustering = AgglomerativeClustering( n_clusters=None, metric='precomputed', linkage='average', distance_threshold=cluster_threshold)
    clusters = clustering.fit_predict(distance_matrix)
    # Print the resulting clusters
    #for idx, cluster in enumerate(clusters):
    #    print(f"String: {names[idx]}, Cluster: {cluster}")    
    
    return clusters
    

In [95]:
from tqdm import tqdm

d = "jaro"
threshold = 0.37
cluster_threshold = 0.4

dataset_result = dataset.copy(deep=True)
dataset_result["is_shared_polito"] = None
dataset_result["party_name_polito"] = None
dataset_result["cluster_polito"] = None

groupped = dataset.groupby("AccountNumber")
for iban, group in tqdm(groupped):
    if len(group) == 1:
        dataset_result.loc[dataset_result["AccountNumber"] == iban, "is_shared_polito"] = 0
        dataset_result.loc[dataset_result["AccountNumber"] == iban, "party_name_polito"] = group["Name"].tolist()[0]
        dataset_result.loc[dataset_result["AccountNumber"] == iban, "cluster_polito"] = 0
    else:
        avg_distance_acc = computeAvgDistanceInNames(group, iban, d)       
        if avg_distance_acc <= threshold:
            name = computeMedoidsAsName(iban, group['Name'].tolist(), d)

            dataset_result.loc[dataset_result["AccountNumber"] == iban, "is_shared_polito"] = 0
            dataset_result.loc[dataset_result["AccountNumber"] == iban, "party_name_polito"] = name
            dataset_result.loc[dataset_result["AccountNumber"] == iban, "cluster_polito"] = 0
        else:
            name_list = group["Name"].tolist()
            clusters = AgglomerativeC(iban, name_list, d, cluster_threshold)
            res = list(enumerate(clusters))
            res = [(name_list[el[0]], el[1]) for el in res]
            clusters = list(set(clusters))

            dataset_result.loc[dataset_result["AccountNumber"] == iban, "is_shared_polito"] = 1
            for c in clusters:
                nomi = [el[0] for el in res if el[1] == c]
                best_name = computeMedoidsAsName(iban, nomi, d)
                dataset_result.loc[(dataset_result["AccountNumber"] == iban) & (dataset_result["Name"].isin(nomi)), "party_name_polito"] = best_name
                dataset_result.loc[(dataset_result["AccountNumber"] == iban) & (dataset_result["Name"].isin(nomi)), "cluster_polito"] = c


100%|██████████| 100/100 [00:00<00:00, 114.89it/s]


In [96]:
tp = 0
tn = 0
fp = 0
fn = 0
holder_right = 0
cluster_rigth = 0
tot_cluster = 0

for account_number, group in dataset_result.groupby(["AccountNumber"]):
    is_shared = group.iloc[0]["IsShared"]
    polito_result = group.iloc[0]["is_shared_polito"]

    if polito_result and is_shared:
        tp += 1
    elif not polito_result and not is_shared:
        tn += 1
    elif polito_result and not is_shared:
        fp += 1
    elif not polito_result and is_shared:
        fn += 1

    names = group["Name"].tolist()
    holder = group["Holder"].tolist()
    party = group["party_name_polito"].tolist()

    for i in range(len(names)):
        p = party[i]
        index_n = names.index(p)
        real_name = holder[index_n]
        if holder[i] == real_name:
            holder_right +=1

    num_cluster = max(set(group["cluster_polito"])) + 1
    num_real_cluster = len(set(group["Holder"]))
    tot_cluster += num_real_cluster
    if num_cluster == num_real_cluster:
        cluster_rigth +=1

cluster_accuracy = holder_right / len(dataset_result)   
accuracy = (tp+tn)/(tp+tn+fp+fn)
precision =(tp)/(tp+fp)
recall = (tp)/(tp+fn)
fscore = (2*precision*recall)/(precision+recall)

print("accuracy\t" + str(round(accuracy, 2)))
print("precision\t" + str(round(precision, 2)))
print("recall\t\t" + str(round(recall, 2)))
print("f-score\t\t"+ str(round(fscore, 2)))
print("cluster\t\t" + str(cluster_accuracy))

accuracy	0.69
precision	0.59
recall		0.95
f-score		0.73
cluster		0.9670781893004116
