In [24]:
import pandas as pd
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import Packages.ClusteringHelper as ch
from tqdm import tqdm
from textdistance import DamerauLevenshtein, Levenshtein
from collections import Counter

In [26]:
text, data = ch.read_aida_yago_conll(
    "D:\\Sgmon\\Documents\\Magistrale\\TESI\\ClusteringAndLinking\\aida-yago2-dataset\\AIDA-YAGO2-dataset.tsv")
save = False
if save:
    text_file = open('text.txt', 'w')
    text_file.write(text)
    text_file.close()
data

Unnamed: 0,documents,tokens,indexes,word_indexes,mentions,entities,wikidatas,numeric_codes,alpha_codes
0,0,EU,"(0, 2)",0,,,,,
1,0,rejects,"(3, 10)",1,,,,,
2,0,German,"(11, 17)",2,German,Germany,http://en.wikipedia.org/wiki/Germany,11867,/m/0345h
3,0,call,"(18, 22)",3,,,,,
4,0,to,"(23, 25)",4,,,,,
...,...,...,...,...,...,...,...,...,...
285584,1392,younger,"(1342, 1349)",265,,,,,
285585,1392,brother,"(1350, 1357)",266,,,,,
285586,1392,",","(1358, 1359)",267,,,,,
285587,1392,Bobby,"(1360, 1365)",268,Bobby,Bobby_Charlton,http://en.wikipedia.org/wiki/Bobby_Charlton,4224,/m/01c8x


In [27]:
data = ch.filter_data(data, 3)
data

Unnamed: 0,documents,tokens,indexes,word_indexes,mentions,entities,wikidatas,numeric_codes,alpha_codes
2,0,German,"(11, 17)",2,German,Germany,http://en.wikipedia.org/wiki/Germany,11867,/m/0345h
6,0,British,"(34, 41)",6,British,United_Kingdom,http://en.wikipedia.org/wiki/United_Kingdom,31717,/m/07ssc
10,0,BRUSSELS,"(55, 63)",11,BRUSSELS,Brussels,http://en.wikipedia.org/wiki/Brussels,3708,/m/0177z
13,0,European Commission,"(79, 87)",14,European Commission,European_Commission,http://en.wikipedia.org/wiki/European_Commission,9974,/m/02q9k
20,0,German,"(123, 129)",22,German,Germany,http://en.wikipedia.org/wiki/Germany,11867,/m/0345h
...,...,...,...,...,...,...,...,...,...
285535,1392,Germany,"(1087, 1094)",213,Germany,Germany,http://en.wikipedia.org/wiki/Germany,11867,/m/0345h
285539,1392,Irish,"(1107, 1112)",217,Irish,Republic_of_Ireland_national_football_team,http://en.wikipedia.org/wiki/Republic_of_Irela...,578510,/m/02ryyk
285543,1392,England,"(1133, 1140)",221,England,England_national_football_team,http://en.wikipedia.org/wiki/England_national_...,9904,/m/02pp1
285554,1392,Leeds United,"(1194, 1199)",232,Leeds United,Leeds_United_A.F.C.,http://en.wikipedia.org/wiki/Leeds_United_A.F.C.,7609513,/m/01xn7x1


### General Info

In [28]:
n_entities = sum([x is not '' for x in list(data['entities'])])
n_ass_ents = sum([x is not '' for x in list(data['numeric_codes'])])
# n_tokens = sum([1 for x in list(data['entities'])])
n_tokens = sum([len(x.split()) for x in text])


In [29]:
print('{0:<35} {1:>10} '.format("Numero totale di entità:", n_entities))
print('{0:<35} {1:>10} '.format("Numero totale di tokens:", n_tokens))
print('{0:<35} {1:>10} '.format("1 entità ogni:", round(n_tokens / n_entities, 2)))

Numero totale di entità:                 23267 
Numero totale di tokens:                301418 
1 entità ogni:                           12.95 


### Gold standard

In [30]:
golden_standard_dict = ch.get_gold_standard_dict(data)

In [31]:
ents_data = data[data['entities'] != '']
golden_standard_entities = ents_data['entities'].values
mentions = ents_data['mentions'].values
mentions = [x.lower() for x in mentions]


### Clustering by Levenshtein distance and DBSCAN

In [32]:
# Way with dbscan algorithm
import numpy as np
from sklearn.cluster import dbscan

clustering = False
if clustering:
    def damerau_lev_metric(x, y):
        i, j = int(x[0]), int(y[0])  # extract indices
        if len(mentions[i]) < 4:
            if mentions[i] == mentions[j]:
                return 0
            else:
                return DamerauLevenshtein().distance(mentions[i].lower(), mentions[j].lower()) + 3
        else:
            return DamerauLevenshtein().distance(mentions[i].lower(), mentions[j].lower())

    def lev_metric(x, y):
        i, j = int(x[0]), int(y[0])  # extract indices
        if len(mentions[i]) < 4:
            if mentions[i] == mentions[j]:
                return 0
            else:
                return Levenshtein().distance(mentions[i].lower(), mentions[j].lower()) + 3
        else:
            return Levenshtein().distance(mentions[i].lower(), mentions[j].lower())


    X = np.arange(len(mentions)).reshape(-1, 1)
    _, leven_cluster = dbscan(X, metric=lev_metric, eps=1, min_samples=0, n_jobs=-1)
    np.savetxt('db_cluster_levestein_0_3.txt', leven_cluster, delimiter=',')
else:
    leven_cluster = np.loadtxt("../aida-yago2-dataset/db_cluster_levestein_0_3.txt", dtype=np.int32)

In [33]:
# Now we create a dict for each cluster that contains entities and entities count
lev_cluster_dict = {}
for i, x in enumerate(leven_cluster):
    try:
        lev_cluster_dict[x].append((mentions[i],golden_standard_entities[i]))
    except:
        lev_cluster_dict[x] = [(mentions[i],golden_standard_entities[i])]
# """se uso un cluster diverso da minimo 0 allora scommento"""
# del lev_cluster_dict[-1]

In [34]:
for key in tqdm(lev_cluster_dict):
    cluster_list = [x[1] for x in lev_cluster_dict[key]]
    cluster_dict = Counter(cluster_list)
    lev_cluster_dict[key] = cluster_dict

100%|██████████| 2750/2750 [00:00<00:00, 229178.72it/s]


### evaluation levestein

optimal alignment

In [35]:
max_lev_cluster_dict = ch.get_optimal_alignment(lev_cluster_dict, set(golden_standard_entities))

CEAFm precision

In [36]:
# CEAFm_levenshtein_precision
CEAFm_levenshtein_precision = sum([x for x in max_lev_cluster_dict.values()]) / sum(
    [y for x in lev_cluster_dict.values() for y in x.values()])
CEAFm_levenshtein_precision

0.7875102075901491

CEAFm recall

In [37]:
# CEAFm_levenshtein_recall
CEAFm_levenshtein_recall = sum([x for x in max_lev_cluster_dict.values()]) / ents_data.shape[0]
CEAFm_levenshtein_recall

0.7875102075901491

CEAFm Fscore

In [38]:
# CEAFm_levenshtein_f1
CEAFm_levenshtein_f1 = (2 * (CEAFm_levenshtein_recall * CEAFm_levenshtein_precision)) / (
        CEAFm_levenshtein_precision + CEAFm_levenshtein_recall)
CEAFm_levenshtein_f1

0.7875102075901491

### B-CUBED

In [39]:
# B-cubed - precision
bcubed_precision_num = 0
for gold_key in tqdm(golden_standard_dict.keys()):
    for lev_key in lev_cluster_dict.keys():
        try:
            bcubed_precision_num = bcubed_precision_num + (pow(lev_cluster_dict[lev_key][gold_key], 2) /
                                       sum([x for x in lev_cluster_dict[lev_key].values()]))
        except:
            pass
bcubed_precision = bcubed_precision_num/sum([y for x in lev_cluster_dict.values() for y in x.values()])
bcubed_precision

100%|██████████| 2089/2089 [00:06<00:00, 322.31it/s]


0.881592346067853

In [40]:
# B-cubed - recall
bcubed_recall_num = 0
for gold_key in tqdm(golden_standard_dict.keys()):
    for lev_key in lev_cluster_dict.keys():
        try:
            bcubed_recall_num = bcubed_recall_num + (pow(lev_cluster_dict[lev_key][gold_key], 2) /
                                       golden_standard_dict[gold_key])
        except:
            pass
bcubed_recall = bcubed_recall_num/ents_data.shape[0]
bcubed_recall

100%|██████████| 2089/2089 [00:04<00:00, 511.50it/s]


0.7904009008331572

In [41]:
bcubed_f1 = (2 * (bcubed_recall * bcubed_precision)) / (
        bcubed_precision + bcubed_recall)
bcubed_f1

0.8335098072808208