In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import Packages.ClusteringHelper as ch
from Packages.TimeEvolving import DataEvolver
from textdistance import DamerauLevenshtein, Levenshtein
import numpy as np
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from Packages.TimeEvolving import Cluster
from tqdm.notebook import tqdm
import math
import sys
from collections import Counter
import time

In [4]:
# Parse the AIDA-YAGO2 TSV dump with the project helper; it returns a
# (text, data) pair.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
text, data = ch.read_aida_yago_conll(
    "D:\\Sgmon\\Documents\\Magistrale\\TESI\\ClusteringAndLinking\\aida-yago2-dataset\\AIDA-YAGO2-dataset.tsv")

# Flip to True to dump the text returned above into text.txt in the working directory.
save = False
if save:
    # The context manager closes the handle even if write() raises.
    with open('text.txt', 'w') as text_file:
        text_file.write(text)

# Keep only the rows whose 'entities' field is not the empty string; copy so
# later changes do not touch `data`.
ents_data = data[data['entities'] != ''].copy()

In [5]:
# Let the project helper enrich the entity rows using the files in the
# encodings directory (the loop cell later reads an 'encodings' column).
ents_data = ch.add_entities_embedding(
    ents_data,
    "D:\\Sgmon\\Documents\\Magistrale\\TESI\\ClusteringAndLinking\\aida-yago2-dataset\\encodings",
)

# Distinct document identifiers found in the entity rows.
documents = set(ents_data.documents)

# Independent copy of the enriched rows.
ents_data_filtered = ents_data.copy()

In [6]:
# Iterator over the documents; presumably yields them in order, `step`
# documents per iteration -- TODO confirm against Packages.TimeEvolving.
evolving = DataEvolver(documents, ents_data, step=10, randomly=False)

# Accumulators extended by the loop cell: gold entity labels seen so far and
# the clusters carried over between iterations.
gold_entities, total_clusters = [], []

# Iteration counter, incremented at the end of every loop pass.
n = 0

## Run the incremental clustering loop


In [7]:
def _make_mention_metric(mentions, edit_distance, short_len=4, short_penalty=3):
    """Build a distance callable over index rows that point into ``mentions``.

    The returned ``metric(x, y)`` reads a mention index from ``x[0]`` and
    ``y[0]`` and compares the lower-cased mentions with ``edit_distance``.
    When the first mention is shorter than ``short_len`` characters, it
    returns 0 for an exact (case-sensitive) match and otherwise adds
    ``short_penalty`` to the edit distance.

    NOTE(review): only the first mention's length is checked, so the value
    can differ when the two arguments are swapped -- confirm this is intended.
    """
    lowered = [mention.lower() for mention in mentions]

    def metric(x, y):
        i, j = int(x[0]), int(y[0])  # extract indices
        if len(mentions[i]) < short_len:
            if mentions[i] == mentions[j]:
                return 0
            return edit_distance(lowered[i], lowered[j]) + short_penalty
        return edit_distance(lowered[i], lowered[j])

    return metric


# Bound once so that a new textdistance object is not created for every pair.
_levenshtein_distance = Levenshtein().distance
_damerau_levenshtein_distance = DamerauLevenshtein().distance

tic = time.perf_counter()
for iteration in tqdm(evolving, total=math.ceil(len(evolving.documents) / evolving.step)):
    current_mentions = list(evolving.get_current_data().mentions)
    current_encodings = list(evolving.get_current_data()['encodings'].values)
    current_entities = list(evolving.get_current_data()['entities'].values)

    # Same two metrics as before, now built from one shared implementation.
    lev_metric = _make_mention_metric(current_mentions, _levenshtein_distance)
    dam_lev_metric = _make_mention_metric(current_mentions, _damerau_levenshtein_distance)

    # --- Stage 1: group mentions by surface-form edit distance ------------
    # DBSCAN receives one-column rows of indices; the metric maps them back
    # to the mention strings.
    X = np.arange(len(current_mentions)).reshape(-1, 1)
    # min_samples=1 clusters exactly like the former 0 (every point counts
    # itself as a neighbour) and is accepted by scikit-learn releases that
    # validate min_samples >= 1.
    clusterizator1 = DBSCAN(metric=lev_metric, eps=1, min_samples=1, n_jobs=-1)
    cluster_numbers = clusterizator1.fit_predict(X)

    # Bucket entities / mentions / encodings by their stage-1 label.
    cee_dict = {k: {'entities': [], 'mentions': [], 'encodings': [], 'sotto_clusters': None} for k in
                set(cluster_numbers)}
    for i, cluster in enumerate(cluster_numbers):
        cee_dict[cluster]['entities'].append(current_entities[i])
        cee_dict[cluster]['mentions'].append(current_mentions[i])
        cee_dict[cluster]['encodings'].append(current_encodings[i])
    cee_list = cee_dict.values()

    # --- Stage 2: split every surface-form group by encoding similarity ---
    # NOTE(review): `affinity` is deprecated in newer scikit-learn in favour
    # of `metric`; kept so the cell still runs on the version it was written for.
    clusterizator2 = AgglomerativeClustering(n_clusters=None, affinity='cosine', distance_threshold=0.035,
                                             linkage="single")
    for cluster in cee_dict.keys():
        cluster_encodings = cee_dict[cluster]['encodings']
        try:
            cee_dict[cluster]['sotto_clusters'] = clusterizator2.fit_predict(cluster_encodings)
        except ValueError:
            # AgglomerativeClustering refuses a single sample. Fall back to
            # one sub-cluster holding *all* members, so nothing is dropped
            # if the error has another cause on a larger group.
            cee_dict[cluster]['sotto_clusters'] = np.zeros(len(cluster_encodings), dtype=np.int8)

    # Turn every (group, sub-label) pair into a Cluster object, flattened
    # into a single list.
    sottocluster_list = []
    for el in cee_list:
        sotto_cluster = {k: Cluster() for k in set(el['sotto_clusters'])}
        for i, key in enumerate(el['sotto_clusters']):
            sotto_cluster[key].add_element(mention=el['mentions'][i], entity=el['entities'][i],
                                           encodings=el['encodings'][i])
        sottocluster_list.extend(sotto_cluster.values())

    # --- Stage 3: merge the new sub-clusters with those of earlier passes --
    current_clusters = total_clusters + sottocluster_list
    sotto_encodings = [x.encodings_mean for x in current_clusters]
    clusterizator3 = AgglomerativeClustering(n_clusters=None, affinity='cosine', distance_threshold=0.015,
                                             linkage="single")
    cluster_numbers = clusterizator3.fit_predict(sotto_encodings)
    final_clusters = {k: Cluster() for k in set(cluster_numbers)}
    for i, x in enumerate(current_clusters):
        try:
            final_clusters[cluster_numbers[i]] = final_clusters[cluster_numbers[i]] + x
        except Exception as exc:
            # Best effort: report the failed merge and carry on. Catching
            # Exception (not everything) keeps Ctrl-C / kernel interrupt working.
            print("Cluster merge failed:", repr(exc))
            print(cluster_numbers[i], final_clusters[cluster_numbers[i]], x)
    gold_entities = gold_entities + current_entities
    total_clusters = list(final_clusters.values())

    #CEAFm
    # Sum of the values of the alignment returned by the project helper,
    # divided by the number of gold mentions seen so far.
    best_alignment = ch.get_optimal_alignment([x.count_ents for x in total_clusters], set(gold_entities), is_dict=False)
    CEAFm_f1 = sum(best_alignment.values()) / len(gold_entities)
    original_stdout = sys.stdout
    # with open(".\\Results\\step" + str(n) + ".txt", "a") as f:
    print("CEAFm:", CEAFm_f1)
    print("Clusters:")
    print(*total_clusters, sep="\n")
    # print("Gold_standard:")
    # print(Counter(gold_entities))
    n += 1
    # NOTE(review): unconditional break -- only the first batch is processed.
    # Remove it to iterate over the whole collection.
    break
toc = time.perf_counter()
# Elapsed wall-clock seconds for the loop above.
print(toc - tic)


  0%|          | 0/139 [00:00<?, ?it/s]

CEAFm: 0.9096774193548387
Clusters:
Cluster{'<b>Sweden</b>': {'SWEDISH': 1, 'SWEDEN': 1, '#': 2}}; #_elements = 2
Cluster{"<b>People's_Republic_of_China</b>": {'China': 6, 'Chinese': 1, '#': 7}}; #_elements = 7
Cluster{'<b>United_Kingdom</b>': {'British': 6, 'Britain': 4, '#': 10}}; #_elements = 10
Cluster{'<b>Middle_East</b>': {'Middle East': 1, '#': 1}}; #_elements = 1
Cluster{'<b>Russia</b>': {'Moscow': 1, '#': 1}}; #_elements = 1
Cluster{'<b>United_States</b>': {'United States': 1, '#': 1}}; #_elements = 1
Cluster{'<b>Golan_Heights</b>': {'Golan': 2, '#': 2}}; #_elements = 2
Cluster{'<b>Hafez_al-Assad</b>': {'Assad': 1, '#': 1}}; #_elements = 1
Cluster{'<b>Golan_Heights</b>': {'Golan Heights': 1, '#': 1}}; #_elements = 1
Cluster{'<b>Channel_2_(Israel)</b>': {'Channel Two': 1, '#': 1}}; #_elements = 1
Cluster{'<b>Itamar_Rabinovich</b>': {'Rabinovich': 1, '#': 1}}; #_elements = 1
Cluster{'<b>Benjamin_Netanyahu</b>': {'Netanyahu': 2, '#': 2}}; #_elements = 2
Cluster{'<b>Itamar_Rabinov