In [2]:
!pip install fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [3]:
import json
import pandas as pd

from fuzzywuzzy import fuzz



In [4]:
def load_authors(path='papersPreprocessed (2).csv'):
    """Load the distinct author names listed in the papers CSV.

    Every non-missing cell of the ``authorFull`` column is split on ';' and
    each resulting name is trimmed of surrounding whitespace.

    Args:
        path: CSV file to read. Defaults to the file name the notebook was
            originally written against.

    Returns:
        A sorted list of unique, non-empty author names.
    """
    documents = pd.read_csv(path)

    print("Generando autores sin duplicados")
    unique_authors = set()
    # dropna(): a row without authors is parsed as NaN (a float), which has no .split().
    for cell in documents['authorFull'].dropna():
        for author in str(cell).split(';'):
            # Trim so "A; B" and "B" do not produce two different spellings of B,
            # and skip the empty piece left by a trailing ';'.
            author = author.strip()
            if author:
                unique_authors.add(author)

    print("Autores generados")

    # Sorted rather than list(set(...)): set iteration order is not stable
    # between runs, and any order-sensitive consumer would then give
    # different results on each run.
    return sorted(unique_authors)

In [5]:
# Name parts are cleaned, lowercased and sorted alphabetically
def normalize_name(name):
    """Return a case- and order-insensitive canonical form of a name.

    The name is split on whitespace; each part has surrounding spaces, dots
    and commas removed and is lowercased. Parts that end up empty are
    dropped, and the remaining parts are sorted and joined with single spaces.

    Args:
        name: The raw name string.

    Returns:
        The normalized name, e.g. ``"Smith, John"`` -> ``"john smith"``.
    """
    # Lowercase BEFORE sorting: sorting mixed-case strings puts uppercase
    # first, so the token order would otherwise depend on capitalisation.
    tokens = (part.strip(' .,').lower() for part in name.split())
    # Drop tokens that were pure punctuation so they leave no stray spaces.
    return ' '.join(sorted(token for token in tokens if token))

In [6]:
# Groups similar names into clusters using fuzzywuzzy's token_set_ratio score
def cluster_names(names, threshold=90):
    """Greedily group names whose normalized forms are similar.

    Each name is normalized with ``normalize_name`` and compared, in order,
    against the normalized form of the first member of every existing
    cluster. It joins the first cluster whose ``fuzz.token_set_ratio`` score
    is at least ``threshold``; if none qualifies it starts a new cluster.
    The result therefore depends on the order of ``names``.

    Every name is compared against every existing cluster in the worst case,
    so the cost grows with (number of names) x (number of clusters).

    Args:
        names: Iterable of raw name strings.
        threshold: Minimum ``token_set_ratio`` score required to join a
            cluster.

    Returns:
        A list of clusters, each a list of the original (un-normalized) names.
    """
    print("Clusterizando nombres")
    clusters = []
    # representatives[i] is the normalized form of clusters[i][0], kept so that
    # both sides of the comparison are in the same normalized form.
    representatives = []

    for name in names:
        norm_name = normalize_name(name)
        for cluster, representative in zip(clusters, representatives):
            if fuzz.token_set_ratio(representative, norm_name) >= threshold:
                cluster.append(name)
                break
        else:
            # No existing cluster was similar enough: this name starts a new one.
            clusters.append([name])
            representatives.append(norm_name)

    print("Clusterización finalizada")
    return clusters

In [7]:
# Builds the dictionary of disambiguated names, keyed by the shortest name of each cluster
def create_disambiguated_dict(names_clusters):
    """Map each cluster's shortest name to the full cluster.

    Args:
        names_clusters: Iterable of non-empty lists of names.

    Returns:
        A dict whose keys are the shortest name of each cluster (the first
        one encountered on ties) and whose values are the cluster lists
        themselves.
    """
    print("Generando diccionario")
    by_representative = {
        sorted(members, key=len)[0]: members
        for members in names_clusters
    }
    print("Diccionario generado")
    return by_representative

In [8]:
def generate_file(authors_dict, path='authors_test.txt'):
    """Write the authors dictionary to a file as JSON.

    Args:
        authors_dict: JSON-serialisable dictionary to write.
        path: Destination file. Defaults to the file name the notebook
            originally used; an existing file is overwritten.
    """
    # Explicit encoding so the output does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(authors_dict, file)

In [None]:
# Load the de-duplicated author names from the papers CSV.
authors = load_authors()

print("Iniciando desambiguación")
# Group similar names using cluster_names' default threshold.
names_clusters = cluster_names(authors)
# Show how many clusters were produced.
print(len(names_clusters))
disambiguated_dict = create_disambiguated_dict(names_clusters)
print("Autores desambiguados")

# Write the resulting mapping to disk as JSON.
generate_file(disambiguated_dict)

Generando autores sin duplicados
Autores generados
Iniciando desambiguación
Clusterizando nombres
