In [None]:
#Creates a text file with the criteria considered for author disambiguation
import pandas as pd
from unidecode import unidecode

def normalize_name(name):
    """Return *name* transliterated with ``unidecode`` and converted to lower case."""
    return unidecode(name).lower()

#Reads in all authors' names and associated dataset metadata
df = pd.read_csv("DSKG_BETA_DISAMBIGUATION.csv")

number_id = 10000
author_id_names = []
author_id_coauthors = []

final_list = []


def _blank_if_nan(value):
    """Return ``str(value)``, or "" when the value stringifies to "nan"."""
    text = str(value)
    return "" if text == "nan" else text


#Columns read from every CSV row, in the order they are unpacked below
_columns = ['title', 'issued', 'dataset', 'publisherName', 'contributorName',
            'theme', 'creatorPersonName', 'LDA_Topic_Distribution']

for (authors_title, authors_issued, authors_dataset, authors_publisher,
     authors_contributor, authors_dataset_topic, authors_string,
     authors_dataset_lda_topic_distribution) in df[_columns].itertuples(index=False, name=None):
    #Only the part of the issue date before the first "-" is kept
    authors_years = _blank_if_nan(str(authors_issued).split("-")[0])
    authors_dataset_topic = _blank_if_nan(authors_dataset_topic)
    authors_publisher = _blank_if_nan(authors_publisher)
    authors_contributor = _blank_if_nan(authors_contributor)

    #Title words longer than four characters are kept; str() keeps a missing
    #title (NaN) from crashing the split and simply yields no words
    authors_title_words_usefull = ", ".join(
        word for word in str(authors_title).split(" ") if len(word) > 4)

    #Drop a single leading blank from each name, as the original cleanup did
    authors = [name[1:] if name.startswith(" ") else name
               for name in str(authors_string).split(", ")]
    authors_normalized = [normalize_name(name) for name in authors]

    for author, author_name_normalized in zip(authors, authors_normalized):
        #A missing creator column arrives as the single pseudo-author "nan"
        if author == "nan":
            continue

        #Co-authors are all other non-empty names of the same row. Working on
        #the name list (instead of deleting the author as a substring of the
        #raw string) keeps names that merely contain the author's name intact
        #and needs no length-based trimming of separators afterwards.
        coauthors_normalized = ", ".join(
            other_normalized
            for other, other_normalized in zip(authors, authors_normalized)
            if other and other != author)

        #Field order is the contract with the comparison functions:
        #0 normalized name, 1 id, 2 name, 3 topic, 4 dataset, 5 co-authors,
        #6 publisher, 7 contributor, 8 year, 9 title words, 10 LDA distribution
        final_list.append('\t'.join([
            author_name_normalized,
            str(number_id),
            author,
            authors_dataset_topic,
            str(authors_dataset),
            coauthors_normalized,
            authors_publisher,
            authors_contributor,
            authors_years,
            authors_title_words_usefull,
            str(authors_dataset_lda_topic_distribution),
        ]) + '\n')
        number_id += 1

#Sorting puts records with equal or similar normalized names next to each other
final_list.sort()

#Saves required criteria for author disambiguation in text file
with open("/Author_Disambiguation.txt", "w") as outp:
    outp.writelines(final_list)

In [None]:
#Developed Author Disambiguation
import pandas as pd
from pyjarowinkler import distance
import itertools
import math
import re
from scipy import spatial

def compare_first_names(author1, author2):
    """Score how well the given-name parts of two author records agree.

    The name is the first tab-separated field of each record. Everything
    before its last space is compared; a name without a space is compared
    as a whole.

    Returns:
        0 if the parts differ, 2 if they are equal and the part of
        ``author1`` has at least five characters, otherwise 1.
    """
    given1, given2 = (
        record.split("\t")[0].strip().rsplit(" ", 1)[0]
        for record in (author1, author2)
    )
    if given1 != given2:
        return 0
    return 2 if len(given1) >= 5 else 1
        
def compare_initials(author1, author2):
    """Score the agreement of the initials of two author records.

    Initials are the first characters of the space-separated parts of the
    name, which is the first tab-separated field of each record.

    Returns:
        0 if the initials differ. If they are equal: 1 for a single
        initial, 2 for two initials, 3 for any other number of initials.
    """
    def _initials(record):
        name_parts = record.split("\t")[0].strip().split(" ")
        return "".join(part[:1] for part in name_parts)

    first, second = _initials(author1), _initials(author2)
    if first != second:
        return 0
    return {1: 1, 2: 2}.get(len(first), 3)

def compare_titles(author1, author2):
    """Count the title keywords shared by two author records.

    The keywords are read from tab-separated field 9, split on ", ".
    Empty entries are ignored: ``"".split(", ")`` yields ``[""]``, so
    without the filter two records with no keywords at all would be
    reported as sharing one.

    Returns:
        The number of distinct keywords present in both records
        (0 if either record has none).
    """
    words1 = {word for word in author1.split("\t")[9].strip().split(", ") if word}
    words2 = {word for word in author2.split("\t")[9].strip().split(", ") if word}
    return len(words1 & words2)
    
def compare_years(author1, author2):
    """Tell whether two records were issued less than ten years apart.

    The year is read from tab-separated field 8 of each record.

    Returns:
        False if either year is empty, otherwise whether the absolute
        difference of the two years is below 10.

    Raises:
        ValueError: if a non-empty year is not an integer literal.
    """
    year1 = author1.split("\t")[8].strip()
    year2 = author2.split("\t")[8].strip()
    if not year1 or not year2:
        return False
    return abs(int(year1) - int(year2)) < 10
      
def compare_coauthors(author1, author2):
    """Count the co-authors shared by two author records.

    The co-author names are read from tab-separated field 5, split on ", ".
    Empty entries are ignored: ``"".split(", ")`` yields ``[""]``, so
    without the filter two records that both have no co-authors would be
    reported as sharing one.

    Returns:
        The number of distinct co-author names present in both records
        (0 if either record has none).
    """
    coauthors1 = {name for name in author1.split("\t")[5].strip().split(", ") if name}
    coauthors2 = {name for name in author2.split("\t")[5].strip().split(", ") if name}
    return len(coauthors1 & coauthors2)
    
def compare_topic_datasets(author1, author2):
    """Count the dataset topics shared by two author records.

    The topics are read from tab-separated field 3, split on ", ".
    Empty entries are ignored: ``"".split(", ")`` yields ``[""]``, so
    without the filter two records without any topic would be reported
    as sharing one.

    Returns:
        The number of distinct topics present in both records
        (0 if either record has none).
    """
    topics1 = {topic for topic in author1.split("\t")[3].strip().split(", ") if topic}
    topics2 = {topic for topic in author2.split("\t")[3].strip().split(", ") if topic}
    return len(topics1 & topics2)
    
#Cosine similarity of the LDA topic vectors
def compare_lda_topic_distribution(author1, author2):
    """Return the cosine similarity of the LDA topic distributions of two records.

    The distribution is read from tab-separated field 10. Judging by the
    stripping logic it is expected to look like ``[(0, 0.12), (1, 0.88)]``
    (topic id, probability) -- confirm against the data. Every integer
    directly followed by ", " is treated as a topic id and removed, then
    brackets and parentheses are dropped and the remaining values are parsed
    as floats.

    The topic id is removed as a whole run of digits. Removing only one digit
    would leave the leading digits of ids >= 10 glued to the probability
    (``(10, 0.5)`` would be parsed as ``10.5``).

    Returns:
        ``1 - scipy.spatial.distance.cosine(vector1, vector2)``.

    Raises:
        ValueError: if a remaining value is not a float literal, or if SciPy
            rejects the vectors (for example because their lengths differ).
    """
    def _to_vector(record):
        raw = record.split("\t")[10].strip()
        raw = re.sub(r'[0-9]+, ', '', raw)
        for bracket in "[]()":
            raw = raw.replace(bracket, "")
        return [float(percent) for percent in raw.split(', ')]

    lda_topic_distribution1 = _to_vector(author1)
    lda_topic_distribution2 = _to_vector(author2)
    return 1 - spatial.distance.cosine(lda_topic_distribution1, lda_topic_distribution2)

def compare_publisher(author1, author2):
    """Count the publishers shared by two author records.

    The publisher names are read from tab-separated field 6, split on ", ".
    Empty entries are ignored: ``"".split(", ")`` yields ``[""]``, so
    without the filter two records without a publisher would be reported
    as sharing one.

    Returns:
        The number of distinct publisher entries present in both records
        (0 if either record has none).
    """
    publisher1 = {name for name in author1.split("\t")[6].strip().split(", ") if name}
    publisher2 = {name for name in author2.split("\t")[6].strip().split(", ") if name}
    return len(publisher1 & publisher2)
    
def compare_contributer(author1, author2):
    """Count the contributors shared by two author records.

    The contributor names are read from tab-separated field 7, split on ", ".

    NOTE(review): a count is only produced when BOTH records list at least
    three distinct entries; otherwise the result is 0 -- confirm that this
    threshold is intended.

    Returns:
        The number of distinct entries present in both records, or 0 if
        either record has fewer than three distinct entries.
    """
    first = set(author1.split("\t")[7].strip().split(", "))
    second = set(author2.split("\t")[7].strip().split(", "))
    if min(len(first), len(second)) < 3:
        return 0
    return len(first & second)
    
def compare_authors(author1, author2):
    """Return the similarity score of two tab-separated author records.

    Every criterion is evaluated exactly once (in the order listed) and
    contributes to the score as follows:

    * initials:      equal pair of initials +3, equal with another number
                     of initials (not one) +7, different -5, a single
                     equal initial 0
    * first names:   equal and short +2, equal and at least five chars +5
    * co-authors:    one shared +4, two shared +7, more than two +10
    * title words:   one shared +2, two shared +4, more than two +6
    * LDA similarity: >= 0.99 +4, >= 0.95 +3, >= 0.75 +2
    * years less than ten apart +1
    * at least one shared dataset topic +1
    * at least one shared publisher +1
    * at least one shared contributor (as counted by compare_contributer) +3
    """
    score = 0

    initials = compare_initials(author1, author2)
    if initials == 2:
        score += 3
    elif initials == 3:
        score += 7
    elif initials == 0:
        score -= 5

    first_names = compare_first_names(author1, author2)
    if first_names == 1:
        score += 2
    elif first_names == 2:
        score += 5

    shared_coauthors = compare_coauthors(author1, author2)
    if shared_coauthors == 1:
        score += 4
    elif shared_coauthors == 2:
        score += 7
    elif shared_coauthors > 2:
        score += 10

    shared_title_words = compare_titles(author1, author2)
    if shared_title_words == 1:
        score += 2
    elif shared_title_words == 2:
        score += 4
    elif shared_title_words > 2:
        score += 6

    # Parsing the LDA vectors is the most expensive comparison, so its
    # result is reused for all three thresholds.
    lda_similarity = compare_lda_topic_distribution(author1, author2)
    if lda_similarity >= 0.99:
        score += 4
    elif lda_similarity >= 0.95:
        score += 3
    elif lda_similarity >= 0.75:
        score += 2

    if compare_years(author1, author2):
        score += 1

    if compare_topic_datasets(author1, author2) >= 1:
        score += 1

    if compare_publisher(author1, author2) >= 1:
        score += 1

    if compare_contributer(author1, author2) >= 1:
        score += 3

    return score

def get_id(author):
    """Return the second tab-separated field of an author record (its id)."""
    # Only the first two separators matter for picking field 1.
    return author.split("\t", 2)[1]

def add_to_mapping(dict_of_maps, entry1, entry2):
    """Map ``entry1`` to the end of the chain that starts at ``entry2``.

    Starting from ``entry2``, existing mappings are followed until a value
    is reached that is not itself a key of ``dict_of_maps``; ``entry1`` is
    then mapped to that value.

    Returns:
        The same ``dict_of_maps`` object, modified in place.
    """
    target = entry2
    while target in dict_of_maps:
        target = dict_of_maps[target]
    dict_of_maps[entry1] = target
    return dict_of_maps
    
def merge_authors(tuple_of_authors):
    """Return the record that represents a merged pair of author records.

    Only the first record of the pair is used: surrounding newlines are
    stripped and its first eleven tab-separated fields are returned. The
    second record is currently discarded, i.e. none of its fields are
    carried over into the result.

    Args:
        tuple_of_authors: pair ``(kept_record, merged_record)`` of
            tab-separated record strings.

    Returns:
        The first eleven fields of the first record, joined by tabs.
    """
    kept_fields = tuple_of_authors[0].strip("\n").split("\t")
    return "\t".join(kept_fields[0:11])

def disambiguate(list_of_authors, result, positive, negative):
    """Repeatedly merge author records that score as the same person.

    Every pair of records in ``list_of_authors`` is scored with
    ``compare_authors``. A score of at least 11 counts as a positive: the id
    of the second record is mapped to the first one and the second record is
    removed. As long as a pass produced at least one merge, the function
    calls itself again on the remaining records.

    Args:
        list_of_authors: tab-separated author records; the id is taken from
            each record with ``get_id``.
        result: id -> id mapping accumulated so far; it is copied, so the
            dict passed in is not modified.
        positive: running count of pairs scored >= 11.
        negative: running count of pairs scored < 11.

    Returns:
        Tuple ``(author_dictionary, result, positive, negative)`` where
        ``author_dictionary`` maps the ids of the remaining records to their
        record strings.
    """
    # id -> record without surrounding newlines; records are deleted from
    # this dict as they are merged away.
    author_dictionary = {get_id(author): author.strip("\n") for author in list_of_authors}
    author_list = [get_id(author) for author in list_of_authors]
    # Merges performed in this pass only; decides whether another pass runs.
    mapping = {}
    result = result.copy()
    # All unordered pairs of ids, fixed before any record is deleted.
    comparisons = list(itertools.combinations(author_list, 2)) 
    for item in comparisons:
        try:
            # Looking up a record deleted earlier in this pass raises
            # KeyError, which skips the pair without counting it.
            if compare_authors(author_dictionary[item[0]], author_dictionary[item[1]]) >= 11:
                positive += 1
                if item[0] not in mapping:
                    mapping = add_to_mapping(mapping, item[1], item[0])
                    result = add_to_mapping(result, item[1], item[0])
                    author_dictionary[item[0]] = merge_authors((author_dictionary[item[0]], author_dictionary[item[1]]))
                    del author_dictionary[item[1]] 
                else:
                    # item[0] was itself merged into another id in this pass:
                    # merge into the record it was mapped to.
                    # NOTE(review): item[0] only enters `mapping` when its
                    # record is deleted, so the lookup above appears to raise
                    # KeyError before this branch can run -- confirm.
                    author_dictionary[mapping[item[0]]] = merge_authors((author_dictionary[mapping[item[0]]], author_dictionary[item[1]]))
                    mapping = add_to_mapping(mapping, item[1], item[0])
                    result = add_to_mapping(result, item[1], item[0])
                    del author_dictionary[item[1]]
            else:
                negative += 1
        except KeyError:
            pass 
    # At least one merge happened: compare the remaining records again.
    if not len(mapping) == 0:
        return disambiguate([author_dictionary[author] for author in author_dictionary], result, positive, negative)
    else:
        return author_dictionary, result, positive, negative

In [None]:
#Execute the author disambiguation
def _flush_group(group, positive, negative, positives_out, disambiguated_out):
    """Disambiguate one group of similarly named records and write the outcome.

    The remaining records go to ``disambiguated_out`` (one per line) and every
    id -> id entry of the merge mapping goes to ``positives_out`` as
    "id<TAB>id". Returns the updated ``(positive, negative)`` counters.
    """
    authors, result, positive, negative = disambiguate(group, {}, positive, negative)
    for record in authors.values():
        disambiguated_out.write(record + "\n")
    for merged_id, kept_id in result.items():
        positives_out.write(merged_id + "\t" + kept_id + "\n")
    return positive, negative


with open("/Author_Disambiguation.txt", "r") as inp, \
        open("/results_summary.txt", "w") as outp, \
        open("/all_positives.txt", "w") as outp2, \
        open("/disambiguated_file.txt", "w") as outp3:
    positive = 0
    negative = 0
    previous_name = ""
    current_authors = []

    for line in inp:
        name = line.split("\t")[0].strip()
        # A record joins the current group when its name is close
        # (Jaro-Winkler > 0.9) to the name on the PREVIOUS line, so a group
        # is a chain of neighbouring similar names.
        if previous_name == "" or distance.get_jaro_distance(
                name.lower(), previous_name.lower(), winkler=True, scaling=0.1) > 0.9:
            current_authors.append(line)
        else:
            positive, negative = _flush_group(current_authors, positive, negative, outp2, outp3)
            current_authors = [line]
        previous_name = name

    # The last group is still pending when the input is exhausted.
    positive, negative = _flush_group(current_authors, positive, negative, outp2, outp3)

    total_comparisons = positive + negative
    # No pair was compared (empty input or only single-record groups):
    # report a share of 0.0 instead of dividing by zero.
    positive_share = positive / total_comparisons if total_comparisons else 0.0
    negative_share = negative / total_comparisons if total_comparisons else 0.0
    outp.write("Total comparisons: " + str(total_comparisons) + "\n")
    outp.write("Total positives: " + str(positive) + ": " + str(positive_share) + "\n")
    outp.write("Total negatives: " + str(negative) + ": " + str(negative_share))