In [None]:
import pickle
import boto3
import re
import json
import random
import unicodedata
import unidecode
import pandas as pd
import numpy as np
from nameparser import HumanName

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, StringType, FloatType, ArrayType, DoubleType, StructType, StructField

In [None]:
# Remove everything currently held in Spark's in-memory cache before the cells below
# start caching their own DataFrames.
spark.catalog.clearCache()

In [None]:
# Root location of the input tables (the `static_affiliations` parquet is read from here).
# Sub-paths are appended by plain string concatenation, so the value must end with "/".
base_save_path = "<S3path>"
# Root location for the outputs written by the cells shown here (under `final_model_data/`).
# Same trailing-"/" requirement as above.
iteration_save_path = "<S3path>"

#### Getting all author data

In [None]:
# Load work id, author sequence number and the whitespace-trimmed raw author string,
# keeping only rows whose author string is non-empty after trimming.
author_names = (
    spark.read.parquet(f"{base_save_path}static_affiliations")
    .select(
        F.col('paper_id').alias('work_id'),
        F.col('author_sequence_number').alias('seq_no'),
        F.trim(F.col('original_author')).alias('original_author'),
    )
    .filter(F.col('original_author') != "")
)
# cache() is lazy; count() forces materialization and displays the row count.
author_names.cache().count()

#### Transforming author names

In [None]:
def transform_author_name(author):
    """Normalize a raw author string before name disambiguation begins.

    Steps, in order:
      1. Strip a leading "None " or "Array " artifact (the prefix only, at most one).
      2. Apply Unicode NFKC normalization and collapse runs of whitespace.
      3. Parse with ``HumanName`` and rebuild the name as "first middle last".
         A parsed title is kept in front unless it is empty or exactly 'Dr.'.
      4. If the parser reported no title and the rebuilt name has 2-4 tokens, the
         first token is longer than 3 characters, and the remaining tokens are
         either all single characters ("Smith J K") or all two characters ending
         in "." ("Smith J. K."), move the first token to the end ("J K Smith").

    Args:
        author: Raw author string, or None.

    Returns:
        The normalized name (possibly an empty string), or None when ``author`` is None.
    """
    if author is None:
        return None

    # Remove only the leading artifact. str.replace() would also delete any later
    # occurrence of the same text inside the name.
    for bad_prefix in ("None ", "Array "):
        if author.startswith(bad_prefix):
            author = author[len(bad_prefix):]
            break

    # Using python libraries to perform various normalizations
    author = unicodedata.normalize('NFKC', author)
    author_name = HumanName(" ".join(author.split()))

    # NOTE(review): only a title equal to the exact string 'Dr.' is dropped; confirm how
    # HumanName reports other spellings (e.g. 'Dr') if those should be dropped as well.
    if author_name.title in ('', 'Dr.'):
        rebuilt = f"{author_name.first} {author_name.middle} {author_name.last}"
    else:
        rebuilt = f"{author_name.title} {author_name.first} {author_name.middle} {author_name.last}"

    # Missing name parts leave doubled/leading spaces in the f-string; squeeze them out.
    new_author_name = " ".join(rebuilt.split())

    # Names for which the parser reported a title (including a dropped 'Dr.') are never reordered.
    if author_name.title != '':
        return new_author_name

    # The following tries to make sure names are in the correct order
    # (or at least an order that works for disambiguation).
    tokens = new_author_name.split(" ")
    if not 2 <= len(tokens) <= 4 or len(tokens[0]) <= 3:
        return new_author_name

    trailing = tokens[1:]
    bare_initials = all(len(tok) == 1 for tok in trailing)
    dotted_initials = all(len(tok) == 2 and tok[1] == "." for tok in trailing)
    if bare_initials or dotted_initials:
        # "Surname I J" -> "I J Surname"
        return " ".join(trailing + [tokens[0]])
    return new_author_name
  
def remove_current_author(author, coauthors, max_coauthors=250):
    """Return the names in ``coauthors`` that differ from ``author``, capped in length.

    Args:
        author: Name to exclude; every entry equal to it is dropped.
        coauthors: List of names, or None.
        max_coauthors: Maximum number of names returned. Defaults to 250.

    Returns:
        A new list in the input order; ``[]`` when ``coauthors`` is None or empty.
    """
    if not coauthors:
        # A null array column would otherwise raise TypeError inside the UDF.
        return []
    return [name for name in coauthors if name != author][:max_coauthors]

# Wrap the plain-Python helpers as Spark UDFs so they can be applied to DataFrame columns.
# remove_current_author_udf returns array<string>; transform_author_name_udf returns string.
remove_current_author_udf = F.udf(remove_current_author,  ArrayType(StringType()))
transform_author_name_udf = F.udf(transform_author_name, StringType())

In [None]:
# Run the name normalization once per distinct raw author string and persist the
# raw -> transformed mapping.
(
    author_names
    .select('original_author')
    .dropDuplicates()
    .withColumn('transformed_name', transform_author_name_udf(F.col('original_author')))
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}final_model_data/author_name_transformations")
)

#### Compiling Coauthors

In [None]:
# Read back the persisted raw -> transformed name mapping and cache it.
author_name_transforms = spark.read.parquet(
    f"{iteration_save_path}final_model_data/author_name_transformations"
)
# cache() is lazy; count() forces materialization and displays the row count.
author_name_transforms.cache().count()

In [None]:
# Attach the transformed name to every author row, keep one row per
# (work_id, transformed_name), then collect each work's names into a list.
(
    author_names
    .join(author_name_transforms, how='left', on='original_author')
    .dropDuplicates(subset=['work_id', 'transformed_name'])
    .groupby('work_id')
    .agg(F.collect_list(F.col('transformed_name')).alias('all_authors'))
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}final_model_data/all_authors_for_each_work")
)

#### Creating the Author Name Index

In [None]:
# Build the name index: one row per distinct, non-empty transformed name plus a generated id.
# NOTE: this rebinds `author_name_transforms`, which previously held the raw -> transformed mapping.
author_name_transforms = (
    spark.read.parquet(f"{iteration_save_path}final_model_data/author_name_transformations")
    .select(F.col('transformed_name').alias('author_name'))
    .dropDuplicates()
    .filter(F.col('author_name') != '')
    # ids from monotonically_increasing_id() are unique but not consecutive
    .withColumn("id", F.monotonically_increasing_id())
)
# cache() is lazy; count() forces materialization and displays the row count.
author_name_transforms.cache().count()

In [None]:
# Persist the indexed author names (columns: author_name, id).
(
    author_name_transforms
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}final_model_data/all_authors_for_each_work_indexed")
)

#### Looking into different author name characters

In [None]:
def check_latin_character(text):
    """Return 1 if ``str(text)`` can be encoded as Latin-1, otherwise 0.

    Args:
        text: Value to test; it is converted with ``str()`` first, so any object is accepted.

    Returns:
        1 when every character of ``str(text)`` is encodable as 'latin-1', else 0.
    """
    try:
        str(text).encode('latin-1')
    except UnicodeEncodeError:
        # Only an encoding failure means "not Latin-1"; anything else should surface.
        return 0
    return 1

In [None]:
@F.udf(returnType=FloatType())
def check_latin_name(text):
    """Spark UDF: fraction of the characters in ``text`` that are Latin-1 encodable.

    Spaces and periods are removed before counting.

    Args:
        text: Author name, or None.

    Returns:
        A float in [0, 1]; 0.0 when nothing is left after removing spaces and
        periods; None (Spark null) when ``text`` is None.
    """
    if text is None:
        return None
    stripped = text.replace(" ", "").replace(".", "")
    if not stripped:
        return 0.0
    return sum(check_latin_character(char) for char in stripped) / len(stripped)

In [None]:
# Translation table for transform_name_for_search: ".", "," and "|" become a space;
# digits and the listed symbols are deleted.
_SEARCH_NAME_TABLE = str.maketrans(".,|", "   ", ")(-&$#@%0123456789*^{}+=_~`[]\\<>?/;:'\"")


@F.udf(returnType=StringType())
def transform_name_for_search(name):
    """Spark UDF: reduce a name to a lowercase, ASCII, punctuation-free search key.

    The name is NFKC-normalized, transliterated with ``unidecode``, lowercased,
    passed through the translation table above, and finally has its whitespace
    collapsed to single spaces with no leading/trailing space.

    Args:
        name: Author name, or None.

    Returns:
        The search key (possibly an empty string), or None (Spark null) when ``name`` is None.
    """
    if name is None:
        return None
    ascii_name = unidecode.unidecode(unicodedata.normalize('NFKC', name))
    return " ".join(ascii_name.lower().translate(_SEARCH_NAME_TABLE).split())

In [None]:
@F.udf(returnType=ArrayType(StringType()))
def group_non_latin_characters(text):
    """Spark UDF: list the distinct non-LATIN character groups that occur in ``text``.

    A character's group is the first word of its Unicode character name
    (for example 'CJK' or 'CYRILLIC'). Characters whose group is 'LATIN' are
    ignored; characters with no Unicode name are reported as 'UNK'. Periods and
    spaces are removed before inspection.

    Args:
        text: Author name, or None.

    Returns:
        Groups in order of first appearance, without repeats; None (Spark null)
        when ``text`` is None.
    """
    if text is None:
        return None

    groups = []
    for char in text.replace(".", "").replace(" ", ""):
        # With a default, unicodedata.name() returns it instead of raising ValueError
        # for code points that have no name.
        char_name = unicodedata.name(char, "")
        script = char_name.split(" ")[0] if char_name else "UNK"
        if script != 'LATIN' and script not in groups:
            groups.append(script)
    return groups

In [None]:
@F.udf(returnType=IntegerType())
def name_to_keep_ind(groups):
    """Spark UDF: flag whether a name should be kept, based on its character groups.

    Args:
        groups: List of character-group labels for one name, or None.

    Returns:
        0 if any label is in the skip set below, otherwise 1 (including for an
        empty list); None (Spark null) when ``groups`` is None.
    """
    groups_to_skip = {'HIRAGANA', 'CJK', 'KATAKANA', 'ARABIC', 'HANGUL', 'THAI', 'DEVANAGARI', 'BENGALI',
                      'THAANA', 'GUJARATI'}

    if groups is None:
        return None
    return 0 if any(group in groups_to_skip for group in groups) else 1

In [None]:
# Derive the per-name search features from the indexed author names.
# NOTE: this rebinds `author_names`, which previously held the raw (work, author) rows.
author_names = (
    spark.read.parquet(f"{iteration_save_path}final_model_data/all_authors_for_each_work_indexed")
    .withColumn('latin_characters_per', check_latin_name(F.col('author_name')))
    .withColumn('transformed_search_name', transform_name_for_search(F.col('author_name')))
    .withColumn('name_len', F.length(F.col('transformed_search_name')))
    # drop names whose search key is shorter than two characters
    .filter(F.col('name_len') > 1)
    # tokenize the search key; the last token is taken as the last name below
    .withColumn('last_name', F.split(F.lower(F.col('transformed_search_name')), " "))
    .select(
        'author_name', 'transformed_search_name', 'latin_characters_per',
        F.element_at(F.col('last_name'), -1).alias('last_name'),
    )
    .withColumn('transformed_last_name', transform_name_for_search(F.col('last_name')))
    .withColumn('non_latin_groups', group_non_latin_characters(F.col('author_name')))
)
# cache() is lazy; count() forces materialization and displays the row count.
author_names.cache().count()

In [None]:
# Persist the distinct last names (longer than one character) of the names flagged to keep.
(
    author_names
    .withColumn('name_to_keep_ind', name_to_keep_ind('non_latin_groups'))
    .filter(F.col('name_to_keep_ind') == 1)
    .withColumn('last_name_size', F.length(F.col('last_name')))
    .filter(F.col('last_name_size') > 1)
    .select('last_name')
    .dropDuplicates()
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}final_model_data/block_creation/last_names_to_search")
)

In [None]:
@F.udf(returnType=ArrayType(StringType()))
def find_non_subsets(strings):
    """Spark UDF: keep the strings that are not a substring of any other string in the list.

    Repeated values are collapsed to their first occurrence before comparison.
    Without that, two identical strings would each count as contained in the
    other and both would be discarded.

    Args:
        strings: List of strings, or None.

    Returns:
        The surviving strings in order of first appearance; None (Spark null)
        when ``strings`` is None.
    """
    if strings is None:
        return None

    # dict.fromkeys de-duplicates while preserving insertion order
    unique = list(dict.fromkeys(strings))
    return [
        candidate
        for candidate in unique
        if not any(candidate in other for other in unique if other != candidate)
    ]

In [None]:
@F.udf(returnType=ArrayType(StringType()))
def only_get_first_and_last(all_names):
    """Spark UDF: reduce a list to its first and last elements.

    Args:
        all_names: List of strings, or None.

    Returns:
        ``[first, last]`` when the list has more than two elements; the list
        unchanged when it has two or fewer; None (Spark null) when ``all_names`` is None.
    """
    if all_names is None:
        return None
    if len(all_names) > 2:
        return [all_names[0], all_names[-1]]
    return all_names

In [None]:
# Read back the persisted last names, exposing the column as `block`.
names_to_search = (
    spark.read
    .parquet(f"{iteration_save_path}final_model_data/block_creation/last_names_to_search")
    .select(F.col('last_name').alias('block'))
)

# cache() is lazy; count() forces materialization and displays the row count.
names_to_search.cache().count()

In [None]:
# For the names flagged to keep, persist the search key together with its candidate
# blocks: the tokens of the key, reduced to first and last when there are more than two.
(
    author_names
    .withColumn('name_to_keep_ind', name_to_keep_ind('non_latin_groups'))
    .filter(F.col('name_to_keep_ind') == 1)
    .withColumn('potential_blocks', F.split(F.col('transformed_search_name'), " "))
    .withColumn('potential_blocks', only_get_first_and_last(F.col('potential_blocks')))
    .select('transformed_search_name', 'potential_blocks')
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}final_model_data/block_creation/names_to_be_blocked")
)

In [None]:
# Persist, with all columns, the names flagged as not to keep (name_to_keep_ind == 0).
(
    author_names
    .withColumn('name_to_keep_ind', name_to_keep_ind('non_latin_groups'))
    .filter(F.col('name_to_keep_ind') == 0)
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}final_model_data/block_creation/names_to_not_be_blocked")
)

In [None]:
# Expand every name into (search key, block, letter) rows: one row per distinct character
# left in the search key after removing the block text and all spaces.
# NOTE: this rebinds `author_names` again, replacing the search-feature DataFrame.
author_names = (
    spark.read.parquet(f"{iteration_save_path}final_model_data/block_creation/names_to_be_blocked")
    .withColumn('potential_blocks', find_non_subsets(F.col('potential_blocks')))
    .select('transformed_search_name', F.explode('potential_blocks').alias('block'))
    # `block` is passed to regexp_replace as the pattern, so it is interpreted as a regex
    # and every match in the search key is removed
    .withColumn('block_removed', F.expr("regexp_replace(transformed_search_name, block, '')"))
    .withColumn('new_block_removed', F.expr("regexp_replace(block_removed, ' ', '')"))
    # splitting on '(?!$)' breaks the remaining string into individual characters
    .withColumn('letters_split', F.split(F.col('new_block_removed'), '(?!$)'))
    .select('transformed_search_name', 'block', F.explode('letters_split').alias('letters_in_name'))
    .dropDuplicates()
)
# cache() is lazy; count() forces materialization and displays the row count.
author_names.cache().count()

In [None]:
# Preview the first 20 (search key, block, letter) rows.
author_names.show(20)

In [None]:
# Keep only the rows whose block is one of the persisted last names, then write the result.
(
    author_names
    .join(names_to_search, how='inner', on='block')
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}final_model_data/block_creation/names_blocked_to_name_check")
)