In [0]:
import pickle
import boto3
import re
import json
import random
import unicodedata
# import unidecode
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta
# from nameparser import HumanName

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, StringType, FloatType, ArrayType, DoubleType, StructType, StructField



In [0]:
# Drop every table/DataFrame cached in this Spark session before the run starts.
spark.catalog.clearCache()

In [0]:
# S3 location placeholders (redacted in this copy of the notebook).
# `iteration_save_path` is spliced directly in front of "final_model_data/..."
# in the f-strings of later cells, so it must end with a "/".
# NOTE(review): `base_save_path` is not referenced in the cells visible here.
base_save_path = "<S3path>"
iteration_save_path = "<S3path>"

#### Load affiliations data

In [0]:
@udf(returnType=ArrayType(StringType()))
def group_non_latin_characters(text):
    """List the distinct non-LATIN character groups that appear in a name.

    Periods and spaces are removed first. For each remaining character the
    first word of its Unicode name (e.g. 'CJK', 'ARABIC', 'DIGIT') is taken as
    its group; 'LATIN' is ignored. Characters that have no Unicode name are
    reported as 'UNK'.

    Args:
        text: the author name, or None.

    Returns:
        The groups in order of first appearance, without duplicates. An empty
        list for None or for a name made only of Latin letters, periods and
        spaces.
    """
    groups = []
    if text is None:
        # Null names carry no characters; report no groups instead of failing the task.
        return groups

    for char in text.replace(".", "").replace(" ", ""):
        try:
            script = unicodedata.name(char).split(" ")[0]
        except ValueError:
            # unicodedata.name raises ValueError when the code point has no name.
            script = "UNK"
        if script != 'LATIN' and script not in groups:
            groups.append(script)
    return groups

In [0]:
@udf(returnType=IntegerType())
def name_to_keep_ind(groups):
    """Flag whether a name should be kept, based on its character groups.

    Args:
        groups: character-group labels found in the name.

    Returns:
        0 if any label is one of the skipped scripts, otherwise 1.
    """
    groups_to_skip = (
        'HIRAGANA', 'CJK', 'KATAKANA', 'ARABIC', 'HANGUL',
        'THAI', 'DEVANAGARI', 'BENGALI', 'THAANA', 'GUJARATI',
    )

    for group in groups:
        if group in groups_to_skip:
            return 0
    return 1

In [0]:
def length_greater_than_6(x):
    """Predicate for `F.filter`: keep array elements longer than 6 characters.

    Args:
        x: a Column standing for one element of the array being filtered.

    Returns:
        A boolean Column that is true when the element's character length
        exceeds 6.
    """
    # Compare the element's length, not the element itself, to the threshold.
    return F.length(x) > 6

def concept_L0_removed(x):
    """Predicate that is true for concept ids NOT on the exclusion list.

    Args:
        x: an object exposing `.isin(list)` whose result supports `~`
           (used as the element Column inside `F.filter`).

    Returns:
        The negated membership test against the excluded concept ids.
    """
    excluded_concept_ids = [
        '17744445', '138885662', '162324750', '144133560', '15744967',
        '33923547', '71924100', '86803240', '41008148', '127313418',
        '185592680', '142362112', '144024400', '127413603', '205649164',
        '95457728', '192562407', '121332964', '39432304',
    ]
    is_excluded = x.isin(excluded_concept_ids)
    return ~is_excluded

In [0]:
# Load the per work-author sample data, suffix the columns with "_1", and derive
# the helper columns used for feature building:
#   non_latin_groups    - non-Latin character groups found in the author name
#   paper_id_1          - the part of work_id_1 before the first "_"
#   concepts_1          - de-duplicated concepts
#   concepts_shorter_1  - concepts with the excluded ids removed
#   coauthors_shorter_1 - coauthors that pass length_greater_than_6
# The chain is wrapped in parentheses so every step is part of one expression.
aff_data = (
    spark.read.parquet(f"{iteration_save_path}final_model_data/all_sample_data_for_all_work_authors")
    .select(
        F.col('work_id').alias('work_id_1'),
        F.col('orcid').alias('orcid_1'),
        F.col('coauthors').alias('coauthors_1'),
        F.col('citations').alias('citations_1'),
        F.col('institutions').alias('institutions_1'),
        F.col('original_author').alias('author_1'),
        F.col('concepts').alias('concepts_1'),
    )
    .withColumn('non_latin_groups', group_non_latin_characters(F.col('author_1')))
    .withColumn('paper_id_1', F.split(F.col('work_id_1'), "_").getItem(0))
    .withColumn('concepts_1', F.array_distinct(F.col('concepts_1')))
    .withColumn('concepts_shorter_1', F.filter(F.col('concepts_1'), concept_L0_removed))
    .withColumn('coauthors_shorter_1', F.filter(F.col('coauthors_1'), length_greater_than_6))
)

# Materialise the cache and show the row count.
aff_data.cache().count()

Out[8]: 633827872

In [0]:
# Persist the model-feature columns with the "_1" suffix stripped from their names.
(
    aff_data
    .select([
        F.col(f'{name}_1').alias(name)
        for name in ['work_id', 'orcid', 'institutions', 'citations',
                     'author', 'concepts_shorter', 'paper_id', 'coauthors_shorter']
    ])
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}final_model_data/all_sample_data_for_all_work_authors_model_features")
)

In [0]:
# Reload the saved features and keep only rows whose author name is present,
# non-empty, and free of the skipped scripts (name_to_keep_ind == 1).
aff_data = (
    spark.read.parquet(f"{iteration_save_path}final_model_data/all_sample_data_for_all_work_authors_model_features")
    .filter(F.col('author').isNotNull() & (F.col('author') != ''))
    .withColumn('non_latin_groups', group_non_latin_characters(F.col('author')))
    .withColumn('name_to_keep_ind', name_to_keep_ind('non_latin_groups'))
    .filter(F.col('name_to_keep_ind') == 1)
)

# Materialise the cache and show the row count.
aff_data.cache().count()

590722794

In [0]:
# Save the filtered features as a table bucketed and sorted by work_id.
(
    aff_data
    .select('work_id', 'orcid', 'institutions', 'citations',
            'author', 'concepts_shorter', 'paper_id', 'coauthors_shorter')
    .write
    .format('parquet')
    .bucketBy(4192, 'work_id')
    .sortBy('work_id')
    .mode('overwrite')
    .saveAsTable("aff_join_latin_xbuckets")
)

### Read Tables

In [0]:
# Left-hand side of the pair join: every column of the bucketed table, suffixed "_1".
aff_join_1 = spark.read.table("aff_join_latin_xbuckets").select(
    *[
        F.col(name).alias(f"{name}_1")
        for name in ('work_id', 'orcid', 'institutions', 'citations',
                     'author', 'concepts_shorter', 'paper_id', 'coauthors_shorter')
    ]
)

# Materialise the cache and show the row count.
aff_join_1.cache().count()

590722794

In [0]:
# Right-hand side of the pair join: every column of the bucketed table, suffixed "_2".
aff_join_2 = spark.read.table("aff_join_latin_xbuckets").select(
    *[
        F.col(name).alias(f"{name}_2")
        for name in ('work_id', 'orcid', 'institutions', 'citations',
                     'author', 'concepts_shorter', 'paper_id', 'coauthors_shorter')
    ]
)

# Materialise the cache and show the row count.
aff_join_2.cache().count()

590722794

#### Join to DF and create features

In [None]:
# Number of partitions of the data (partitioned by names).
# Derived from the `random_part=<n>` directories of the blocked work-id pairs,
# which Spark exposes as a `random_part` column when the parent folder is read.
# NOTE(review): the loops below address partitions as range(num_partitions), so
# this assumes the directories are numbered 0..n-1 with no gaps - replace with a
# literal integer if that does not hold.
num_partitions = (
    spark.read
    .parquet(f"{iteration_save_path}final_model_data/block_creation/all_names_blocked_and_matched_work_ids/")
    .select('random_part')
    .distinct()
    .count()
)

In [None]:
# For every name partition: attach the features of both works to each blocked
# pair, then split the pairs by ORCID evidence.
for part_num in range(num_partitions):
    start_time = time.time()
    # Timestamp is shifted back four hours (presumably cluster time -> local time; confirm).
    print(part_num, (datetime.now() - timedelta(hours=4)).strftime("%m/%d/%y %H:%M"))

    test_df = (
        spark.read
        .parquet(f"{iteration_save_path}final_model_data/block_creation/all_names_blocked_and_matched_work_ids/random_part={part_num}/")
        .join(aff_join_1, how='inner', on='work_id_1')
        .join(aff_join_2, how='inner', on='work_id_2')
    )

    # Both works carry the same, non-empty ORCID.
    same_orcid = (F.col('orcid_1') == F.col('orcid_2')) & (F.col('orcid_1')!='')
    # Both works carry an ORCID and the two differ.
    conflicting_orcids = (F.col('orcid_1')!=F.col('orcid_2')) & (F.col('orcid_1')!='') & (F.col('orcid_2')!='')

    # Pairs matched by ORCID are written out separately.
    (
        test_df
        .filter(same_orcid)
        .write.mode('overwrite')
        .parquet(f"{iteration_save_path}final_model_data/orcid_init_pairs/part={part_num}/")
    )
    print("-------ORCID done")

    # Everything that is neither an ORCID match nor an ORCID conflict goes to the model.
    (
        test_df
        .filter(~same_orcid)
        .filter(~conflicting_orcids)
        .write.mode('overwrite')
        .parquet(f"{iteration_save_path}final_model_data/data_to_score/part={part_num}/")
    )

    print(f"-------------- total time: {round((time.time()-start_time)/60/60, 3)} hours")

### Loading the model and scoring data

In [0]:
# Load the pickled disambiguation model from local disk.
# NOTE: pickle.load can execute arbitrary code - only point this at a trusted file.
with open("<local-path-to-model>/Disambiguator.pkl", "rb") as model_file:
    disambiguator_model = pickle.load(model_file)

# Ship one copy of the model to the executors for use inside the scoring UDF.
broadcast_disambiguator_model = spark.sparkContext.broadcast(disambiguator_model)

In [0]:
def _random_group_id(block_id):
    """Return a random integer in [0, 1000000].

    Args:
        block_id: ignored; it is only there so the UDF can be applied to a column.
    """
    return random.randint(0, 1000000)


# Spark treats UDFs as deterministic by default and may then drop duplicate
# calls or invoke the function more often than written. The result is random,
# so the UDF is explicitly marked non-deterministic.
get_random_int_udf = udf(_random_group_id, IntegerType()).asNondeterministic()

In [0]:
@udf(returnType=ArrayType(ArrayType(StringType())))
def score_data(full_arr):
    """Score a batch of candidate pairs with the broadcast model.

    Args:
        full_arr: list of rows; column 0 is the block, column 1 the row label,
            and the remaining columns are feature values convertible to float.

    Returns:
        A list of [block, row_label, probability-as-string] rows, limited to
        the rows whose predicted probability (second column of
        `predict_proba`) is above 0.2.
    """
    rows = np.array(full_arr)
    blocks = rows[:, 0]
    labels = rows[:, 1]
    features = rows[:, 2:].astype('float')

    model = broadcast_disambiguator_model.value
    probabilities = model.predict_proba(features)[:, 1]

    # Keep only the pairs that clear the score threshold.
    keep = probabilities > 0.2
    scored = np.column_stack([blocks[keep], labels[keep], probabilities[keep].astype('str')])
    return scored.tolist()

In [None]:
# Score every partition written to data_to_score/ and save the result to
# data_scored/part=<n>/. One pass per partition; each pass prints its index
# and a timestamp shifted back four hours.
for i in range(num_partitions):
    start_time = time.time()
    print(i, (datetime.now() - timedelta(hours=4)).strftime("%m/%d/%y %H:%M"))

    part_num = i

    # Feature pipeline (the steps of the chain below, in order):
    #   row_label             - the two work ids joined with "|", the greater one
    #                           (per `>` on the work_id columns) first
    #   citation_work_match   - 1 if either work's paper id is in the other's citations
    #   *_inter / *_union     - sizes of array intersections / unions of the two works
    #   inst_per              - 1 if the works share at least one institution
    #   coauthors_shorter_per, concepts_shorter_per, citation_per
    #                         - intersection/union ratio rounded to 4 decimals,
    #                           0.0 when the union is empty
    #   exact_match_len / exact_match_spaces
    #                         - length / space-split token count of author_1,
    #                           zeroed unless both author strings are equal
    #   random_int            - random key used only to batch rows for scoring
    #   concat_cols           - [block, row_label, <7 features as strings>];
    #                           score_data reads column 0 as block, column 1 as
    #                           label and the rest as floats, so this order matters
    # Rows are grouped by random_int, each group is scored in one score_data call,
    # and only the scored_data column is written out.
    spark.read \
        .parquet(f"{iteration_save_path}final_model_data/data_to_score/part={part_num}/") \
        .withColumn('row_label', F.when(F.col('work_id_1') > F.col('work_id_2'), 
                                        F.concat_ws("|", F.col('work_id_1'), F.col('work_id_2'))) \
                                            .otherwise(F.concat_ws("|", F.col('work_id_2'), F.col('work_id_1')))) \
        .withColumn('work_1_in_citations_2', F.array_contains(F.col('citations_2'), F.col('paper_id_1')).cast(IntegerType())) \
        .withColumn('work_2_in_citations_1', F.array_contains(F.col('citations_1'), F.col('paper_id_2')).cast(IntegerType())) \
        .withColumn('citation_work_match', F.when((F.col('work_2_in_citations_1')==1) | (F.col('work_1_in_citations_2')==1), 1).otherwise(0)) \
        .withColumn('insts_inter', F.size(F.array_intersect(F.col('institutions_1'), F.col('institutions_2')))) \
        .withColumn('coauths_inter', F.size(F.array_intersect(F.col('coauthors_shorter_1'), F.col('coauthors_shorter_2')))) \
        .withColumn('concps_inter', F.size(F.array_intersect(F.col('concepts_shorter_1'), F.col('concepts_shorter_2')))) \
        .withColumn('cites_inter', F.size(F.array_intersect(F.col('citations_1'), F.col('citations_2')))) \
        .withColumn('coauths_union', F.size(F.array_union(F.col('coauthors_shorter_1'), F.col('coauthors_shorter_2')))) \
        .withColumn('concps_union', F.size(F.array_union(F.col('concepts_shorter_1'), F.col('concepts_shorter_2')))) \
        .withColumn('cites_union', F.size(F.array_union(F.col('citations_1'), F.col('citations_2')))) \
        .withColumn('inst_per', F.when(F.col('insts_inter')>0, 1).otherwise(0)) \
        .withColumn('coauthors_shorter_per', F.round(F.when(F.col('coauths_union')>0, 
                                                            F.col('coauths_inter')/F.col('coauths_union')).otherwise(0.0), 4)) \
        .withColumn('concepts_shorter_per', F.round(F.when(F.col('concps_union')>0, F.col('concps_inter')/F.col('concps_union')).otherwise(0.0), 4)) \
        .withColumn('citation_per', F.round(F.when(F.col('cites_union')>0, F.col('cites_inter')/F.col('cites_union')).otherwise(0.0), 4)) \
        .withColumn('exact_match', F.when(F.col('author_1')==F.col('author_2'), 1).otherwise(0)) \
        .withColumn('name_1_len', F.length(F.col('author_1'))) \
        .withColumn('name_1_spaces', F.size(F.split(F.col('author_1'), " "))) \
        .select('block','row_label', 'inst_per','concepts_shorter_per', 'coauthors_shorter_per', 
            (F.col('exact_match')*F.col('name_1_len')).alias('exact_match_len'),
            (F.col('exact_match')*F.col('name_1_spaces')).alias('exact_match_spaces'), 'citation_per', 'citation_work_match') \
        .withColumn('random_int', get_random_int_udf(F.col('block'))) \
        .withColumn('concat_cols', F.array(F.col('block'), F.col('row_label').cast(StringType()), F.col('inst_per').cast(StringType()), 
                                            F.col('concepts_shorter_per').cast(StringType()), F.col('coauthors_shorter_per').cast(StringType()), 
                                            F.col('exact_match_len').cast(StringType()), F.col('exact_match_spaces').cast(StringType()), 
                                            F.col('citation_per').cast(StringType()), F.col('citation_work_match').cast(StringType()))) \
        .groupby('random_int') \
        .agg(F.collect_list(F.col('concat_cols')).alias('data_to_score')) \
        .withColumn('scored_data', score_data(F.col('data_to_score'))) \
        .select('scored_data') \
        .write.mode('overwrite') \
        .parquet(f"{iteration_save_path}final_model_data/data_scored/part={part_num}/")