In [0]:
import pickle
import boto3
import re
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [0]:
from pyspark.sql import SparkSession
sc = spark.sparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, StringType, FloatType, ArrayType, DoubleType, StructType, StructField
sqlContext = SQLContext(sc)



In [0]:
institutions_v2_data_path = "s3://institutional-affiliation-classification/V3/"
base_save_path = "s3://openalex-data-copy/snapshot_2023_02_15/"
iteration_save_path = "s3://author-disambiguation/V3/"

### Getting all data

In [0]:
def transform_coauthors(coauthors):
    if isinstance(coauthors, list):
        final_coauthors = []
        for coauthor in coauthors:
            final_coauthors.append(check_author_name(coauthor))
    else:
        final_coauthors = []
    return final_coauthors
  
def remove_current_author(author, coauthors):
    return [x.strip() for x in coauthors if x!=author.strip()][:250]

transform_coauthors_udf = F.udf(transform_coauthors,  ArrayType(StringType()))
remove_current_author_udf = F.udf(remove_current_author,  ArrayType(StringType()))

In [0]:
concepts = spark.read.parquet(f"{base_save_path}static_concepts_v3").dropDuplicates() \
.select(F.col('paper_id').alias('work_id'),'field_of_study',F.col('score').cast(FloatType()).alias('score')) \
.filter(F.col('score') >=0.32) \
.groupby('work_id') \
.agg(F.collect_list(F.col('field_of_study')).alias('concepts'))

concepts.cache().count()

Out[5]: 224140989

In [0]:
string_to_institution = spark.read.parquet(f"{institutions_v2_data_path}processed_bulk_data_for_V2") \
.select(F.col('affiliation_string').alias('original_affiliation'), F.col('affiliation_id').alias('institutions'))

string_to_institution.cache().count()

Out[6]: 71070238

In [0]:
string_to_institution.filter(F.col('original_affiliation')=='MESA+ Institute').show()

+--------------------+------------+
|original_affiliation|institutions|
+--------------------+------------+
|     MESA+ Institute|[4210162103]|
+--------------------+------------+



In [0]:
affiliations = spark.read.parquet(f"{base_save_path}static_affiliations")
affiliations.cache().count()

Out[7]: 634179075

In [0]:
# affiliations.dropDuplicates() \
# .groupby('paper_id') \
# .agg(F.collect_set(F.col('original_author')).alias('all_authors')) \
# .write.mode('overwrite').parquet(f"{iteration_save_path}all_authors_for_each_work")

In [0]:
coauthors = spark.read.parquet(f"{iteration_save_path}all_authors_for_each_work") \
.select(F.col('paper_id').alias('work_id'), 'all_authors')
coauthors.cache().count()

Out[15]: 227593673

In [0]:
works = spark.read.parquet(f"{base_save_path}static_works").select('paper_id',
                                                                   F.trim(F.col('doi')).alias('doi')) \
.dropDuplicates(subset=['paper_id'])
works.cache().count()

Out[4]: 248663413

#### Creating Base ORCID clusters

In [0]:
orcid_names = spark.read.parquet(f"{iteration_save_path}orcid_names_data_dump.parquet").dropDuplicates(subset=['orcid'])

In [0]:
orcid_names.cache().count()

Out[16]: 14845875

In [0]:
orcid_dois = spark.read.option('header', True).csv(f"{iteration_save_path}orcid_ids_to_work_dois") \
.select(F.trim(F.col('doi')).alias('doi'),F.col('orcid').alias('orcid'))

In [0]:
orcid_dois.cache().count()

Out[6]: 84040875

In [0]:
works.join(orcid_dois, how='inner', on='doi').count()

Out[7]: 66985160

#### Saving DOIs that are showing up in ORCID but not OpenAlex

In [0]:
orcid_dois.join(works, how='left', on='doi').filter(F.col('paper_id').isNull()).count()

Out[21]: 17065817

In [0]:
17065817*0.001

Out[23]: 17065.817

In [0]:
orcid_dois.join(works, how='left', on='doi').filter(F.col('paper_id').isNull()).sample(0.001) \
.coalesce(1).write.mode('overwrite').parquet(f"{iteration_save_path}doi_missing_in_openalex_sample")

In [0]:
def get_author_string_match_(given, family, potential_names):
    
    skip_list = [" ", "," ,"." ,"-" ,":" ,"/"]
    if given:
        given_processed = [i for i in given.lower() if i not in skip_list]
    else:
        given_processed = []
    
    if family:
        family_processed = [i for i in family.lower() if i not in skip_list]
    else:
        family_processed = []
    full_name = list(set(given_processed + family_processed))
    
    final_name = []
    for pot_name in potential_names:
        pot_name_processed = "".join([i for i in pot_name.lower() if i not in skip_list])
        if family_processed:
            if "".join(family_processed) in pot_name_processed:
                final_name.append(pot_name)
            else:
                pass
        else:
            pass
    
    if len(final_name) > 1:
        final_name = []
#         curr_total = 0
#         for pot_name in final_name:
#             pot_name_processed = [i for i in pot_name.lower() if i not in skip_list]
#             if given_processed:
#                 string_to_test = [x for x in "".join("".join(pot_name_processed).split("".join(family_processed)))]
#                 temp_total = sum(x in given_processed for x in list(set(string_to_test)))
#                 if temp_total > curr_total:
#                     curr_total = temp_total
#                     final_name = [pot_name]
#                 else:
#                     pass
#             else:
#                 pass
        
    if final_name:
        return final_name[0]
    else:
        return "NO_MATCH_POSSIBLE"


get_author_string_match = F.udf(get_author_string_match_, StringType())

In [0]:
def clean_family_name_(fam_name):
    skip_list = [" ", "," ,"." ,"-" ,":" ,"/"]
    new_fam_name = fam_name.replace("\t", "").strip()
    new_fam_name = new_fam_name.replace("-", " ").strip()
    new_fam_name = new_fam_name.title().replace(" ", "")
    new_fam_name = "".join([x for x in new_fam_name if x not in skip_list])
    return new_fam_name

clean_family_name = F.udf(clean_family_name_, StringType())

In [0]:
def turn_string_to_list_(str_array):
    return [x for x in str_array[1:-1].split(", ") if x!='-1']

turn_string_to_list = F.udf(turn_string_to_list_, ArrayType(StringType()))

In [0]:
grouped_aff_data = affiliations.select(F.col('paper_id').alias('work_id'),'original_author').dropDuplicates() \
.filter(~F.col('original_author').isNull()) \
.filter(F.col('original_author')!='') \
.groupby('work_id') \
.agg(F.collect_list(F.col('original_author')).alias('potential_author_strings')) \
.write.mode('overwrite').parquet(f"{iteration_save_path}grouped_aff_data_for_orcid_string_match")

In [0]:
grouped_aff_data = spark.read.parquet(f"{iteration_save_path}grouped_aff_data_for_orcid_string_match") \
.select(F.col('work_id').alias('paper_id'), 'potential_author_strings')
grouped_aff_data.cache().count()

Out[19]: 227519559

In [0]:
works\
.join(orcid_dois, how='inner', on='doi') \
.join(orcid_names, how='inner', on='orcid') \
.join(grouped_aff_data, how='inner', on='paper_id') \
.dropDuplicates(subset=['paper_id','orcid','doi','given_names','family_name']) \
.withColumn('raw_name', get_author_string_match(F.col('given_names'), F.col('family_name'), F.col('potential_author_strings'))) \
.write.mode('overwrite').parquet(f"{iteration_save_path}orcid_base_cluster_new_method_raw")

In [0]:
orcid_parsed_strings = spark.read.parquet(f"{iteration_save_path}orcid_base_cluster_new_method_raw")

In [0]:
orcid_parsed_strings.cache().count()

Out[21]: 42567198

In [0]:
orcid_parsed_strings \
    .filter((F.col('raw_name')=='NO_MATCH_POSSIBLE') | 
            (F.col('given_names').isNull()) |
            (F.col('family_name').isNull())) \
    .count()

Out[29]: 5848465

In [0]:
orcid_parsed_strings \
    .filter((F.col('raw_name')=='NO_MATCH_POSSIBLE') | 
            (F.col('given_names').isNull()) |
            (F.col('family_name').isNull())) \
    .count()

Out[62]: 4094975

In [0]:
orcid_parsed_strings \
    .filter(F.col('raw_name')!='NO_MATCH_POSSIBLE') \
    .filter(~F.col('given_names').isNull()) \
    .filter(~F.col('family_name').isNull()) \
    .withColumn('given_len', F.length(F.col('given_names'))) \
    .withColumn('family_len', F.length(F.col('family_name'))) \
    .withColumn('raw_len', F.length(F.col('raw_name'))) \
    .withColumn('ratio', (F.col('given_len') + F.col('family_len'))/F.col('raw_len')) \
    .withColumn('family_name_clean', clean_family_name(F.col('family_name'))) \
    .withColumn('raw_name_clean', clean_family_name(F.col('raw_name'))) \
    .filter(~((F.col('raw_name').contains(F.col('family_name'))) | 
              (F.col('raw_name').contains(F.col('family_name_clean'))) | 
              (F.col('raw_name_clean').contains(F.col('family_name_clean'))))) \
    .select('paper_id','orcid','raw_name','given_names','family_name') \
    .sample(0.001).show(25)

+----------+-------------------+--------------------+--------------------+-------------------+
|  paper_id|              orcid|            raw_name|         given_names|        family_name|
+----------+-------------------+--------------------+--------------------+-------------------+
|2763356662|0000-0001-9224-5731|Shimaa Abd El-Sal...|              Shimaa|            Elsayed|
|4280612547|0000-0001-8715-2776|Mohamed M. Elshar...|             Mohamed|        El Sharkawy|
|2588316133|0000-0002-2639-6571|         D. Dannheim|               Sarah|               Heim|
|2500435244|0000-0002-1592-4869|              WF Lau|             Wilfred|          W. F. Lau|
|2083095730|0000-0002-8626-1564|Surinder Kumar Mehta|               Ahmad|               Umar|
|2944075536|0000-0002-0655-0981|Fahimeh Zeraat He...|             Fahimeh|       ZeraatHerfeh|
|3126848917|0000-0002-3235-2698|S. Mohammad J. Mi...|Seyed Mohammad Javad|Mirzapour Alehashem|
|3206543893|0000-0001-7079-6954|           Bo-Bo Z

In [0]:
orcid_parsed_strings \
    .filter(F.col('raw_name')!='NO_MATCH_POSSIBLE') \
    .filter(~F.col('given_names').isNull()) \
    .filter(~F.col('family_name').isNull()) \
    .withColumn('given_len', F.length(F.col('given_names'))) \
    .withColumn('family_len', F.length(F.col('family_name'))) \
    .withColumn('raw_len', F.length(F.col('raw_name'))) \
    .withColumn('ratio', (F.col('given_len') + F.col('family_len'))/F.col('raw_len')) \
    .withColumn('family_name_clean', clean_family_name(F.col('family_name'))) \
    .withColumn('raw_name_clean', clean_family_name(F.col('raw_name'))) \
    .filter(((F.col('raw_name').contains(F.col('family_name'))) | 
              (F.col('raw_name').contains(F.col('family_name_clean'))) | 
              (F.col('raw_name_clean').contains(F.col('family_name_clean'))))) \
    .select('paper_id','orcid','raw_name','given_names','family_name') \
    .count()

Out[79]: 38391392

In [0]:
orcid_parsed_strings \
    .filter(F.col('raw_name')!='NO_MATCH_POSSIBLE') \
    .filter(~F.col('given_names').isNull()) \
    .filter(~F.col('family_name').isNull()) \
    .withColumn('family_name_clean', clean_family_name(F.col('family_name'))) \
    .withColumn('raw_name_clean', clean_family_name(F.col('raw_name'))) \
    .filter((F.col('raw_name').contains(F.col('family_name'))) | 
              (F.col('raw_name').contains(F.col('family_name_clean'))) | 
              (F.col('raw_name_clean').contains(F.col('family_name_clean')))) \
    .select('paper_id','orcid','raw_name','given_names','family_name') \
    .write.mode('overwrite').parquet(f"{iteration_save_path}orcid_base_cluster_new_method_data")

In [0]:
all_orcid_pdf_data = spark.read.parquet(f"{iteration_save_path}orcid_base_cluster_new_method_data") \
    .select(F.col('paper_id').alias('work_id'),'orcid',F.col('raw_name').alias('original_author'))
all_orcid_pdf_data.cache().count()

Out[22]: 36667168

In [0]:
all_orcid_pdf_data.filter(F.col('orcid')=='0000-0001-7115-9105').show(60, truncate=False)

+----------+-------------------+---------------+
|work_id   |orcid              |original_author|
+----------+-------------------+---------------+
|1881537177|0000-0001-7115-9105|Yukio Ando     |
|1974274535|0000-0001-7115-9105|Yuki Ando      |
|2017528973|0000-0001-7115-9105|Yoshihiro Ando |
|2043918647|0000-0001-7115-9105|Y. Ando        |
|2286723441|0000-0001-7115-9105|Yukio Ando     |
|2520228726|0000-0001-7115-9105|Yuichi Ando    |
|2549968178|0000-0001-7115-9105|Yoshiaki Ando  |
|2587313190|0000-0001-7115-9105|Yuichi Ando    |
|2606937982|0000-0001-7115-9105|Yukio Ando     |
|2761572844|0000-0001-7115-9105|Yasutoshi Ando |
|1885692834|0000-0001-7115-9105|Yuichi Ando    |
|2017613824|0000-0001-7115-9105|Yukio Ando     |
|2019150792|0000-0001-7115-9105|Yutaka Ando    |
|2032921998|0000-0001-7115-9105|Yuki Ando      |
|2033406345|0000-0001-7115-9105|Yuhko Ando     |
|2067534382|0000-0001-7115-9105|Yuichi Ando    |
|2077481177|0000-0001-7115-9105|Yoichi Ando    |
|2081266975|0000-000

In [0]:
affiliations.select(F.col('paper_id').alias('work_id'),'original_author',
                    F.col('author_sequence_number').alias('seq_no'),'original_affiliation') \
.dropDuplicates(subset=['work_id','original_author']) \
.join(all_orcid_pdf_data, how='inner', on=['work_id','original_author']) \
.join(string_to_institution, how='left', on='original_affiliation') \
.join(concepts, how='left', on='work_id') \
.na.fill("NONE", subset=['institutions']) \
.write.mode('overwrite').parquet(f"{iteration_save_path}orcid_base_cluster_pdf_to_combine")

In [0]:
data_to_check = spark.read.parquet(f"{iteration_save_path}orcid_base_cluster_new_method_clusters")
data_to_check.cache().count()

Out[34]: 2878652

In [0]:
data_to_check.coalesce(1).write.mode('overwrite').parquet(f"{iteration_save_path}orcid_base_cluster_new_method_clusters_single_file")

Can probably get the same training dataset for a given ORCID author by linking ORCID data with OpenAlex, no need to do string matching. Get ORCID, author name, institutions, and DOI from ORCID data, get coauthors and concepts from OpenAlex. Otherwise, I could still do string matching and get institutions/author name from OpenAlex (but slightly harder and less accurate)

In [0]:
orcid_base_clusters = \
affiliations.select(F.col('paper_id').alias('work_id'), 'original_author', F.col('author_sequence_number').alias('seq_no'), 
                    'original_affiliation', F.col('original_orcid').alias('orcid')) \
.filter(F.col('orcid')!='') \
.dropDuplicates(subset=['work_id','original_author','orcid']) \
.join(string_to_institution, how='left', on='original_affiliation') \
.join(concepts, how='left', on='work_id')

In [0]:
orcid_base_clusters.write.mode('overwrite').parquet(f"{iteration_save_path}orcid_raw_base_cluster_data")

In [0]:
orcid_base_clusters = spark.read.parquet(f"{iteration_save_path}orcid_raw_base_cluster_data").dropDuplicates() \
.withColumn('institutions', turn_string_to_list(F.col('institutions'))) \
.select(F.concat_ws("_", F.col('work_id'), F.col('seq_no')).alias('work_id'),'original_affiliation','original_author','orcid', 
        F.explode(F.col('institutions')).alias('institution'), 'concepts') \
.select('work_id','original_affiliation','original_author','orcid','institution', 
        F.explode(F.col('concepts')).alias('concept')) \
.groupby('orcid') \
.agg(F.collect_set(F.col('work_id')).alias('works'), 
     F.collect_set(F.col('original_author')).alias('names'), 
     F.collect_set(F.col('original_affiliation')).alias('aff_strings'), 
     F.collect_set(F.col('institution')).alias('institutions'), 
     F.collect_set(F.col('concept')).alias('concepts'))

In [0]:
orcid_base_clusters.cache().count()

Out[45]: 2017976

In [0]:
orcid_base_clusters.coalesce(1).write.mode('overwrite').parquet(f"{iteration_save_path}orcid_base_cluster_inst_and_concepts")

#### Combining ORCID from pdf and oa

In [0]:
orcid_pdf_to_combine = spark.read.parquet(f"{iteration_save_path}orcid_base_cluster_pdf_to_combine").dropDuplicates()
orcid_pdf_to_combine.cache().count()

Out[7]: 36667168

In [0]:
orcid_oa_to_combine = spark.read.parquet(f"{iteration_save_path}orcid_raw_base_cluster_data").dropDuplicates()
orcid_oa_to_combine.cache().count()

Out[8]: 6373480

In [0]:
orcid_pdf_to_combine.union(orcid_oa_to_combine.select(*orcid_pdf_to_combine.columns)) \
.withColumn('institutions', turn_string_to_list(F.col('institutions'))) \
.join(coauthors, how='left', on='work_id') \
.withColumn('coauthors', remove_current_author_udf(F.col('original_author'),F.col('all_authors'))) \
.select(F.concat_ws("_", F.col('work_id'), F.col('seq_no')).alias('work_id'),'original_affiliation','original_author','orcid', 
        'institutions', 'concepts', 'coauthors') \
.coalesce(20).write.mode('overwrite').parquet(f"{iteration_save_path}orcid_all_data_for_base_clusters")

In [0]:
all_data = spark.read.parquet(f"{iteration_save_path}orcid_all_data_for_base_clusters")

In [0]:
all_data.cache().count()

Out[23]: 43040648

In [0]:
all_data.dropDuplicates(subset=['orcid']).count()

Out[33]: 4490210

In [0]:
all_data \
.coalesce(1).write.mode('overwrite').parquet(f"{iteration_save_path}orcid_all_data_for_base_clusters_single_file")

In [0]:
# .select('work_id','original_affiliation','original_author','orcid', 
#         F.explode(F.col('institutions')).alias('institution'), 'concepts') \
# .select('work_id','original_affiliation','original_author','orcid','institution', 
#         F.explode(F.col('concepts')).alias('concept')) \
# .groupby('orcid') \
# .agg(F.collect_set(F.col('work_id')).alias('works'), 
#      F.collect_set(F.col('original_author')).alias('names'), 
#      F.collect_set(F.col('original_affiliation')).alias('aff_strings'), 
#      F.collect_set(F.col('institution')).alias('institutions'), 
#      F.collect_set(F.col('concept')).alias('concepts'))

#### Getting Clusters From Non-Orcid

* get clusters where author is exactly the same, institution matches, and there are at least 2 of the same coauthors

In [0]:
def create_ind_for_drop_column_(col_list):
    if isinstance(col_list, list):
        if col_list:
            if col_list[0] == -1:
                return 0
            else:
                return 1
        else:
            return 0
    else:
        return 0
    
create_ind_for_drop_column = F.udf(create_ind_for_drop_column_, IntegerType())

In [0]:
affiliations.select(F.col('paper_id').alias('work_id'), F.col('author_sequence_number').alias('seq_no'), 'original_author',
                    'original_orcid','original_affiliation').dropDuplicates() \
.join(concepts.dropDuplicates(subset=['work_id']), how='inner', on='work_id') \
.join(coauthors, how='inner', on='work_id') \
.join(string_to_institution, how='inner', on='original_affiliation') \
.na.fill("NONE", subset=['institutions']) \
.withColumn('institutions', turn_string_to_list(F.col('institutions'))) \
.withColumn('coauthors', remove_current_author_udf(F.col('original_author'),F.col('all_authors'))) \
.withColumn('concepts_ind', create_ind_for_drop_column(F.col('concepts'))) \
.withColumn('coauthors_ind', create_ind_for_drop_column(F.col('coauthors'))) \
.withColumn('institutions_ind', create_ind_for_drop_column(F.col('institutions'))) \
.filter(F.col('concepts_ind')==1) \
.filter(F.col('coauthors_ind')==1) \
.filter(F.col('institutions_ind')==1) \
.filter(F.col('original_author')!='') \
.select(F.concat_ws("_", F.col('work_id'), F.col('seq_no')).alias('work_id'), 'original_author', F.col('original_orcid').alias('orcid'),
        'original_affiliation', 'institutions','coauthors','concepts') \
.write.mode('overwrite').parquet(f"{iteration_save_path}openalex_groupby_data_for_clusters")

In [0]:
spark.read.parquet(f"{iteration_save_path}openalex_groupby_data_for_clusters") \
.select('work_id','original_author','orcid',F.explode(F.col('institutions')).alias('institution'),'coauthors','concepts') \
.select('work_id','original_author','orcid','institution',F.explode(F.col('coauthors')).alias('coauthor'),'concepts') \
.select('work_id','original_author','orcid','institution','coauthor',F.explode(F.col('concepts')).alias('concept')) \
.groupby(['original_author','institution','coauthor','concept']) \
.agg(F.collect_set(F.col('orcid')).alias('orcids'), 
     F.collect_list(F.col('work_id')).alias('work_ids')) \
.withColumn('works_len', F.size(F.col('work_ids'))) \
.filter(F.col('works_len') > 1) \
.write.mode('overwrite').parquet(f"{iteration_save_path}openalex_grouped_for_clusters")

In [0]:
def get_string_list_(norm_list):
    norm_list.sort()
    new_list = "||".join(norm_list)
    return new_list

def get_normal_list_(string_list):
    norm_list = string_list.split("||")
    return norm_list

get_string_list = F.udf(get_string_list_, StringType())
get_normal_list = F.udf(get_normal_list_, ArrayType(StringType()))

In [0]:
w = Window().orderBy(F.lit('A'))

openalex_groupby_cluster_data = spark.read.parquet(f"{iteration_save_path}openalex_grouped_for_clusters") \
.withColumn('string_works', get_string_list(F.col('work_ids'))) \
.select('string_works').dropDuplicates() \
.withColumn('work_ids', get_normal_list(F.col('string_works'))) \
.select('work_ids') \
.withColumn('cluster_1', F.row_number().over(w))
openalex_groupby_cluster_data.cache().count()

Out[10]: 382757306

In [0]:
openalex_groupby_cluster_data \
.write.mode('overwrite').parquet(f"{iteration_save_path}openalex_grouped_for_clusters_1")

In [0]:
w1 = Window().orderBy(F.lit('B'))
openalex_groupby_cluster_data.select(F.explode(F.col('work_ids')).alias('work_id'), 'cluster_1') \
.groupby('work_id') \
.agg(F.collect_set(F.col('cluster_1')).alias('cluster_1')) \
.write.mode('overwrite').parquet(f"{iteration_save_path}openalex_grouped_for_clusters_2")

In [0]:
test_clusters = spark.read.parquet(f"{iteration_save_path}openalex_grouped_for_clusters_2")
test_clusters.cache().count()

Out[4]: 162408549

In [0]:
test_clusters.show(20)

+-------------+--------------------+
|      work_id|           cluster_1|
+-------------+--------------------+
| 3210905390_2|[106775609, 27148...|
| 3210905390_3|[249187943, 14163...|
| 3210905390_4|[278364092, 26901...|
| 3210905390_5|[336640890, 96792...|
| 3210905390_6|[259336455, 26207...|
| 3210905390_9|[165114459, 13230...|
| 3210905450_1|[66252686, 226759...|
| 3210905450_4|         [358857971]|
| 3210905458_2|[20760843, 214615...|
| 3210905458_3|[252476453, 27058...|
| 3210905488_1|[354823536, 37722...|
|3210905488_10|[339235, 15753042...|
|3210905488_13|[291348831, 29587...|
|3210905488_14|[196191183, 59618...|
|3210905488_15|[378420841, 50232...|
|3210905488_16|[147025960, 37324...|
|3210905488_18|[131428658, 21975...|
| 3210905488_2|[284099729, 40684...|
|3210905488_20|[263794454, 21660...|
|3210905488_21|[215276238, 31964...|
+-------------+--------------------+
only showing top 20 rows



In [0]:
test_clusters \
.coalesce(6).write.mode('overwrite').parquet(f"{iteration_save_path}openalex_grouped_for_clusters_2_single")

##### Situations to manipulate for training

1. Name is different
2. Institution is missing
3. Coauthors are missing
4. No concepts (or reduced number of concepts?)

sort by author sequence value?

Check for orcid (join to cluster if orcid is found)
Check for coauthor/institution/concepts (if all null, don't send through disambiguator)

### For creating dataset
1. Go through each sample and create 10-20 copies of dataframe
2. For each sample, transform data to either keep as is or remove some feature to trick model
3. Get mapping of similar names for all orcids
4. Need to create a mapping of orcid -> orcid of pos/neg pairs
5. Join data to orcid mapping using pyspark and randommly select rows in each join
* join sample data on orcid for first ID
* then sample data
* join on second ID to first orcid
* sample that data one more time
* dataset complete (do for both positive and negative)

In [0]:
all_base_data = spark.read.parquet(f"{iteration_save_path}orcid_all_data_for_base_clusters")
all_base_data.cache().count()

Out[4]: 43040648

In [0]:
for i in range(10):
    all_base_data.select('work_id','orcid','original_author','institutions','concepts','coauthors') \
        .coalesce(20).write.mode('append').parquet(f"{iteration_save_path}disambiguator_base_data_copies")

In [0]:
all_base_data = spark.read.parquet(f"{iteration_save_path}disambiguator_base_data_copies")
all_base_data.cache().count()

Out[4]: 430406480

In [0]:
all_base_data.sample(0.001).show(40)

+-------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      work_id|              orcid|     original_author|        institutions|            concepts|           coauthors|
+-------------+-------------------+--------------------+--------------------+--------------------+--------------------+
| 4206655184_4|0000-0002-4580-3849|   Cauane Blumenberg|         [169248161]|[2776867660, 4555...|[Deborah Carvalho...|
| 4206739739_5|0000-0003-4636-9578|          Rudi Fasan|           [5388228]|[21951064, 277660...|[Daniela M. Carmi...|
|4210375849_10|0000-0001-9712-4939|  Karine R. Mayilyan|[99064381, 421013...|[2777910003, 1569...|[Vergine Madelian...|
| 4210674858_2|0000-0002-5112-4498|       Piotr Narloch|         [108403487]|[2780021121, 2781...|    [Łukasz Rosicki]|
| 4211201280_1|0000-0002-6379-0199|     M. Joseph Sirgy|                  []|[75630572, 277603...|                  []|
| 4213246594_4|0000-0002-0071-8229|     

In [0]:
all_base_data \
    .select(F.size(F.col('institutions')).alias('inst_len'), 
            F.size(F.col('concepts')).alias('con_len'), 
            F.size(F.col('coauthors')).alias('co_len')) \
    .select(F.min('inst_len').alias('inst_min'), 
            F.min('con_len').alias('con_min'), 
            F.min('co_len').alias('co_min'), 
            F.mean('inst_len').alias('inst_mean'), 
            F.mean('con_len').alias('con_mean'), 
            F.mean('co_len').alias('co_mean'),
            F.max('inst_len').alias('inst_max'), 
            F.max('con_len').alias('con_max'), 
            F.max('co_len').alias('co_max')) \
    .show(10)

+--------+-------+------+------------------+-----------------+-----------------+--------+-------+------+
|inst_min|con_min|co_min|         inst_mean|         con_mean|          co_mean|inst_max|con_max|co_max|
+--------+-------+------+------------------+-----------------+-----------------+--------+-------+------+
|       0|     -1|     0|0.8706168178508836|9.208078256628479|8.871159049464126|      11|     25|   250|
+--------+-------+------+------------------+-----------------+-----------------+--------+-------+------+



In [0]:
import random

In [0]:
def create_training_sample_for_orcid_(author_name, institutions, concepts, coauthors):
    author_name = " ".join(author_name.split())
    rand_num = random.random()
    if (isinstance(institutions, list) & isinstance(concepts, list) & isinstance(coauthors, list)):
        if rand_num < 0.40:
            return f"{author_name}||||{'|||'.join(institutions)}||||{'|||'.join(concepts)}||||{'|||'.join(coauthors)}"
        elif rand_num < 0.50:
            return f"{author_name}||||||||||||{'|||'.join(coauthors)}"
        elif rand_num < 0.60:
            return f"{author_name}||||||||{'|||'.join(concepts)}||||{'|||'.join(coauthors)}"
        elif rand_num < 0.70:
            return f"{author_name}||||{'|||'.join(institutions)}||||||||"
        elif rand_num < 0.80:
            return f"{author_name}||||{'|||'.join(institutions)}||||||||{'|||'.join(coauthors)}"
        elif rand_num < 0.90:
            return f"{author_name}||||||||{'|||'.join(concepts)}||||"
        else:
            return f"{author_name}||||{'|||'.join(institutions)}||||{'|||'.join(concepts)}||||"
    elif (isinstance(institutions, list) & isinstance(concepts, list) ):
        if rand_num < 0.5:
            return f"{author_name}||||{'|||'.join(institutions)}||||{'|||'.join(concepts)}||||"
        elif rand_num < 0.75:
            return f"{author_name}||||{'|||'.join(institutions)}||||||||"
        else:
            return f"{author_name}||||||||{'|||'.join(concepts)}||||"
    elif (isinstance(institutions, list) & isinstance(coauthors, list)):
        if rand_num < 0.5:
            return f"{author_name}||||{'|||'.join(institutions)}||||||||{'|||'.join(coauthors)}"
        elif rand_num < 0.75:
            return f"{author_name}||||||||||||{'|||'.join(coauthors)}"
        else:
            return f"{author_name}||||{'|||'.join(institutions)}||||||||"
    elif (isinstance(concepts, list) & isinstance(coauthors, list)):
        if rand_num < 0.5:
            return f"{author_name}||||||||{'|||'.join(concepts)}||||{'|||'.join(coauthors)}"
        elif rand_num < 0.75:
            return f"{author_name}||||||||{'|||'.join(concepts)}||||"
        else:
            return f"{author_name}||||||||||||{'|||'.join(coauthors)}"
    elif isinstance(institutions, list):
        return f"{author_name}||||{'|||'.join(institutions)}||||||||"
    elif isinstance(concepts, list):
        return f"{author_name}||||||||{'|||'.join(concepts)}||||"
    elif isinstance(coauthors, list):
        return f"{author_name}||||||||||||{'|||'.join(coauthors)}"
    else:
        return ""

create_training_sample_for_orcid = F.udf(create_training_sample_for_orcid_, StringType())

In [0]:
all_base_data \
    .withColumn('new_training_sample', create_training_sample_for_orcid(F.col('original_author'), 
    F.col('institutions'), F.col('concepts'), F.col('coauthors'))) \
    .select('work_id', 'orcid', 'new_training_sample') \
    .dropDuplicates() \
    .filter(F.col('new_training_sample')!='') \
    .write.mode('overwrite').parquet(f"{iteration_save_path}disambiguator_processed_training_samples")

In [0]:
dis_data_proc = spark.read.parquet(f"{iteration_save_path}disambiguator_processed_training_samples")
dis_data_proc.cache().count()

Out[4]: 190816087

In [0]:
dis_data_proc.sample(0.001).show(10, truncate=False)

+-------------+-------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|work_id      |orcid              |new_training_sample                                                                                                                                                                                                                                                                                                                                                                       |
+-------------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------