In [0]:
import pickle
import boto3
import re
import json
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from langdetect import detect
# from unidecode import unidecode
# from nameparser import HumanName



In [0]:
from pyspark.sql import SparkSession
# NOTE(review): `spark` is never created in this notebook; presumably the hosting
# runtime predefines it. Confirm before running this anywhere else.
sc = spark.sparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, StringType, FloatType, ArrayType, DoubleType, StructType, StructField
# SQLContext handle built from the SparkContext above; the cells visible in this
# notebook read and write through `spark` directly.
sqlContext = SQLContext(sc)



In [0]:
base_save_path = "s3://openalex-data-copy/snapshot_2023_02_15/"
iteration_save_path = "s3://author-disambiguation/V3/"

#### Getting some author names in various languages

In [0]:
def get_language_(given, family=None):
    """Guess a language code for an author name, falling back to ``'en'``.

    The non-empty parts of the name are joined with a single space and passed
    to ``detect``.

    NOTE(review): ``detect`` is expected to be ``langdetect.detect``, whose
    import is commented out at the top of this notebook. It must be in scope
    before this function (or the UDF built from it) runs.

    Args:
        given: Given name, or the whole name string; may be ``None`` or empty.
        family: Optional family name; may be ``None`` or empty.

    Returns:
        The code returned by ``detect``. ``'en'`` is returned when there is
        no text to inspect or when detection itself fails.

    Raises:
        NameError: If ``detect`` is not defined. This is deliberately not
            swallowed; otherwise every name would silently be labelled
            ``'en'``.
    """
    name_parts = [str(part) for part in (given, family) if part]
    if not name_parts:
        # Nothing to detect. Previously the literal text "None" was analysed here.
        return 'en'
    all_text = " ".join(name_parts)

    try:
        return detect(all_text)
    except NameError:
        # A missing `detect` is a setup error, not a detection failure.
        raise
    except Exception:
        # Best-effort: any detection failure falls back to English.
        return 'en'

get_language = F.udf(get_language_, StringType())

In [0]:
affiliations = spark.read.parquet(f"{base_save_path}static_affiliations")
affiliations.cache().count()

Out[7]: 634179075

In [0]:
# Keep the distinct author strings that are 2-5 characters long, tag each one
# with a detected language code, and persist the result.
(
    affiliations
    .select('original_author',
            F.length(F.col('original_author')).alias('author_str_len'))
    .filter((F.col('author_str_len') > 1) & (F.col('author_str_len') < 6))
    .dropDuplicates()
    .withColumn('name_lang', get_language(F.col('original_author')))
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}all_processed_data_for_model/extra_character_data")
)

In [0]:
extra_char_data = spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/extra_character_data")
extra_char_data.cache().count()

Out[8]: 5920942

In [0]:
extra_char_data.groupby('name_lang').count().orderBy('count', ascending=False).show(40)

+---------+-------+
|name_lang|  count|
+---------+-------+
|       ko|3891168|
|    zh-cn|1606346|
|    zh-tw| 148705|
|       ja|  69381|
|       de|  29163|
|       en|  17171|
|       sw|  14854|
|       so|  13031|
|       id|  12201|
|       tl|  10270|
|       cy|   9409|
|       vi|   9367|
|       tr|   7988|
|       pl|   6072|
|       fr|   5414|
|       nl|   4886|
|       ca|   4883|
|       pt|   4795|
|       hu|   4755|
|       ro|   4754|
|       fi|   4236|
|       lt|   4080|
|       it|   4007|
|       hr|   3743|
|       et|   3665|
|       af|   3509|
|       es|   3408|
|       sl|   3094|
|       sq|   2314|
|       da|   2173|
|       no|   1983|
|       sv|   1928|
|       sk|   1480|
|       cs|   1376|
|       lv|   1347|
|       ru|    862|
|       bg|    774|
|       ar|    673|
|       uk|    507|
|       fa|    490|
+---------+-------+
only showing top 40 rows



In [0]:
extra_char_data.filter(F.col('name_lang').isin('ko','zh-cn','zh-tw','ja')).sample(0.001).show(50)

+---------------+--------------+---------+
|original_author|author_str_len|name_lang|
+---------------+--------------+---------+
|           颂华|             2|    zh-cn|
|      寛人 山本|             5|       ko|
|         张国锋|             3|    zh-cn|
|           白璐|             2|    zh-cn|
|      淑子 間宮|             5|       ko|
|      柴田 未里|             5|       ko|
|      峰子 刀谷|             5|       ko|
|      智幸 武田|             5|       ko|
|         章于川|             3|    zh-cn|
|      正司 邑田|             5|    zh-cn|
|      優里 藤川|             5|       ko|
|         熊友谊|             3|       ko|
|      善男 小篠|             5|    zh-cn|
|         孙仁义|             3|    zh-cn|
|        類臣 森|             4|       ko|
|         김재진|             3|       ko|
|         정장영|             3|       ko|
|      圭司 伊藤|             5|       ko|
|         申在权|             3|    zh-cn|
|         叶钦军|             3|    zh-cn|
|      裕子 湯口|             5|       ko|
|         罗远扬|             3|    zh-cn|

In [0]:
extra_char_data.filter(F.col('name_lang').isin('ko','zh-cn','zh-tw','ja')).sample(0.1).count()

Out[15]: 5712

In [0]:
extra_char_data.filter(F.col('name_lang').isin('ko','zh-cn','zh-tw','ja'))\
.write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/all_training_data_other_languages")

### Getting all data

In [0]:
# crossref_names_parsed = spark.read.option('header','true').csv(f"{iteration_save_path}parsed-names.csv.gz")

In [0]:
# crossref_names_parsed.write.mode('overwrite').parquet(f"{iteration_save_path}parsed_names")

In [0]:
crossref_names_parsed = spark.read.parquet(f"{iteration_save_path}parsed_names")
crossref_names_parsed.cache().count()

Out[14]: 784810

In [0]:
# crossref_names_no_raw_text = spark.read.csv(f"{iteration_save_path}crossref_name_parts") \
# .select(F.col('_c0').alias('family_name'),F.col('_c1').alias('given_name'),F.col('_c2').alias('work_id'))

In [0]:
# crossref_names_no_raw_text \
# .select('given_name','family_name','work_id') \
# .write.mode('overwrite').parquet(f"{iteration_save_path}crossref_name_parts_parquet")

In [0]:
crossref_names_no_raw_text = spark.read.parquet(f"{iteration_save_path}crossref_name_parts_parquet") \
.withColumn('work_id', F.substring(F.col('work_id'), 2, 15))

crossref_names_no_raw_text.cache().count()

In [0]:
orcid_names = spark.read.parquet(f"{iteration_save_path}orcid_names_data_dump.parquet")

In [0]:
orcid_names.cache().count()

Out[11]: 14845875

In [0]:
# authors = spark.read.parquet(f"{base_save_path}static_authors")
# works = spark.read.parquet(f"{base_save_path}static_works")
# journals = spark.read.parquet(f"{base_save_path}static_journals")
affiliations = spark.read.parquet(f"{base_save_path}static_affiliations")
# institutions = spark.read.parquet(f"{base_save_path}static_institutions")
# citations = spark.read.parquet(f"{base_save_path}static_citations")

In [0]:
affiliations.cache().count()

Out[13]: 634179075

In [0]:
# works.cache().count()

### Already has matching string

In [0]:
crossref_names_parsed.sample(0.01).select('given_name','family_name','raw_name').show(10, truncate=False)

+---------------+-----------+-----------------------------+
|given_name     |family_name|raw_name                     |
+---------------+-----------+-----------------------------+
|V              |Deschamps  |Deschamps, V                 |
|Leon           |Lack       |Leon Lack                    |
|Changli        |Lu         |Changli Lu                   |
|Kalpa          |Kharicha   |Kalpa Kharicha               |
|Shirley Shapiro|Ben David  |Shirley Shapiro Ben David, MD|
|Teodora        |Nikolova   |Nikolova, Teodora            |
|J.A.           |Otter      |J.A. Otter                   |
|Christopher    |Alexander  |Christopher Alexander        |
|T              |HARGREAVE  |T.B. Hargreave               |
|Yan            |Wang       |Yan Wang                     |
+---------------+-----------+-----------------------------+
only showing top 10 rows



In [0]:
crossref_names_parsed \
.write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/crossref_names_parsed")

Can link with ORCID to the current affiliation data

In [0]:
affiliations.sample(0.01).show(5)

+--------+----------+-----------------+----------------------+--------------------+--------------+--------------+------------+
|paper_id| author_id|  original_author|author_sequence_number|original_affiliation|original_orcid|affiliation_id|match_author|
+--------+----------+-----------------+----------------------+--------------------+--------------+--------------+------------+
|  113794|2423819275|Vladik Kreinovich|                     2|Department of Com...|              |     164936912|            |
|  121243|1936008844|    Roger D. Kamm|                     2|                    |              |          null|            |
|  126581|2652807660|   I Schwanzerová|                     3|                    |              |          null|            |
|  199092|2145393475|    Bethan Cowley|                    10|                    |              |          null|            |
|  212904|2123159365|      G. Van Sant|                     1|                    |              |          nul

In [0]:
orcid_names.sample(0.01).show(10)

+-------------------+-------------+------------------+
|              orcid|  given_names|       family_name|
+-------------------+-------------+------------------+
|0000-0001-5308-9000|Sandra Regina|         Albinante|
|0000-0001-5405-7000|        Clara|           Gambart|
|0000-0001-5772-5000|      Tingjun|                Ye|
|0000-0001-5875-2000|     NAOWARA |          AL-ARAFI|
|0000-0001-5932-5000|        Ziqi |              Wang|
|0000-0001-5999-0000|          Yue|             Zhang|
|0000-0001-6439-4000|          Yue|               Liu|
|0000-0001-6416-7000|       Vikash|         Chaurasia|
|0000-0001-6610-3000|           涛|                陈|
|0000-0001-7020-9000|        Mario|Bustamante Alarcón|
+-------------------+-------------+------------------+
only showing top 10 rows



In [0]:
# Pair every ORCID record's given/family name with each distinct raw author
# string that carries the same ORCID in the affiliations table.
matched_with_orcid = (
    orcid_names
    .join(
        affiliations
        .select(F.col('original_orcid').alias('orcid'), 'original_author')
        .dropDuplicates(),
        on='orcid',
        how='inner',
    )
    .select(
        F.col('given_names').alias('given_name'),
        'family_name',
        F.col('original_author').alias('raw_name'),
    )
    .dropDuplicates()
)

In [0]:
matched_with_orcid.cache().count()

Out[26]: 3095877

In [0]:
matched_with_orcid \
.write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/matched_with_orcid")

### Needs matching string created

See if simple string matching can link each parsed name to one of the raw author strings on the same work

In [0]:
def get_author_string_match_(given, family, potential_names):
    """Pick the candidate string sharing the most distinct characters with a name.

    Each string is lower-cased and reduced to its set of characters, ignoring
    space, comma, period, hyphen, colon and slash. A candidate's score is the
    number of its distinct characters that also occur in ``given`` + ``family``.
    The first candidate with the highest non-zero score wins.

    Args:
        given: Given name, or ``None``/empty.
        family: Family name, or ``None``/empty.
        potential_names: Iterable of candidate raw author strings; ``None``
            and ``None``/empty entries are tolerated and ignored.

    Returns:
        The best-scoring candidate exactly as supplied, or the sentinel
        ``"NO_MATCH_POSSIBLE"`` when no candidate shares any character.
    """
    skip_chars = {" ", ",", ".", "-", ":", "/"}

    def _char_set(text):
        # Distinct lower-cased characters of `text`, minus separators.
        if not text:
            return set()
        return {ch for ch in text.lower() if ch not in skip_chars}

    name_chars = _char_set(given) | _char_set(family)

    best_overlap = 0
    best_name = None
    for pot_name in potential_names or []:
        overlap = len(name_chars & _char_set(pot_name))
        # Strict '>' keeps the earliest candidate on ties.
        if overlap > best_overlap:
            best_overlap, best_name = overlap, pot_name

    return best_name if best_name is not None else "NO_MATCH_POSSIBLE"

get_author_string_match = F.udf(get_author_string_match_, StringType())

In [0]:
# One row per work, carrying the list of its distinct, non-empty raw author strings.
grouped_aff_data = (
    affiliations
    .select(F.col('paper_id').alias('work_id'), 'original_author')
    .dropDuplicates()
    .filter(F.col('original_author').isNotNull())
    .filter(F.col('original_author') != '')
    .groupby('work_id')
    .agg(F.collect_list(F.col('original_author')).alias('potential_author_strings'))
)

In [0]:
grouped_aff_data.cache().count()

Out[52]: 227519559

In [0]:
grouped_aff_data.sample(0.01).show(10)

+-------+--------------------------------+
|work_id|        potential_author_strings|
+-------+--------------------------------+
| 317456|            [Kenjiro Ogawa, M...|
|1510044|[曾莉娅, 怀化市艺术馆,湖南,怀...|
|1928305|            [Faraja Teddy Igira]|
|2248676|  [俊治 横澤, 靖士 榎本, 英孝...|
|4861616|        [马志远, 韩春琏, 吉敬合]|
|5252084|            [Corinne Pedroletti]|
|5253716|                 [D.B.J. Peters]|
|6097392|                  [David Kimber]|
|6143899|            [Shubhendu Banerjee]|
|7759235|                   [Joyce Skeet]|
+-------+--------------------------------+
only showing top 10 rows



In [0]:
crossref_names_no_raw_text.sample(0.001).show(10)

+----------+-----------+----------+
|given_name|family_name|   work_id|
+----------+-----------+----------+
|     Peter|      Young| 275493881|
|    Martin|      Crook|2960281211|
|      Ning|         Ma|2921364369|
|   Jack E.|    Fincham|4252733870|
| Thomas R.|    Palfrey|3139109763|
|        S.|     Sahina|4206707983|
|    Rainer|    Geißler|4241954072|
|     Pedro|    Cardoso|3201974630|
| Toshikazu|       Niki|2073738003|
|      John|        Fry| 584940973|
+----------+-----------+----------+
only showing top 10 rows



In [0]:
# Attach each work's candidate author strings to its name parts and let the
# matching UDF choose the closest candidate as `raw_name`.
cross_ref_no_text_with_affs = (
    crossref_names_no_raw_text
    .dropDuplicates()
    .join(grouped_aff_data, on='work_id', how='inner')
    .withColumn(
        'raw_name',
        get_author_string_match(
            F.col('given_name'),
            F.col('family_name'),
            F.col('potential_author_strings'),
        ),
    )
    .select('given_name', 'family_name', 'raw_name')
)

In [0]:
cross_ref_no_text_with_affs.cache().count()

Out[56]: 372567202

In [0]:
cross_ref_no_text_with_affs.sample(0.01).show(20, truncate=False)

+---------------+-----------+-----------------------+
|given_name     |family_name|raw_name               |
+---------------+-----------+-----------------------+
|Fangdong       |Zhou       |Fangdong Zhou          |
|J.             |Nowak      |J. Nowak               |
|Didier         |Dubois     |Didier Dubois          |
|P. A.          |Steerenberg|P. A. Steerenberg      |
|Lawrence       |Schovanec  |Lawrence Schovanec     |
|H.             |Küchmeister|H. Küchmeister         |
|Robert J.      |Boucek     |Robert J. Boucek       |
|Steve          |McNulty    |Steve McNulty          |
|Michael C.     |Fishbein   |Michael C. Fishbein    |
|Gabriella      |Gobbi      |Gabriella Gobbi        |
|G.             |Oppenheim  |G. Oppenheim           |
|CE             |Terrill    |CE Terrill             |
|Hiromi         |Matsubara  |Hiromi Matsubara       |
|W.             |Pimpl      |W. Pimpl               |
|Yoshitoki      |Ishii      |Yoshitoki Ishii        |
|Youngsun       |Sohn       

In [0]:
cross_ref_no_text_with_affs \
.write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/crossref_no_text_matched_affs")

### Take final data and get into dataset for training

In [0]:
crossref_names_parsed = \
    spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/crossref_names_parsed") \
    .select('given_name','family_name','raw_name') \
    .withColumn('data_type', F.lit('crossref_parsed')) \
    .dropDuplicates()
crossref_names_parsed.cache().count()

Out[9]: 705798

In [0]:
crossref_names_parsed.sample(0.01).show(5, truncate=False)

+----------+-----------+-------------------+
|given_name|family_name|raw_name           |
+----------+-----------+-------------------+
|Bruce H.  |McCormick  |McCormick, Bruce H.|
|Zahra     |Kargarpour |Zahra Kargarpour   |
|R.        |Rohini     |R. Rohini          |
|W. W.     |McMillian  |W. W. McMillian    |
|Zhiyuan   |Xu         |Zhiyuan Xu         |
+----------+-----------+-------------------+
only showing top 5 rows



In [0]:
matched_with_orcid = \
    spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/matched_with_orcid") \
    .select('given_name','family_name','raw_name') \
    .withColumn('data_type', F.lit('orcid_matched')) \
    .dropDuplicates()
matched_with_orcid.cache().count()

Out[10]: 3095877

In [0]:
matched_with_orcid.sample(0.01).show(5, truncate=False)

+---------------+-----------+----------------------+
|given_name     |family_name|raw_name              |
+---------------+-----------+----------------------+
|zhongliang     |sun        |Zhong-Liang Sun       |
|null           |null       |Mariel Friberg        |
|Alan Kwok Shing|Chiang     |Alan Kwok Shing Chiang|
|qiuping        |gu         |Qiuping Gu            |
|Rafia          |Afzal      |Rafia Afzal           |
+---------------+-----------+----------------------+
only showing top 5 rows



In [0]:
# Reload the character-overlap matches and keep only the plausible ones:
#   * drop rows where the matcher returned its 'NO_MATCH_POSSIBLE' sentinel;
#   * ratio = (len(given_name) + len(family_name)) / len(raw_name) must satisfy
#     0.4 < ratio <= 1.0, i.e. the name parts cover a fair share of the raw
#     string without being longer than it;
#   * raw_name must contain family_name verbatim (case-sensitive).
# Rows with a null given_name or family_name produce a null ratio and are dropped
# by the ratio filter. Survivors are tagged data_type='crossref_matched' and
# de-duplicated.
# NOTE(review): the 0.4 threshold has no stated rationale in this notebook.
cross_ref_no_text_with_affs = \
    spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/crossref_no_text_matched_affs") \
    .filter(F.col('raw_name')!='NO_MATCH_POSSIBLE') \
    .withColumn('given_len', F.length(F.col('given_name'))) \
    .withColumn('family_len', F.length(F.col('family_name'))) \
    .withColumn('raw_len', F.length(F.col('raw_name'))) \
    .withColumn('ratio', (F.col('given_len') + F.col('family_len'))/F.col('raw_len')) \
    .filter((F.col('ratio') > 0.4) & (F.col('raw_name').contains(F.col('family_name')))) \
    .filter(F.col('ratio') <= 1.0) \
    .select('given_name','family_name','raw_name') \
    .withColumn('data_type', F.lit('crossref_matched')) \
    .dropDuplicates()

# Materialise the filtered matches in the cache and report the row count.
cross_ref_no_text_with_affs.cache().count()

Out[11]: 59903052

In [0]:
cross_ref_no_text_with_affs\
.sample(0.01).show(5, truncate=False)

+----------+-----------+---------------+----------------+
|given_name|family_name|raw_name       |data_type       |
+----------+-----------+---------------+----------------+
|Tomohiro  |Takaki     |Tomohiro Takaki|crossref_matched|
|A.K.      |Amir-Jahed |A.K. Amir-Jahed|crossref_matched|
|M.        |Flaherty   |M. Flaherty    |crossref_matched|
|Victor P. |Pasko      |Victor P. Pasko|crossref_matched|
|D.E.      |Dorfan     |D.E. Dorfan    |crossref_matched|
+----------+-----------+---------------+----------------+
only showing top 5 rows



### Combining all data

In [0]:
def get_language_(given, family=None):
    """Guess a language code for an author name, falling back to ``'en'``.

    This notebook defines ``get_language_`` more than once; this later
    definition is the one in effect for the cells that follow it.

    The non-empty parts of the name are joined with a single space and passed
    to ``detect``.

    NOTE(review): ``detect`` is expected to be ``langdetect.detect``, whose
    import is commented out at the top of this notebook. It must be in scope
    before this function (or the UDF built from it) runs.

    Args:
        given: Given name, or the whole name string; may be ``None`` or empty.
        family: Optional family name; may be ``None`` or empty.

    Returns:
        The code returned by ``detect``. ``'en'`` is returned when there is
        no text to inspect or when detection itself fails.

    Raises:
        NameError: If ``detect`` is not defined. This is deliberately not
            swallowed; otherwise every name would silently be labelled
            ``'en'``.
    """
    name_parts = [str(part) for part in (given, family) if part]
    if not name_parts:
        # Nothing to detect. Previously the literal text "None" was analysed here.
        return 'en'
    all_text = " ".join(name_parts)

    try:
        return detect(all_text)
    except NameError:
        # A missing `detect` is a setup error, not a detection failure.
        raise
    except Exception:
        # Best-effort: any detection failure falls back to English.
        return 'en'

get_language = F.udf(get_language_, StringType())

In [0]:
# crossref_names_parsed \
#     .unionByName(matched_with_orcid) \
#     .unionByName(cross_ref_no_text_with_affs) \
#     .dropDuplicates(subset=['given_name','family_name','raw_name']) \
#     .write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/all_author_data_combined")

In [0]:
# all_author_data \
# .select('given_name') \
# .filter(~F.col('given_name').isNull()) \
# .dropDuplicates() \
# .withColumn('name_lang', get_language(F.col('given_name'))) \
# .write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/given_name_lang")

In [0]:
# all_author_data \
# .select('family_name') \
# .filter(~F.col('family_name').isNull()) \
# .dropDuplicates() \
# .withColumn('name_lang', get_language(F.col('family_name'))) \
# .write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/family_name_lang")

In [0]:
author_family_name_lang = \
    spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/family_name_lang")

author_family_name_lang.cache().count()

Out[4]: 7703454

In [0]:
# Load the combined author-name table, keep rows whose raw_name is non-empty and
# non-null and whose given_name and family_name are both non-null, then attach
# the detected language of the family name via a left join on family_name.
# NOTE(review): the left join keeps the row count unchanged only if
# family_name_lang has at most one row per family_name — confirm.
all_author_data = \
    spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/all_author_data_combined") \
    .filter(F.col('raw_name') != '') \
    .filter(~F.col('raw_name').isNull()) \
    .filter((~F.col('given_name').isNull()) & (~F.col('family_name').isNull())) \
    .join(author_family_name_lang.select('family_name','name_lang'), how='left', on='family_name')

# Materialise the joined table in the cache and report the row count.
all_author_data.cache().count()

Out[5]: 61719291

In [0]:
all_author_data.filter(F.col('name_lang').isNull()).count()

Out[27]: 0

In [0]:
all_author_data.sample(0.00001).show(50, truncate=False)

+----------------+-------------------+------------------------------+----------------+---------+
|family_name     |given_name         |raw_name                      |data_type       |name_lang|
+----------------+-------------------+------------------------------+----------------+---------+
|Alruwaili       |Mohammed K         |Mohammed K Alruwaili          |crossref_matched|sw       |
|Correia         |Jozelia            |Jozélia Maria de Souza Correia|orcid_matched   |pt       |
|Deswal          |Suman              |Suman Deswal                  |crossref_parsed |af       |
|Schmiz          |Karl               |Karl Schmiz                   |crossref_matched|de       |
|Brocklehurst    |Anna               |Anna Brocklehurst             |crossref_matched|sv       |
|Lozano          |Ana Guiomar        |Ana Guiomar Lozano            |crossref_matched|sl       |
|Pontes          |Elenir             |Elenir Pontes                 |crossref_matched|pt       |
|Scott           |J. V.       

In [0]:
# all_author_data.groupby('name_lang') \
# .agg(F.count(F.col('given_name')).alias('count'), 
#      F.first(F.col('family_name')).alias('family_example')).show(60)

In [0]:
all_author_data.filter(F.col('name_lang')=='af').sample(0.1).show(10, truncate=False)

+------------+--------------+-------------------------+----------------+---------+
|family_name |given_name    |raw_name                 |data_type       |name_lang|
+------------+--------------+-------------------------+----------------+---------+
|Ahlewat     |Rajesh        |Rajesh Ahlewat           |crossref_matched|af       |
|AlKafaween  |Mohammad      |Mohammad AlKafaween      |crossref_matched|af       |
|Aledrees    |Adibah        |Adibah Aledrees          |crossref_matched|af       |
|Audigier    |Raymond       |Raymond Audigier         |crossref_matched|af       |
|Audigier    |J.-C.         |J.-C. Audigier           |crossref_matched|af       |
|Audigier    |Jean-Christian|Jean-Christian Audigier  |crossref_matched|af       |
|Audigier    |David S.      |David S. Audigier        |crossref_matched|af       |
|Audigier    |Y             |Yves Audigier            |crossref_matched|af       |
|Audigier    |Émilie        |Émilie Geneviève Audigier|crossref_matched|af       |
|Aye

In [0]:
all_author_data.groupby('data_type').count().show()

+----------------+--------+
|       data_type|   count|
+----------------+--------+
|crossref_matched|58752189|
| crossref_parsed|  506677|
|   orcid_matched| 2460425|
+----------------+--------+



In [0]:
def _given_initials(given):
    """Return the first character of every non-empty space-separated token of ``given``."""
    return [token[0] for token in given.split(" ") if token]


def _mix_up_spaced_script_name(given, family):
    """Randomly keep or reorder a name in a language from the ``work_with_spaces`` list.

    Names whose text splits into more than three space-separated tokens are
    skipped (``None``). Otherwise the text is kept with probability 0.6 and
    reordered with probability 0.4.
    """
    if given and family:
        all_text = f"{given} {family}"
        if len(all_text.split(" ")) > 3:
            return None
        return all_text if random.random() < 0.6 else f"{family} {given}"

    single = given or family
    if not single:
        # Neither part is usable; there is nothing to emit.
        return None
    all_text = f"{single}"
    tokens = all_text.split(" ")
    if len(tokens) > 3:
        return None
    # The rotation needs exactly three tokens. Shorter names are returned
    # unchanged instead of raising IndexError on tokens[2].
    if random.random() < 0.6 or len(tokens) < 3:
        return all_text
    return f"{tokens[2]} {tokens[0]} {tokens[1]}"


def _mix_up_latin_name(given, family):
    """Render a name in one randomly chosen citation-style layout."""
    if not (given and family):
        # Only one part is available, so emit it as-is. These branches used to
        # build the text and then fall through, returning None.
        single = given or family
        return f"{single}" if single else None

    all_text = f"{given} {family}"
    initials = _given_initials(given)
    # The initial-based layouts are only used for 1-5 given-name tokens.
    can_abbreviate = 1 <= len(initials) <= 5

    random_number = random.random()
    if random_number < 0.025:
        return all_text.replace(".", "")
    if random_number < 0.05:
        return all_text.replace(".", " ").replace("  ", " ")
    if random_number < 0.15:
        return f"{family}, {given}".replace(".", "")
    if random_number < 0.25:
        return f"{family} {given}".replace(".", "")
    if random_number < 0.35:
        # Title-cased, spaced initials: "J Smith", "J. P. Smith", ...
        if not can_abbreviate:
            return all_text.title()
        if len(initials) == 1:
            return f"{initials[0]} {family}".title()
        return (" ".join(f"{ch}." for ch in initials) + f" {family}").title()
    if random_number < 0.5:
        # NOTE(review): `unidecode` must be in scope; its import is commented
        # out at the top of this notebook.
        return unidecode(all_text).title()
    if random_number < 0.625:
        return f"{family} {given}"
    if random_number < 0.75:
        return f"{family}, {given}"
    if random_number < 0.85:
        # Run-together initials: "JP Smith".
        return "".join(initials) + f" {family}" if can_abbreviate else all_text
    if random_number < 0.95:
        # Family name first with dotted initials: "Smith J.P."
        if not can_abbreviate:
            return all_text
        return f"{family} " + "".join(f"{ch}." for ch in initials)
    return f"{family}, {given}"


def mix_up_author_name_(given, family, lang):
    """Create a randomly re-formatted variant of an author name.

    For languages in ``work_with_spaces`` the name is either kept or its
    parts are reordered. For every other language one of several layouts is
    chosen at random: periods stripped, "Family, Given", initials in various
    forms, an ``unidecode`` transliteration, and so on.

    Args:
        given: Given name(s); may be ``None`` or empty.
        family: Family name; may be ``None`` or empty.
        lang: Language code attached to the name.

    Returns:
        The re-formatted name. ``None`` is returned when the name is skipped:
        more than three space-separated tokens in a ``work_with_spaces``
        language, or no usable name part at all.
    """
    work_with_spaces = ['zh-cn','ne','ur','ko','zh-tw','th','ta','pa','mr','kn','ja','hi','he','gu','fa','ar']

    if lang in work_with_spaces:
        return _mix_up_spaced_script_name(given, family)
    return _mix_up_latin_name(given, family)

# The wrapped function draws from `random`, so register the UDF as
# non-deterministic; otherwise Spark may assume repeated evaluations of the same
# expression return the same value.
mix_up_author_name = F.udf(mix_up_author_name_, StringType()).asNondeterministic()

In [0]:
# Write five independently randomised augmented copies of the author data.
# Each pass re-runs the random name-mixing UDF and lands in its own
# art_names_<n> folder (n = 1..5).
for art_copy_number in range(1, 6):
    all_author_data \
    .withColumn('new_raw_name', mix_up_author_name(F.col('given_name'), F.col('family_name'), F.col('name_lang'))) \
    .select('given_name','family_name', 'new_raw_name','name_lang') \
    .write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/art_names_{art_copy_number}")

#### Pulling all data together

In [0]:
def get_title_cased_text_(output_name, lang):
    """Title-case ``output_name`` unless ``lang`` is on the leave-as-is list.

    Args:
        output_name: Name string to normalise.
        lang: Language code attached to the name.

    Returns:
        ``output_name`` unchanged when ``lang`` is one of the listed codes,
        otherwise ``output_name.title()``. ``str.title()`` also lower-cases
        interior capitals, e.g. "McCormick" becomes "Mccormick".
    """
    leave_as_is = ('zh-cn', 'ne', 'ur', 'ko', 'zh-tw', 'th', 'ta', 'pa',
                   'mr', 'kn', 'ja', 'hi', 'he', 'gu', 'fa', 'ar')
    return output_name if lang in leave_as_is else output_name.title()
      
get_title_cased_text = F.udf(get_title_cased_text_, StringType())

In [0]:
# Reload the family-name -> language lookup table.
author_family_name_lang = \
    spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/family_name_lang")

# Materialise the lookup in the cache.
author_family_name_lang.cache().count()

# Original (non-augmented) rows: same filters and language join as the earlier
# `all_author_data` cell, with raw_name renamed to new_raw_name so the columns
# line up with the art_names_* tables for unionByName.
og_data = \
    spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/all_author_data_combined") \
    .filter(F.col('raw_name') != '') \
    .filter(~F.col('raw_name').isNull()) \
    .filter((~F.col('given_name').isNull()) & (~F.col('family_name').isNull())) \
    .join(author_family_name_lang.select('family_name','name_lang'), how='left', on='family_name') \
    .select('given_name','family_name',F.col('raw_name').alias('new_raw_name'), 'name_lang')
    
# Materialise the original rows in the cache and report the row count.
og_data.cache().count()

Out[5]: 61719291

In [0]:
art_1 = spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/art_names_1")
art_1.cache().count()

Out[6]: 61719291

In [0]:
art_2 = spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/art_names_2")
art_2.cache().count()

Out[7]: 61719291

In [0]:
art_3 = spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/art_names_3")
art_3.cache().count()

Out[8]: 61719291

In [0]:
art_4 = spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/art_names_4")
art_4.cache().count()

Out[9]: 61719291

In [0]:
art_5 = spark.read.parquet(f"{iteration_save_path}all_processed_data_for_model/art_names_5")
art_5.cache().count()

Out[10]: 61719291

In [0]:
# Build the model training set: stack the original rows with the five augmented
# copies, drop exact duplicate rows and rows whose given_name is null or empty,
# derive output_name as "<given_name> <family_name>" (title-cased by
# get_title_cased_text unless name_lang is on its leave-as-is list), keep only
# (raw_input, output_name), and write the result after coalescing to 100 partitions.
# NOTE(review): rows whose new_raw_name is null are not filtered out here — confirm
# that is intended.
og_data \
.unionByName(art_1) \
.unionByName(art_2) \
.unionByName(art_3) \
.unionByName(art_4) \
.unionByName(art_5) \
.dropDuplicates() \
.filter(~F.col('given_name').isNull()) \
.filter(F.col('given_name')!='') \
.withColumn("output_name", F.concat_ws(' ', F.col('given_name'), F.col('family_name'))) \
.withColumn("output_name", get_title_cased_text(F.col('output_name'), F.col('name_lang'))) \
.select(F.col('new_raw_name').alias('raw_input'), "output_name") \
.coalesce(100).write.mode('overwrite').parquet(f"{iteration_save_path}all_processed_data_for_model/all_training_data")

In [0]:
# Same union / de-duplication / output_name pipeline as the all_training_data
# write, but this variant also keeps the given_name and family_name columns and
# writes to the name_embedding_training_data folder.
og_data \
.unionByName(art_1) \
.unionByName(art_2) \
.unionByName(art_3) \
.unionByName(art_4) \
.unionByName(art_5) \
.dropDuplicates() \
.filter(~F.col('given_name').isNull()) \
.filter(F.col('given_name')!='') \
.withColumn("output_name", F.concat_ws(' ', F.col('given_name'), F.col('family_name'))) \
.withColumn("output_name", get_title_cased_text(F.col('output_name'), F.col('name_lang'))) \
.select(F.col('new_raw_name').alias('raw_input'), "output_name", "given_name", "family_name") \
.coalesce(100).write.mode('overwrite').parquet(f"{iteration_save_path}name_embedding_training_data/given_family_output_names")

### Final Training Data

In [0]:
def get_fake_raw_input_(diff_lang_name):
    """Build a synthetic raw-input string by sometimes reordering a name's parts.

    The name is stripped and split on single spaces; empty pieces produced by
    repeated spaces are discarded so they are not mistaken for name parts.
    One random draw then decides the result:

    * two parts   -- 70%: unchanged, 30%: ``"<2nd> <1st>"``
    * three parts -- 70%: unchanged, 15%: ``"<3rd> <1st> <2nd>"``,
      15%: ``"<2nd> <3rd> <1st>"``
    * any other number of parts -- always unchanged

    "Unchanged" means the stripped input with its interior spacing preserved.

    Args:
        diff_lang_name: Name string; may be ``None`` (e.g. a null column value).

    Returns:
        The stripped or reordered name; ``""`` for ``None`` or blank input.
    """
    if diff_lang_name is None:
        # A null value would otherwise raise AttributeError on .strip().
        return ""

    stripped = diff_lang_name.strip()
    # Drop empty tokens so "A  B" counts as two parts instead of three and a
    # reordered result never carries stray leading/trailing/double spaces.
    parts = [part for part in stripped.split(" ") if part]

    if len(parts) == 2:
        if random.random() < 0.7:
            return stripped
        return f"{parts[1]} {parts[0]}"

    if len(parts) == 3:
        rand_float = random.random()
        if rand_float < 0.7:
            return stripped
        if rand_float < 0.85:
            return f"{parts[2]} {parts[0]} {parts[1]}"
        return f"{parts[1]} {parts[2]} {parts[0]}"

    # Blank input, single tokens, and names with four or more parts pass through.
    return stripped
    
# The wrapped function draws from random.random(), so flag the UDF as
# non-deterministic: Spark treats UDFs as deterministic by default and may
# otherwise eliminate or duplicate invocations during query optimization.
get_fake_raw_input = F.udf(get_fake_raw_input_, StringType()).asNondeterministic()

In [0]:
# Reload the all_training_data parquet output, mark it cached, and run
# count() so the cell displays the number of rows.
all_train_data = spark.read.parquet(
    f"{iteration_save_path}all_processed_data_for_model/all_training_data"
).cache()
all_train_data.count()

Out[33]: 222619352

In [0]:
# Take original_author from the other-languages dataset as the target
# output_name, derive raw_input from it with the get_fake_raw_input UDF,
# and drop rows whose raw_input is the empty string.
all_train_data_diff_langs = (
    spark.read
    .parquet(f"{iteration_save_path}all_processed_data_for_model/all_training_data_other_languages")
    .select(F.col('original_author').alias('output_name'))
    .withColumn('raw_input', get_fake_raw_input(F.col('output_name')))
    .filter(F.col('raw_input') != "")
    .cache()
)

# count() is the cell output.
all_train_data_diff_langs.count()

Out[35]: 5715600

In [0]:
# Number of rows left when only one row per output_name value is kept.
_one_row_per_output_name = all_train_data.dropDuplicates(['output_name'])
_one_row_per_output_name.count()

Out[5]: 49093812

In [0]:
# Eyeball a small random sample (unseeded, so it differs between runs);
# prints up to 50 rows without truncating the strings.
_spot_check = all_train_data.sample(fraction=0.00001)
_spot_check.show(n=50, truncate=False)

+----------------------------+------------------------------+
|raw_input                   |output_name                   |
+----------------------------+------------------------------+
|A. Rayment                  |A. Rayment                    |
|Muthusami Kumaran           |Muthusami Kumaran             |
|D B Holland                 |D B Holland                   |
|Rossimiriam P.F. Gil        |Rossimiriam P.F. Gil          |
|Norazira, M A               |M. A. Norazira                |
|Medina A.                   |Angélica Medina               |
|Pyry Matikainen             |Pyry Matikainen               |
|R Gubaydullin               |R.R. Gubaydullin              |
|Anna Asgharian              |Anna Asgharian                |
|Salvador Oton-Tortosa       |Salvador Oton Tortosa         |
|Dharamsi N.                 |Nafisa Dharamsi               |
|Amos S.                     |Sitenda Amos                  |
|Jones-Johnson, Gloria       |Gloria Jones-Johnson          |
|Olga I.

In [0]:
# union() aligns columns by position, so the other-language rows are first
# selected in all_train_data's column order. The combined frame is then split
# with weights 0.9995 / 0.0005 using a fixed seed.
_diff_lang_rows = all_train_data_diff_langs.select(*all_train_data.columns)
_all_rows = all_train_data.union(_diff_lang_rows)
train, val = _all_rows.randomSplit([0.9995, 0.0005], seed=0)

In [0]:
# Write the train split as parquet (coalesced to 100 partitions), replacing
# any existing output at that path.
(
    train
    .coalesce(100)
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}all_processed_data_for_model_diff_langs/train")
)

In [0]:
# Write the validation split as parquet (coalesced to 5 partitions),
# replacing any existing output at that path.
(
    val
    .coalesce(5)
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}all_processed_data_for_model_diff_langs/val")
)

In [0]:
# Back-of-the-envelope check: 226M rows times 0.0005
# (presumably the validation weight used in the split -- confirm).
226_000_000 * 0.0005

Out[10]: 113000.0

### Randomize data

In [0]:
# Reload the written train split, mark it cached, and run count() so the
# cell displays the number of rows.
train = spark.read.parquet(
    f"{iteration_save_path}all_processed_data_for_model_diff_langs/train"
).cache()
train.count()

Out[39]: 228220506

In [0]:
# Reload the written validation split, mark it cached, and run count() so
# the cell displays the number of rows.
val = spark.read.parquet(
    f"{iteration_save_path}all_processed_data_for_model_diff_langs/val"
).cache()
val.count()

Out[40]: 114446

In [0]:
# Shuffle rows by sorting on a random column, then write the result
# (coalesced to 100 partitions) to the *_random_diff_langs/train path.
_shuffled_train = train.orderBy(F.rand())
(
    _shuffled_train
    .coalesce(100)
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}all_processed_data_for_model_random_diff_langs/train")
)

In [0]:
# Shuffle rows by sorting on a random column, then write the result
# (coalesced to 5 partitions) to the *_random_diff_langs/validation path.
_shuffled_val = val.orderBy(F.rand())
(
    _shuffled_val
    .coalesce(5)
    .write.mode('overwrite')
    .parquet(f"{iteration_save_path}all_processed_data_for_model_random_diff_langs/validation")
)

In [0]:
# Rebind `train` to the shuffled copy, mark it cached, and run count() so
# the cell displays the number of rows.
train = spark.read.parquet(
    f"{iteration_save_path}all_processed_data_for_model_random_diff_langs/train"
).cache()
train.count()

Out[43]: 228220506

In [0]:
# Print the first 50 rows (long strings are truncated by show()'s default).
train.show(n=50)

+--------------------+--------------------+
|           raw_input|         output_name|
+--------------------+--------------------+
|      Pasha Hasan G.|      Hasan G. Pasha|
|     M. J. De Castro|  María J. De Castro|
|            戸田昇三|            戸田昇三|
|          Didie E.R.|Elizabeth Rose Didie|
|           Avdeev, I|            I Avdeev|
|       Ting‐yue Zhou|       Ting‐Yue Zhou|
|       Borunova A.B.|       A.B. Borunova|
|   Hassoun Mahmoud H|  Mahmoud H. Hassoun|
|      H. Eneriz Imaz|           H. Eneriz|
|     Garber Gwynneth|     Gwynneth Garber|
|         Lam Denisse|         Denisse Lam|
|    Imerman Paula M.|    Paula M. Imerman|
|            Li Y.-X.|            Y.-X. Li|
|              Jong Y|              Jong Y|
|Dr. Vijaya Kumar ...|Dr. Vijaya Kumar ...|
|     Martins José D.|     José D. Martins|
|         H SILBERMAN|    Howard Silberman|
|            E Schell|         Eric Schell|
|              B Yang|      Byoungeun Yang|
|   P Rivera Ceballos|Paulina Rivera Ce.

#### Things to look at

In [0]:
# List the author-name strings recorded for one specific ORCID.
_one_orcid = affiliations.filter(F.col('original_orcid') == '0000-0001-9695-4543')
_one_orcid.select('original_orcid', 'original_author').show(25, truncate=False)

+-------------------+---------------+
|original_orcid     |original_author|
+-------------------+---------------+
|0000-0001-9695-4543|Valeriy Lakhno |
|0000-0001-9695-4543|Valerii Lakhno |
|0000-0001-9695-4543|Valery Lakhno  |
|0000-0001-9695-4543|V. Lakhno      |
|0000-0001-9695-4543|V. Lakhno      |
|0000-0001-9695-4543|V. A. Lakhno   |
|0000-0001-9695-4543|Valerii Lakhno |
|0000-0001-9695-4543|Valery Lakhno  |
|0000-0001-9695-4543|Lakhno Valeriy |
+-------------------+---------------+



In [0]:
# For each non-empty ORCID, collect the set of distinct author-name strings
# seen with it, then show (up to 25) ORCIDs with more than five distinct names.
_names_per_orcid = (
    affiliations
    .filter(F.col('original_orcid') != '')
    .groupby('original_orcid')
    .agg(F.collect_set(F.col('original_author')).alias('author_names'))
)
(
    _names_per_orcid
    .withColumn('author_names_len', F.size(F.col('author_names')))
    .filter(F.col('author_names_len') > 5)
    .show(25, truncate=False)
)

+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+
|original_orcid     |author_names                                                                                                                                                                                           |author_names_len|
+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+
|0000-0001-6206-2133|[S. A. Melchenko, Yu.M. Zabrodskaya, Yu. M. Zabrodskaya, J. M. Zabrodskaya, Yu M Zabrodskaya, Yu. M. Zabrodskaia]                                                                                      |6               |
|0000-0001-6268-6552|[José Padrón, José M. P