In [None]:
import os
import boto3
import pickle
import faiss
import pandas as pd
import numpy as np
import networkx as nx
import unidecode
import unicodedata
import Levenshtein as lev
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 50)
from glob import glob
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from nameparser import HumanName

In [None]:
# !pip install nameparser

#### Disambiguator Model

In [None]:
# Load the pickled model object; it is used further down through
# `disambiguator_model.predict_proba`.
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling, so only load
# a model file that comes from a trusted source.
with open("<local-path-to-model>/Disambiguator.pkl", "rb") as f:
    disambiguator_model = pickle.load(f)

#### Getting data for testing

In [None]:
def create_author_name_list_from_list(name_lists):
    """Collapse several name-slot lists into one list of unique values per slot.

    Args:
        name_lists: a list (or any object exposing ``.tolist()``, e.g. a pandas
            Series) whose items are sequences of slots; every slot is itself a
            sequence. The number of slots is taken from the first item.

    Returns:
        One list per slot holding the de-duplicated *first* element of every
        non-empty slot found at that position. Order inside each list is the
        arbitrary order of a ``set``.
    """
    if not isinstance(name_lists, list):
        name_lists = name_lists.tolist()

    slot_count = len(name_lists[0])

    merged_slots = []
    for slot in range(slot_count):
        first_values = []
        for entry in name_lists:
            values = entry[slot]
            # Empty slots contribute nothing; only the first element is kept.
            if values:
                first_values.append(values[0])
        merged_slots.append(list(set(first_values)))

    return merged_slots

In [None]:
def check_block_vs_block(block_1_names_list, block_2_names_list):
    """Decide whether two name blocks are compatible with each other.

    Both arguments are sequences of slots read in (names, initials) pairs:
    slots 0/1 are compared first, the last two slots second, and then the
    pairs 2/3, 4/5, 6/7, 8/9 and 10/11 in order.
    NOTE(review): the slots look like first / middle (x5) / last name parts;
    confirm against the code that builds ``name_match_list``.

    Args:
        block_1_names_list: slot list of the first block.
        block_2_names_list: slot list of the second block.

    Returns:
        1 when the blocks are compatible, 0 otherwise. If the first-slot
        comparison fails, the result is 1 only when
        ``check_if_last_name_swapped_to_front_creates_match`` accepts the pair.
    """
    b1, b2 = block_1_names_list, block_2_names_list

    first_check, _ = match_block_names(b1[0], b1[1], b2[0], b2[1])
    if not first_check:
        # Last chance: the two-part name may simply be written in swapped order.
        return 1 if check_if_last_name_swapped_to_front_creates_match(b1, b2) else 0

    last_check, _ = match_block_names(b1[-2], b1[-1], b2[-2], b2[-1])
    if not last_check:
        return 0

    # Walk the remaining (names, initials) pairs in order. The previous
    # hand-unrolled version passed slot 8 twice for block 1 instead of slots
    # 8 and 9, so block 1's initials in slot 9 were never compared.
    for names_idx in range(2, 12, 2):
        initials_idx = names_idx + 1
        slot_check, more_to_go = match_block_names(b1[names_idx], b1[initials_idx],
                                                   b2[names_idx], b2[initials_idx])
        if not slot_check:
            return 0
        if not more_to_go:
            # Neither block has anything in this slot: nothing further to compare.
            return 1

    return 1

        
def get_name_from_name_list(name_list):
    """Build a flat list of name parts from a slot list.

    Slot pairs (0,1), (2,3) ... (10,11) are scanned in order; for each pair the
    first element of the even slot is used, falling back to the first element
    of the odd slot. Scanning stops at the first pair where both are empty.
    Afterwards the first element of the second-to-last slot (or, failing that,
    of the last slot) is appended when available.

    Returns:
        List of the collected parts (possibly empty).
    """
    parts = []

    for even_idx in range(0, 12, 2):
        candidates = name_list[even_idx] or name_list[even_idx + 1]
        if not candidates:
            break
        parts.append(candidates[0])

    trailing = name_list[-2] or name_list[-1]
    if trailing:
        parts.append(trailing[0])

    return parts
        
def check_if_last_name_swapped_to_front_creates_match(block_1, block_2):
    """Check whether two two-part names are equal once one of them is rotated.

    Both blocks are flattened with ``get_name_from_name_list``. The check only
    applies when each name has exactly two parts; block 2's last part is moved
    to the front and the space-joined strings are compared.

    Returns:
        True when the rotated name of block 2 equals the name of block 1,
        False otherwise (including when either name does not have two parts).
    """
    name_1 = get_name_from_name_list(block_1)
    if len(name_1) != 2:
        return False

    name_2 = get_name_from_name_list(block_2)
    if len(name_2) != 2:
        return False

    rotated_name_2 = name_2[-1:] + name_2[:-1]
    return " ".join(name_1) == " ".join(rotated_name_2)
        
def check_if_first_name_has_wrong_letter(block_1_names, block_1_initials, block_2_names, block_2_initials):
    """Detect first names that differ by a single substituted character.

    Returns True only when all of the following hold:
      * both blocks have initials and share at least one of them,
      * both blocks have names,
      * some pair of names (one per block) has Levenshtein distance <= 1,
        equal length, and more than 4 characters.
    Otherwise returns False.
    """
    if not (block_1_initials and block_2_initials):
        return False

    if not any(x in block_1_initials for x in block_2_initials):
        return False

    if not (block_1_names and block_2_names):
        return False

    # Equal length plus distance <= 1 means at most one substituted character;
    # names of 4 characters or fewer are never accepted.
    return any(
        lev.distance(name_1, name_2) <= 1
        and len(name_1) == len(name_2)
        and len(name_1) > 4
        for name_1 in block_1_names
        for name_2 in block_2_names
    )
    
def match_block_names(block_1_names, block_1_initials, block_2_names, block_2_initials):
    """Compare one name slot (full names plus initials) of two blocks.

    Returns:
        A ``(matches, more_to_go)`` tuple.
        * Both blocks have names: ``matches`` is True when they share a name.
        * Only one block has names: ``matches`` is True when the block without
          names has no initials either, otherwise when the initials overlap.
        * Neither has names but both have initials: initials must overlap.
        * Otherwise: ``(True, False)``; this is the only case in which
          ``more_to_go`` is False.
    """
    def initials_overlap():
        return any(x in block_1_initials for x in block_2_initials)

    if block_1_names and block_2_names:
        return any(x in block_1_names for x in block_2_names), True

    if block_1_names:
        # Block 2 has no full name here; fall back to its initials if present.
        return (initials_overlap() if block_2_initials else True), True

    if block_2_names:
        # Block 1 has no full name here; fall back to its initials if present.
        return (initials_overlap() if block_1_initials else True), True

    if block_1_initials and block_2_initials:
        return initials_overlap(), True

    return True, False

In [None]:
def create_row_label(work_1, work_2):
    """Return an order-independent label for a pair: the two ids sorted and joined by ``|``."""
    lower, upper = sorted((work_1, work_2))
    return "|".join((lower, upper))

In [None]:
def transform_name_for_search(name):
    """Normalise an author name into a lowercase, space-separated search string.

    Steps: NFKC-normalise, transliterate with ``unidecode``, lowercase, turn
    ``.``, ``,`` and ``|`` into spaces, delete digits and the listed
    punctuation, then collapse runs of whitespace into single spaces.
    """
    # Replaced by a single space. The leading space-to-space entry is carried
    # over unchanged from the original chain and appears to be a no-op.
    to_space = (" ", ".", ",", "|")
    # Deleted outright, in the original replacement order.
    to_drop = (")", "(",
               "-", "&", "$", "#", "@", "%", "0",
               "1", "2", "3", "4", "5", "6", "7",
               "8", "9", "*", "^", "{", "}", "+",
               "=", "_", "~", "`", "[", "]", "\\",
               "<", ">", "?", "/", ";", ":", "\'",
               "\"")

    cleaned = unidecode.unidecode(unicodedata.normalize('NFKC', name)).lower()
    for char in to_space:
        cleaned = cleaned.replace(char, " ")
    for char in to_drop:
        cleaned = cleaned.replace(char, "")

    return " ".join(cleaned.split())

In [None]:
def check_name_for_initials(author_name):
    """Return 1 when the first token of the normalised name is a single character, else 0."""
    first_token = transform_name_for_search(author_name).split(" ")[0]
    return 1 if len(first_token) == 1 else 0

#### Creating separate files for pairs that match ORCIDs vs pairs that don't

In [None]:
# Collect the key of every parquet file stored under the processed testing-pairs prefix.
all_keys = []

s3 = boto3.resource('s3')
# `.objects` lives on a Bucket resource; a plain bucket-name string has no such
# attribute and raised AttributeError here.
my_bucket = s3.Bucket("my-bucket")
for my_bucket_object in my_bucket.objects.filter(Prefix='V3/final_model_data/names_to_check/testing_names_with_data_attached_processed_for_xgb/'):
    if my_bucket_object.key.endswith('parquet'):
        all_keys.append(my_bucket_object.key)

In [None]:
# Feature columns fed to the model. The order of this list fixes the column order of the
# array passed to `disambiguator_model.predict_proba` in the scoring loop.
predictors = ['inst_per','concepts_shorter_per', 'coauthors_shorter_per','exact_match_len','exact_match_spaces','citation_per','citation_work_match']

In [None]:
%%time
for i, s3_key in enumerate(all_keys):
    print(i)
    raw_data = pd.read_parquet(f"s3://my-bucket/{s3_key}") \
        .fillna(0.0)
    
    raw_data['exact_match'] = raw_data.apply(lambda x: 1 if x.original_author_1==x.original_author_2 else 0, axis=1)
    raw_data['name_1_len'] = raw_data['original_author_1'].apply(len)
    raw_data['name_1_spaces'] = raw_data['original_author_1'].apply(lambda x: len(x.split(" ")))
    raw_data['exact_match_len'] = raw_data['exact_match'] * raw_data['name_1_len']
    raw_data['exact_match_spaces'] = raw_data['exact_match'] * raw_data['name_1_spaces']
    raw_data['inst_per'] = raw_data['inst_per'].apply(lambda x: 1 if x > 0 else 0)
    
    raw_data['row_label'] = raw_data.apply(lambda x: create_row_label(x.work_id_1, x.work_id_2), axis=1)

    final_data_to_pred = raw_data.copy()

    orcid_matches = final_data_to_pred[((final_data_to_pred['orcid_1']==final_data_to_pred['orcid_2']) & 
                                       (final_data_to_pred['orcid_1']!=''))].copy()

    final_data_to_pred = final_data_to_pred[~((final_data_to_pred['orcid_1']==final_data_to_pred['orcid_2']) & 
                                       (final_data_to_pred['orcid_1']!=''))].copy()

    orcid_non_matches = final_data_to_pred[((final_data_to_pred['orcid_1']!='') & 
                                           (final_data_to_pred['orcid_2']!=''))].copy()

    final_data_to_pred = final_data_to_pred[~((final_data_to_pred['orcid_1']!='') & 
                                           (final_data_to_pred['orcid_2']!=''))].copy()

    all_to_pred = final_data_to_pred[predictors].copy()
    
    probs = disambiguator_model.predict_proba(np.asarray(all_to_pred))[:,1]
    
    final_data_to_pred['pred_score'] = probs
    
    orcid_matches \
        .to_parquet(f"./test_name_preds/{i}_orcid.parquet")
    
    final_data_to_pred[final_data_to_pred['pred_score'] > 0.2] \
        .to_parquet(f"./test_name_preds/{i}_model.parquet")

In [None]:
# Find the parquet file holding the per-work info to join; the last matching key wins.
s3 = boto3.resource('s3')
# The bucket was an empty string, which is not a valid bucket name and, being a str,
# has no `.objects` attribute. Use a Bucket resource for the same bucket the other
# cells read from.
my_bucket = s3.Bucket("my-bucket")
for my_bucket_object in my_bucket.objects.filter(Prefix='V3/final_model_data/names_to_check/work_id_info_to_join/'):
    if my_bucket_object.key.endswith('parquet'):
        raw_data_file_key = my_bucket_object.key

#### Initial testing of the clustering method based on pairs scored by the model
These functions and methods were eventually redeveloped in a PySpark notebook (in the initial clustering file), but initial testing was done with the code below to validate the method being used.

In [None]:
def get_scored_and_orcid_data(num_files=10, pred_dir="./test_name_preds"):
    """Load the per-file prediction outputs written by the scoring loop.

    Args:
        num_files: how many ``{i}_orcid.parquet`` / ``{i}_model.parquet`` pairs to
            read, for ``i`` in ``range(num_files)``. Defaults to the previously
            hard-coded 10; pass ``len(all_keys)`` to read everything the scoring
            loop produced.
        pred_dir: directory holding the files.

    Returns:
        ``(model_pairs, orcid_pairs)`` DataFrames.
        * ``model_pairs``: row_label, pred_score, work_id_1, work_id_2 plus
          ``pred_type == 'model'``.
        * ``orcid_pairs``: pairs (renamed from row_label), work ids, orcid_1,
          both original author strings, ``score == 1.0`` and
          ``pred_type == 'orcid'``.
        Indexes are not reset. Both frames are empty when ``num_files`` is 0.
    """
    orcid_frames = []
    model_frames = []

    for i in range(num_files):
        orcid_temp = pd.read_parquet(f"{pred_dir}/{i}_orcid.parquet",
                                     columns=['row_label','work_id_1','work_id_2','orcid_1','original_author_1',
                                              'original_author_2']).rename(columns={'row_label':'pairs'})
        orcid_temp['score'] = 1.0
        orcid_temp['pred_type'] = 'orcid'

        model_temp = pd.read_parquet(f"{pred_dir}/{i}_model.parquet",
                                     columns=['row_label', 'pred_score','work_id_1','work_id_2'])
        model_temp['pred_type'] = 'model'

        orcid_frames.append(orcid_temp)
        model_frames.append(model_temp)

    # Concatenate once instead of growing a frame inside the loop (which re-copies
    # all accumulated rows on every iteration).
    old_pairs_df = pd.concat(model_frames, axis=0) if model_frames else pd.DataFrame()
    old_orcid_pairs = pd.concat(orcid_frames, axis=0) if orcid_frames else pd.DataFrame()

    return old_pairs_df, old_orcid_pairs

In [None]:
def get_unique_orcid(list_of_orcids):
    """Return the first truthy ORCID in the input, or ``""`` when there is none.

    Accepts a list or any object exposing ``.tolist()`` (e.g. a pandas Series).
    """
    if not isinstance(list_of_orcids, list):
        list_of_orcids = list_of_orcids.tolist()

    for orcid in list_of_orcids:
        if orcid:
            return orcid

    return ""

In [None]:
def get_unique_works(list_of_works):
    """Flatten a collection of work-id lists into one de-duplicated list.

    Accepts a list or any object exposing ``.tolist()`` (e.g. a pandas Series).
    The order of the result is the arbitrary order of a ``set``.
    """
    if not isinstance(list_of_works, list):
        list_of_works = list_of_works.tolist()

    unique_works = set()
    for works in list_of_works:
        unique_works.update(works)

    return list(unique_works)

In [None]:
def group_latest_pairs(new_df, pairs):
    """Merge clusters according to a cluster-to-new-label mapping and renumber them.

    Args:
        new_df: cluster frame with columns cluster_num, work_id, name_match_list
            and orcid.
        pairs: frame with columns cluster_num and new_cluster_label; clusters
            sharing a label are merged.

    Returns:
        Frame with columns cluster_num, work_id, name_match_list, orcid, where
        cluster_num is the string form of the new positional index. Clusters
        without a string label keep a group of their own (their old cluster_num
        is used as the label).
    """
    cluster_cols = ['cluster_num','work_id','name_match_list','orcid']

    labelled = new_df[cluster_cols].reset_index(drop=True).merge(pairs, how='left', on='cluster_num')
    labelled['cluster_num'] = labelled['cluster_num'].astype('str')

    def _resolve_label(row):
        # Unmatched rows carry a non-string (missing) label after the left merge.
        label = row.new_cluster_label
        return label if isinstance(label, str) else row.cluster_num

    labelled['new_cluster_label'] = labelled.apply(_resolve_label, axis=1)

    aggregators = {"work_id": get_unique_works,
                   "name_match_list": create_author_name_list_from_list,
                   "orcid": get_unique_orcid}

    regrouped = labelled.fillna(labelled['cluster_num'])
    regrouped = regrouped.groupby('new_cluster_label').agg(aggregators)
    regrouped = regrouped.reset_index().drop('new_cluster_label', axis=1)

    regrouped.columns = ['work_id','name_match_list','orcid']
    regrouped['cluster_num'] = regrouped.index.astype('str')

    return regrouped[cluster_cols]

In [None]:
def init_round_of_clustering(cluster_df, thresh=0.4):
    """Run the first clustering round using model-scored pairs above ``thresh``.

    Loads the scored pairs via ``get_scored_and_orcid_data`` (the ORCID pairs it
    also returns are not used here), renames the columns to the names expected
    by ``round_of_clustering`` and runs one round.

    Returns:
        ``(cluster_df, leftovers)`` from that round.
    """
    scored_pairs, _orcid_pairs = get_scored_and_orcid_data()

    column_map = {'row_label':'pairs',
                  'work_id_1':'work_1',
                  'work_id_2':'work_2',
                  'pred_score':'score'}
    above_thresh = scored_pairs[scored_pairs['pred_score']>thresh]
    candidate_pairs = above_thresh.rename(columns=column_map).copy()

    clustered, remaining, _ = round_of_clustering(cluster_df, candidate_pairs)

    return clustered, remaining

In [None]:
def get_unique_pairs(pairs, num_1s, num_2s):
    """Greedily pick pairs so that every cluster number is used at most once.

    The three inputs are parallel sequences and are walked in order; a pair is
    kept only when neither of its cluster numbers was claimed by an earlier
    kept pair.

    Returns:
        DataFrame with columns pairs, new_cluster_label (``"num1|num2"``) and
        cluster_num, exploded so each kept pair yields one row per cluster number.
    """
    claimed = set()
    kept_rows = []

    for pair_label, first_num, second_num in zip(pairs, num_1s, num_2s):
        if first_num not in claimed and second_num not in claimed:
            claimed.update((first_num, second_num))
            kept_rows.append([pair_label, f"{first_num}|{second_num}", [first_num, second_num]])

    kept = pd.DataFrame(kept_rows, columns=['pairs','new_cluster_label','cluster_num'])
    return kept.explode('cluster_num')

In [None]:
def round_of_clustering(cluster_df, leftovers, max_pairs=800000):
    """Run one greedy merge round over the remaining scored pairs.

    Args:
        cluster_df: current clusters with columns cluster_num, work_id (list per
            cluster), name_match_list and orcid.
        leftovers: candidate pairs with columns pairs, score, pred_type, work_1
            and work_2.
        max_pairs: only the ``max_pairs`` highest-scoring surviving pairs are
            considered for merging in this round (previously hard-coded).

    Returns:
        ``(new_cluster_df, new_leftovers, cluster_change)`` where
        ``cluster_change`` is the fractional drop in the number of clusters
        (0 when there was nothing to do).

    A pair survives when its two works sit in different clusters, the clusters'
    ORCIDs do not conflict (equal, or at least one empty) and
    ``check_block_vs_block`` accepts their name lists. Surviving pairs are taken
    in descending score order, each cluster being merged at most once per round.
    """
    old_clust_size = cluster_df.shape[0]

    exploded_cluster_df = cluster_df.explode('work_id').copy()

    print(f"-------Leftovers shape: {leftovers.shape}")

    # Nothing left to merge: hand back copies and report no change.
    if leftovers.shape[0] == 0:
        return cluster_df.copy(), leftovers.copy(), 0

    # Attach the cluster of each side of every pair.
    init_final_pairs_df = leftovers \
        .merge(exploded_cluster_df.rename(columns={'work_id':'work_1','orcid':'orcid_1',
                                                   'name_match_list':'name_match_list_1',
                                                   'cluster_num':'cluster_num_1'}).copy(),
               how='inner', on='work_1') \
        .merge(exploded_cluster_df.rename(columns={'work_id':'work_2','orcid':'orcid_2',
                                                   'name_match_list':'name_match_list_2',
                                                   'cluster_num':'cluster_num_2'}).copy(),
               how='inner', on='work_2').sort_values('score', ascending=False)

    # Pairs already inside one cluster carry no new information.
    init_final_pairs_df = init_final_pairs_df[init_final_pairs_df['cluster_num_1']!=
                                              init_final_pairs_df['cluster_num_2']].copy()

    # Keep pairs whose ORCIDs are equal or where at least one side has none.
    orcid_1 = init_final_pairs_df['orcid_1']
    orcid_2 = init_final_pairs_df['orcid_2']
    orcid_compatible = (((orcid_1!='') & (orcid_2==orcid_1)) |
                        ((orcid_1!='') & (orcid_2=='')) |
                        ((orcid_1=='') & (orcid_2!='')) |
                        ((orcid_1=='') & (orcid_2=='')))
    init_final_pairs_df = init_final_pairs_df[orcid_compatible].copy()

    # Built as a list rather than with `apply(axis=1)`: on an empty frame `apply`
    # can return a DataFrame instead of a Series, which cannot be assigned to a
    # single column.
    init_final_pairs_df['name_check'] = [
        check_block_vs_block(names_1, names_2)
        for names_1, names_2 in zip(init_final_pairs_df['name_match_list_1'],
                                    init_final_pairs_df['name_match_list_2'])
    ]

    init_final_pairs_df = init_final_pairs_df[init_final_pairs_df['name_check']==1] \
        .sort_values('score', ascending=False).copy()

    final_cluster_pairs_df = get_unique_pairs(init_final_pairs_df['pairs'].tolist()[:max_pairs],
                                              init_final_pairs_df['cluster_num_1'].tolist()[:max_pairs],
                                              init_final_pairs_df['cluster_num_2'].tolist()[:max_pairs])
    print(f"-------Number of pairs shape: {final_cluster_pairs_df.shape[0]}")

    new_cluster_df = group_latest_pairs(cluster_df, final_cluster_pairs_df[['new_cluster_label', 'cluster_num']])

    # Pairs that were valid but not used this round are carried to the next one.
    new_leftovers = init_final_pairs_df.merge(final_cluster_pairs_df[['pairs','cluster_num']] \
                                                   .drop_duplicates(subset=['pairs']),
                                              how='left', on='pairs')
    new_leftovers = new_leftovers[new_leftovers['cluster_num'].isnull()] \
        [['pairs','score','pred_type','work_1','work_2']].drop_duplicates(subset=['pairs']).copy()

    new_clust_size = new_cluster_df.shape[0]
    # Guard against an empty input cluster frame.
    cluster_change = (old_clust_size - new_clust_size)/old_clust_size if old_clust_size else 0
    return new_cluster_df, new_leftovers, cluster_change

In [None]:
def get_init_orcid_clusters(raw_data_file):
    """Build the starting clusters: one per ORCID, plus one per ORCID-less row.

    Args:
        raw_data_file: S3 key of the parquet file with one row per work-author,
            carrying at least work_id, orcid and name_match_list.

    Returns:
        DataFrame with columns work_id (list of work ids), orcid,
        name_match_list and cluster_num (string form of the row position).
    """
    # Read from the same bucket the key was listed from and that the other readers
    # (e.g. check_cluster_stats) use; this one call pointed at a different bucket.
    # Only the columns used below are loaded.
    raw_data = pd.read_parquet(f"s3://my-bucket/{raw_data_file}",
                               columns=['work_id','orcid','name_match_list'])
    raw_data['work_id'] = raw_data['work_id'].apply(lambda x: [x])
    # Convert each slot (array-like with `.tolist()`) into a plain list.
    raw_data['name_match_list'] = raw_data['name_match_list'].apply(lambda x: [i.tolist() for i in x])

    has_orcid = raw_data['orcid']!=""

    # Rows sharing an ORCID collapse into one cluster.
    init_orcid_matches = raw_data[has_orcid] \
        .groupby('orcid').agg({"name_match_list": create_author_name_list_from_list,
                               "work_id": get_unique_works}).reset_index()

    # Every row without an ORCID starts as its own cluster.
    init_non_orcid_matches = raw_data[~has_orcid][['work_id','orcid','name_match_list']].copy()

    init_non_orcid_matches['work_id'] = init_non_orcid_matches['work_id'].apply(list)

    matched_orcid_to_join = pd.concat([init_orcid_matches[['work_id','orcid','name_match_list']],
                                       init_non_orcid_matches], axis=0).reset_index(drop=True)

    matched_orcid_to_join['cluster_num'] = matched_orcid_to_join.index.astype('str')

    return matched_orcid_to_join

In [None]:
def perform_clustering(raw_data_file, thresh=0.4, max_rounds=50):
    """Run the whole clustering pipeline and print statistics after each stage.

    Stages: ORCID-based initial clusters, a first round on model pairs scoring
    above ``thresh``, then up to ``max_rounds`` further rounds until the
    fractional cluster change drops below 1e-8.

    Args:
        raw_data_file: S3 key of the raw work-author parquet file.
        thresh: minimum model score for a pair to be considered.
        max_rounds: cap on follow-up rounds (previously hard-coded to 50).

    Returns:
        ``(cluster_df, leftovers)`` after the final round.
    """
    print("Iteration Number: ORCID")
    cluster_df = get_init_orcid_clusters(raw_data_file)
    check_cluster_stats(cluster_df, raw_data_file)
    print("")

    print("Iteration Number: INIT")
    cluster_df, leftovers = init_round_of_clustering(cluster_df, thresh=thresh)
    check_cluster_stats(cluster_df, raw_data_file)
    print("")

    # The follow-up loop is the same one continue_clustering implements; reuse it
    # instead of keeping a second copy here.
    return continue_clustering(raw_data_file, cluster_df, leftovers, max_rounds)

In [None]:
def continue_clustering(raw_data_file, cluster_df, leftovers, rounds):
    """Run up to ``rounds`` further clustering rounds, stopping early on convergence.

    After every round the cluster statistics and the fractional cluster change
    are printed; the loop ends as soon as that change is below 1e-8.

    Returns:
        ``(cluster_df, leftovers)`` after the last executed round.
    """
    for round_idx in range(rounds):
        print(f"Iteration Number: {round_idx}a")

        round_result = round_of_clustering(cluster_df, leftovers)
        cluster_df, leftovers, cluster_change = round_result
        check_cluster_stats(cluster_df, raw_data_file)

        print(f"-------Cluster change percentage: {round(cluster_change, 4)}")

        converged = cluster_change < 0.00000001
        if converged:
            break

        print("")

    return cluster_df, leftovers

In [None]:
def final_cluster_for_name_and_two_metrics(cluster_df, raw_data_file, metric_1, metric_2):
    """Propose cluster merges for clusters sharing an author string and two other values.

    Raw work-author rows are loaded from S3, attached to their clusters and
    aggregated per cluster. For every original author string that occurs in more
    than one cluster, clusters are grouped by (author string, metric_1 value,
    metric_2 value). Groups spanning more than one cluster and carrying fewer
    than two distinct non-empty ORCIDs become merge candidates, from which
    ``get_unique_clusters`` selects a non-overlapping subset.

    Args:
        cluster_df: clusters with at least the columns cluster_num and work_id.
        raw_data_file: S3 key of the raw parquet file.
        metric_1, metric_2: names of list-valued columns of the per-cluster frame
            to explode and group by.
            NOTE(review): presumably two of 'concepts', 'coauthors',
            'institutions'; confirm with callers.

    Returns:
        The DataFrame produced by ``get_unique_clusters`` (columns
        new_cluster_label and cluster_num, one row per cluster number).
    """
    # Concept ids removed from every work's concept list below.
    # NOTE(review): the name suggests these are top-level ("level 0") concepts; confirm.
    level_0_ids = ['17744445','138885662','162324750','144133560','15744967','33923547','71924100','86803240',
               '41008148','127313418','185592680','142362112','144024400','127413603','205649164','95457728',
               '192562407','121332964','39432304']
    
    cluster_df = cluster_df[['cluster_num','work_id']].copy()
    
    raw_data = pd.read_parquet(f"s3://my-bucket/{raw_data_file}")
    
    # Convert each slot (array-like with `.tolist()`) into a plain list.
    raw_data['name_match_list'] = raw_data['name_match_list'].apply(lambda x: [i.tolist() for i in x])
    raw_data['concepts'] = raw_data['concepts'].apply(lambda x: [i for i in list(set(x)) if i not in level_0_ids])
    # Keep only coauthor entries longer than 6 characters.
    raw_data['coauthors'] = raw_data['coauthors'].apply(lambda x: [i for i in x if len(i) > 6])

    matched_works = raw_data.merge(cluster_df.explode('work_id'), how='inner', on='work_id')
    matched_works['author_ind'] = 1

    # One row per cluster; author_ind becomes the number of rows in the cluster.
    grouped_explore = matched_works.groupby('cluster_num').agg({"original_author": set, 
                                                              "name_match_list": create_author_name_list_from_list,
                                                              "orcid": set,
                                                              "work_id": set, 
                                                              "concepts": list,
                                                              "author_ind": np.ma.count, 
                                                              "institutions": list, 
                                                              "coauthors": list}).reset_index()

    # Drop empty ORCIDs, count what is left, then keep the first one (or "").
    grouped_explore['orcid'] = grouped_explore['orcid'].apply(lambda x: [i for i in list(x) if i])
    grouped_explore['orcid_len'] = grouped_explore['orcid'].apply(lambda x: len(x))
    grouped_explore['orcid'] = grouped_explore['orcid'].apply(lambda x: x[0] if x else "")
    grouped_explore['work_id'] = grouped_explore['work_id'].apply(list)
    grouped_explore['original_author_list'] = grouped_explore['original_author'].apply(list)
    # Flatten the per-work lists into one de-duplicated list per cluster.
    grouped_explore['coauthors'] = grouped_explore['coauthors'].apply(lambda x: list(set([i for j in x for i in j])))
    grouped_explore['concepts'] = grouped_explore['concepts'].apply(lambda x: list(set([i for j in x for i in j])))
    grouped_explore['institutions'] = grouped_explore['institutions'].apply(lambda x: list(set([i for j in x 
                                                                                                for i in j])))
    grouped_explore['original_author'] = grouped_explore['original_author'].apply(lambda x: "|".join(x))
    
    print(grouped_explore.shape)
    
    # Author strings that occur in more than one cluster.
    author_names = grouped_explore.explode('original_author_list') \
        .groupby('original_author_list')['author_ind'].count().reset_index()
    author_names_show_twice = author_names[author_names['author_ind']>1][['original_author_list']].copy()
    
    # For those author strings, collect the clusters (and their ORCIDs) that share
    # the same (author string, metric_1 value, metric_2 value) combination.
    grouped_explore_2 = grouped_explore.explode('original_author_list') \
        .merge(author_names_show_twice, how='inner', on='original_author_list') \
        .explode(metric_1).explode(metric_2) \
        .groupby(['original_author_list', metric_1, metric_2]).agg({"cluster_num": set, 
                                                                    "orcid": set})

    print(grouped_explore_2.shape)
    # Keep only combinations shared by at least two clusters.
    grouped_explore_2['cluster_num'] = grouped_explore_2['cluster_num'].apply(list)
    grouped_explore_2['cluster_num_len'] = grouped_explore_2['cluster_num'].apply(len)
    grouped_explore_2 = grouped_explore_2[grouped_explore_2['cluster_num_len']>1].copy()
    
    print(grouped_explore_2.shape)
    
    # Discard combinations that would join two or more distinct non-empty ORCIDs.
    grouped_explore_2['orcid'] = grouped_explore_2['orcid'].apply(lambda x: [i for i in list(x) if i])
    grouped_explore_2['orcid_len'] = grouped_explore_2['orcid'].apply(lambda x: len(x))
    grouped_explore_2 = grouped_explore_2[grouped_explore_2['orcid_len']<2].copy()
    
    print(grouped_explore_2.shape)
    
    # Canonical label per candidate group: its sorted cluster numbers joined by "|".
    grouped_explore_2['cluster_label'] = grouped_explore_2['cluster_num'].apply(lambda x: "|".join([str(i) 
                                                                                            for i in sorted(x)]))
    
    # De-duplicate candidate groups; smaller groups are ordered first.
    latest_pairs = grouped_explore_2.sort_values('cluster_num_len').drop_duplicates('cluster_label') \
        [['cluster_label', 'cluster_num']].copy()
    
    print(latest_pairs.shape)
    final_pairs = get_unique_clusters(latest_pairs)
    return final_pairs

In [None]:
def get_unique_clusters(pair_df):
    """Greedily keep candidate groups whose cluster numbers are all still unclaimed.

    Args:
        pair_df: frame with columns cluster_label and cluster_num (a list of
            cluster numbers per row), processed in row order.

    Returns:
        DataFrame with columns new_cluster_label and cluster_num, exploded so
        each kept group yields one row per cluster number.
    """
    claimed = set()
    kept_groups = []

    labels = pair_df['cluster_label'].tolist()
    groups = pair_df['cluster_num'].tolist()

    for label, nums in zip(labels, groups):
        # A group is dropped as soon as any of its members was used earlier.
        if any(num in claimed for num in nums):
            continue
        claimed.update(nums)
        kept_groups.append([label, nums])

    kept = pd.DataFrame(kept_groups, columns=['new_cluster_label','cluster_num'])
    return kept.explode('cluster_num')

In [None]:
def check_cluster_stats(cluster_df, raw_data_file):
    """Print summary statistics for the current clustering.

    Reads work_id, orcid and name_match_list from the raw parquet file on S3,
    joins the rows to their clusters and prints: the number of clusters with
    more than one row, the number with exactly one row, and the largest number
    of distinct non-empty ORCIDs found in a single cluster.

    Returns:
        None.
    """
    cluster_df = cluster_df[['cluster_num','work_id']].copy()

    raw_data = pd.read_parquet(f"s3://my-bucket/{raw_data_file}",
                               columns=['work_id','orcid','name_match_list'])
    raw_data['name_match_list'] = raw_data['name_match_list'].apply(lambda x: [i.tolist() for i in x])

    cluster_works = cluster_df.explode('work_id')
    matched_works = raw_data.merge(cluster_works, how='inner', on='work_id')
    matched_works['author_ind'] = 1

    aggregators = {"name_match_list": create_author_name_list_from_list,
                   "orcid": set,
                   "work_id": set,
                   "author_ind": np.ma.count}
    per_cluster = matched_works.groupby('cluster_num').agg(aggregators).reset_index()

    # Non-empty ORCIDs per cluster, their count, then the first one (or "").
    per_cluster['orcid'] = per_cluster['orcid'].apply(lambda x: [i for i in list(x) if i])
    per_cluster['orcid_len'] = per_cluster['orcid'].apply(lambda x: len(x))
    per_cluster['orcid'] = per_cluster['orcid'].apply(lambda x: x[0] if x else "")
    per_cluster['work_id'] = per_cluster['work_id'].apply(list)

    multi_row = per_cluster[per_cluster['author_ind']>1]
    single_row = per_cluster[per_cluster['author_ind']==1]

    print(f"-------Number of clusters: {multi_row.shape[0]}")
    print(f"-------Number of single clusters: {single_row.shape[0]}")
    print(f"-------Highest number of ORCIDs in one cluster: {per_cluster['orcid_len'].max()}")

In [None]:
# Find the parquet file holding the per-work info to join; the last matching key wins.
s3 = boto3.resource('s3')
# `.objects` lives on a Bucket resource; a plain bucket-name string has no such
# attribute and raised AttributeError here.
my_bucket = s3.Bucket("my-bucket")
for my_bucket_object in my_bucket.objects.filter(Prefix='V3/final_model_data/names_to_check/work_id_info_to_join/'):
    if my_bucket_object.key.endswith('parquet'):
        raw_data_file_key = my_bucket_object.key

In [None]:
%%time
# Run the full clustering pipeline on the raw-data key found above, considering only
# model pairs whose score exceeds 0.2.
cluster_df, leftovers = perform_clustering(raw_data_file_key, thresh=0.2)

#### Some code to test the clusters that are made

In [None]:
# Inspect the clusters produced by perform_clustering.
# `new_cluster_df` was previously read before ever being assigned at notebook level, so
# this cell failed on a fresh kernel; derive it from `cluster_df` instead.
new_cluster_df = cluster_df[['cluster_num','work_id']].copy()

raw_data = pd.read_parquet(f"s3://my-bucket/{raw_data_file_key}")

raw_data['name_match_list'] = raw_data['name_match_list'].apply(lambda x: [i.tolist() for i in x])
raw_data['concepts'] = raw_data['concepts'].apply(lambda x: list(set(x)))

matched_works = raw_data.merge(new_cluster_df.explode('work_id'), how='inner', on='work_id')
matched_works['author_ind'] = 1

# One row per cluster; author_ind becomes the number of rows in the cluster.
grouped_explore = matched_works.groupby('cluster_num').agg({"original_author": set, 
                                                          "name_match_list": create_author_name_list_from_list,
                                                          "orcid": set,
                                                          "work_id": set, 
                                                          "concepts": list,
                                                          "author_ind": np.ma.count, 
                                                          "institutions": list, 
                                                          "coauthors": list}).reset_index()

grouped_explore['orcid'] = grouped_explore['orcid'].apply(lambda x: [i for i in list(x) if i])
grouped_explore['orcid_len'] = grouped_explore['orcid'].apply(lambda x: len(x))
grouped_explore['work_id'] = grouped_explore['work_id'].apply(list)
grouped_explore['original_author_list'] = grouped_explore['original_author'].apply(list)
# Flatten the per-work lists into one de-duplicated list per cluster.
grouped_explore['coauthors'] = grouped_explore['coauthors'].apply(lambda x: list(set([i for j in x for i in j])))
grouped_explore['concepts'] = grouped_explore['concepts'].apply(lambda x: list(set([i for j in x for i in j])))
grouped_explore['institutions'] = grouped_explore['institutions'].apply(lambda x: list(set([i for j in x for i in j])))
grouped_explore['original_author'] = grouped_explore['original_author'].apply(lambda x: "|".join(x))

# Clusters that still contain a single row.
unmatched_works = grouped_explore[grouped_explore['author_ind']==1].copy()