In [4]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 200)
# pd.set_option('display.max_colwidth', None)
import json
import os
import time
import random
import boto3
import csv
import glob
import sys
import networkx as nx
from unidecode import unidecode
from networkx.algorithms.components.connected import connected_components
from itertools import combinations_with_replacement, combinations
from nameparser import HumanName
from datetime import datetime
from functools import reduce

In [2]:
# !pip install faiss-cpu
# !pip install h5py==2.8.0 (https://stackoverflow.com/questions/39927206/yum-install-libhdf5-dev-on-amazon-linux)
# !pip install ujson
# !pip install nameparser
# !pip install swifter
# !pip install duckdb
# !pip install gputil
# !pip install sentence-transformers
# !pip install networkx
# !pip install unidecode

In [5]:
s3_client = boto3.client("s3")
bucket_name = "author-disambiguation"
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix="V1/vectorized_data/",
                              PaginationConfig={"PageSize": 50})

data_filenames = []

for page in response:
    files = page.get("Contents")
    for file in files:
        if file['Key'].endswith(".json"):
            data_filenames.append(file['Key'])

In [6]:
s3_client = boto3.client("s3")
bucket_name = "author-disambiguation"
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix="V1/vectorized_data/",
                              PaginationConfig={"PageSize": 50})

cluster_filenames = []

for page in response:
    files = page.get("Contents")
    for file in files:
        if "cluster_file" in file['Key']:
            if '_466_' not in file['Key']:
                cluster_filenames.append(file['Key'])

In [7]:
len(cluster_filenames)

501

In [8]:
len(data_filenames)

501

In [9]:
cluster_filenames.sort()

In [10]:
data_filenames.sort()

In [11]:
filenames = [[x,y] for x,y in zip(data_filenames, cluster_filenames)]

In [10]:
mapping = pd.read_parquet("partition_id_node_mapping.parquet").sort_values('partition_id')

In [11]:
mapping_dict = {x:y for x,y in zip(mapping['partition_id'].tolist(), 
                                   mapping['node'].tolist())}

In [14]:
import pandas as pd

In [15]:
orcid_df = pd.read_parquet("s3://author-disambiguation/V1/paperid_authorid_orcid_mapping/part-00000-tid-1253535575090041537-acbc1baa-04c5-4220-96c6-46c1ea79823e-127-1-c000.snappy.parquet")

In [16]:
orcid_df['paper_author_id'] = orcid_df.apply(lambda x: f"W{x.paper_id}_{x.author_sequence_number}", axis=1)

In [17]:
orcid_df.sample(5)

Unnamed: 0,paper_id,author_id,author_sequence_number,original_orcid,paper_author_id
353982,4281557255,4282288709,2,0000-0002-7586-786X,W4281557255_2
699544,4225270896,2776362133,1,0000-0002-8388-8185,W4225270896_1
4285574,4288046151,2682371531,3,0000-0003-0902-2858,W4288046151_3
3012923,4285805764,4285864727,2,0000-0003-3837-5396,W4285805764_2
656695,4206450394,2134008631,4,0000-0003-0151-9990,W4206450394_4


In [18]:
orcid = orcid_df[['paper_author_id','original_orcid']].drop_duplicates().copy()
orcid.columns = ['data_id','orcid']

In [20]:
orcid.to_parquet("paperid_orcid_mapping.parquet")

In [16]:
# random.shuffle(filenames)

In [17]:
def replace_empty_lists_with_None(col_list):
    if isinstance(col_list, list):
        if col_list:
            return col_list
        else:
            return None
    else:
        return None

In [18]:
def replace_empty_affiliations(col_list):
    if isinstance(col_list, list):
        if len(col_list) > 0:
            if col_list[0] == "":
                return None
            else:
                return " ".join(col_list)
        else:
            return None
    else:
        return None

In [19]:
def get_block_id(name):
    person = HumanName(name)
    last_name = person.last
    first_name = person.first
    if (len(first_name) < 1) & (len(last_name) < 1):
        return name.lower()
    elif len(first_name) < 1:
        return last_name
    elif len(last_name) < 1:
        return name.lower()
    else:
        initials = "%s_%s" % (first_name[0], last_name)
        return initials.lower()

In [20]:
def check_latin_name(text):
    try:        
        str(text).encode('latin-1')
        return True
    except:
        return False

In [21]:
def human_name(name_text):
    name_text = name_text.strip().replace(".", " ").replace("-", " ").replace("  ", " ").replace("  ", " ")\
                .replace("  ", " ")
    person = HumanName(name_text)
    first_name = "".join([x for x in person.first if x not in [".", "-"]])
    last_name = person.last
    middle_name_1 = person.middle.strip()
    if len(middle_name_1.split(" ")) > 1:
        middle_name_2 = " ".join(middle_name_1.split(" ")[1:]).strip()
        middle_name_1 = middle_name_1.split(" ")[0].strip()
    else:
        middle_name_2 = ""
        
    middle_name_1 = "".join([x for x in middle_name_1 if x not in [".", "-"]])
    middle_name_2 = "".join([x for x in middle_name_2 if x not in [".", "-"]])
        
    if (len(first_name) == 3) and (first_name.isupper()) and (not middle_name_2 and not middle_name_1):
        middle_name_1 = first_name[1]
        middle_name_2 = first_name[2]
        first_name = first_name[0]
    elif (len(first_name) == 2) and (first_name.isupper()) and (not middle_name_1):
        middle_name_1 = first_name[1]
        first_name = first_name[0]
        
    return [unidecode(first_name), unidecode(middle_name_1), unidecode(middle_name_2), unidecode(last_name)]

In [22]:
def author_names_list(list_of_author_names):
    if not isinstance(list_of_author_names, list):
        list_of_author_names = list_of_author_names.tolist()
    return list(set(list_of_author_names))

In [23]:
def create_author_name_match_from_matches(name_matches):
    first_name = []
    first_init = []
    m1_name = []
    m1_init = []
    m2_name = []
    m2_init = []
    
    for name_match in name_matches:
        if name_match[0]:
            first_name.append(name_match[0][0])
        if name_match[1]:
            first_init.append(name_match[1][0])
        if name_match[2]:
            m1_name.append(name_match[2][0])
        if name_match[3]:
            m1_init.append(name_match[3][0])
        if name_match[4]:
            m2_name.append(name_match[4][0])
        if name_match[5]:
            m2_init.append(name_match[5][0])
    return [list(set(first_name)), list(set(first_init)), list(set(m1_name)), 
            list(set(m1_init)), list(set(m2_name)), list(set(m2_init))]

In [24]:
def match_block_names(block_1_names, block_1_initials, block_2_names, block_2_initials):
    if block_1_names and block_2_names:
        if block_1_names == block_2_names:
            return True
        else:
            return False
    elif block_1_names and not block_2_names:
        if block_2_initials:
            if block_1_initials == block_2_initials:
                return True
            else:
                return False
        else:
            return True
    elif not block_1_names and block_2_names:
        if block_1_initials:
            if block_1_initials == block_2_initials:
                return True
            else:
                return False
        else:
            return True
    elif block_1_initials and block_2_initials:
        if block_1_initials == block_2_initials:
            return True
        else:
            return False
    else:
        return True
    

####### ALL ABOVE IS DONE (except for variables and paths)

In [25]:
def create_author_name_match(author_names):
    
    if not isinstance(author_names, list):
        if isinstance(author_names, str):
            author_names = [author_names]
        else:
            author_names = author_names.tolist()
        
    first_names = []
    first_initials = []
    middle_1_names = []
    middle_1_initials = []
    middle_2_names = []
    middle_2_initials = []
    
    for author_name in author_names:
        if not author_name:
            pass
        else:
            if (check_latin_name(author_name)) and (len(author_name.split(" "))>1): 
                name = human_name(author_name)
                # get all of the different versions of the name here

                if name[0] and name[1] and name[2] and name[3]:
                    # first name
                    if len(name[0]) > 1:
                        first_names.append(str(name[0]).lower())
                        first_initials.append(str(name[0])[0].lower())
                    else:
                        first_initials.append(str(name[0]).lower())

                    # middle 1 names
                    if len(name[1]) > 1:
                        middle_1_names.append(str(name[1]).lower())
                        middle_1_initials.append(str(name[1])[0].lower())
                    else:
                        middle_1_initials.append(str(name[1]).lower())

                    # middle 2 names
                    if len(name[2]) > 1:
                        middle_2_names.append(str(name[2]).lower())
                        middle_2_initials.append(str(name[2])[0].lower())
                    else:
                        middle_2_initials.append(str(name[2]).lower())

                elif name[0] and name[1] and name[3]:
                    # first name
                    if len(name[0]) > 1:
                        first_names.append(str(name[0]).lower())
                        first_initials.append(str(name[0])[0].lower())
                    else:
                        first_initials.append(str(name[0]).lower())

                    # middle 1 names
                    if len(name[1]) > 1:
                        middle_1_names.append(str(name[1]).lower())
                        middle_1_initials.append(str(name[1])[0].lower())
                    else:
                        middle_1_initials.append(str(name[1]).lower())

                elif name[0] and name[3]:
                    # first name
                    if len(name[0]) > 1:
                        first_names.append(str(name[0]).lower())
                        first_initials.append(str(name[0])[0].lower())
                    else:
                        first_initials.append(str(name[0]).lower())
                else:
                    pass

    return [list(set(first_names)), list(set(first_initials)), list(set(middle_1_names)), 
            list(set(middle_1_initials)), list(set(middle_2_names)), list(set(middle_2_initials))]

In [26]:
def check_block_vs_block(block_1_names_list, block_2_names_list):
    
    # check first names
    first_check = match_block_names(block_1_names_list[0], block_1_names_list[1], block_2_names_list[0], 
                                    block_2_names_list[1])
    
    if first_check:

        middle_1_check = match_block_names(block_1_names_list[2], block_1_names_list[3], block_2_names_list[2], 
                                           block_2_names_list[3])

        if middle_1_check:
            # check middle 2 names
            middle_2_check = match_block_names(block_1_names_list[4], block_1_names_list[5], block_2_names_list[4], 
                                               block_2_names_list[5])

            if middle_2_check:
                return True
            else:
                return False
        else:
            return False
    else:
        return False
    

####### ALL ABOVE IS DONE (except for variables and paths)

In [27]:
def create_name_match_string(name_match_list):
    first_name = name_match_list[0][0] if name_match_list[0] else ""
    first_init = name_match_list[1][0] if name_match_list[1] else ""
    m1_name = name_match_list[2][0] if name_match_list[2] else ""
    m1_init = name_match_list[3][0] if name_match_list[3] else ""
    m2_name = name_match_list[4][0] if name_match_list[4] else ""
    m2_init = name_match_list[5][0] if name_match_list[5] else ""
    
    return f"{first_name}||{first_init}||{m1_name}||{m1_init}||{m2_name}||{m2_init}"

In [28]:
def compile_cluster_coauthors(list_of_coauthors):
    if not isinstance(list_of_coauthors, list):
        try:
            list_of_coauthors = list_of_coauthors.tolist()
        except:
            return []
    
    if list_of_coauthors:
        new_list_of_coauthors = [x for x in list_of_coauthors if x]
        if new_list_of_coauthors:
            if isinstance(new_list_of_coauthors[0], list):
                final_list = list(set([x.lower().replace('.', '') for y in new_list_of_coauthors for x in y]))
                final_list = [x for x in final_list if len(x) > 5]
                return final_list
            else:
                final_list = list(set([x.lower().replace('.', '') for x in new_list_of_coauthors]))
                final_list = [x for x in final_list if len(x) > 5]
                return final_list
        else:
            return []
    else:
        return []

In [29]:
def name_list_len_check(name_list):
    if ((len(name_list[0]) <=1) & (len(name_list[1]) <=1) & (len(name_list[2]) <=1) & 
        (len(name_list[3]) <=1) & (len(name_list[4]) <=1) & (len(name_list[5]) <=1)):
        return True
    else:
        return False

In [30]:
def convert_coauthors_to_block(block_id, coauthors):
    return [f"{block_id}_{coauthor}" for coauthor in coauthors]

In [31]:
def convert_name_to_block(block_id, names):
    if isinstance(names, list):
        return [f"{block_id}_{one_name}" if one_name else f"{block_id}_BLANKBLANK" 
                for one_name in names]
    else:
        return [f"{block_id}_{one_name}" if one_name else f"{block_id}_BLANKBLANK" 
                for one_name in [names]]

In [32]:
def graph_to_clusters_init(new_df):
    df = new_df.reset_index(drop=True).copy()
    df['KEY1'] = df.index
    df['KEY2'] = df.index
    
    df = df[['KEY1','KEY2','block_id','author_name_match',
             'temp_cluster_id','coauthors']].explode('coauthors').explode('author_name_match').copy()
    
    # coauthors
    G1=nx.from_pandas_edgelist(df, 'coauthors', 'KEY1')
    l1=list(nx.connected_components(G1))
    L1=[dict.fromkeys(y,x) for x, y in enumerate(l1)]
    d1={k: v for d1 in L1 for k, v in d1.items()}
    
    # first names
    G2=nx.from_pandas_edgelist(df, 'author_name_match', 'KEY2')
    l2=list(nx.connected_components(G2))
    L2=[dict.fromkeys(y,x) for x, y in enumerate(l2)]
    d2={k: v for d2 in L2 for k, v in d2.items()}
    
    s=df.groupby(['block_id',
                  df.KEY2.map(d2),
                  df.KEY1.map(d1)]).agg({"coauthors": set, 
                                         "temp_cluster_id": set}).reset_index()
    
    final_df = s[['block_id','temp_cluster_id','KEY1','KEY2']].explode('temp_cluster_id') \
        .reset_index(drop=True).copy()
    
    final_df['new_cluster_id'] = final_df.apply(lambda x: f"{x.block_id}_{x.KEY1}_{x.KEY2}", axis=1)
    return final_df

In [33]:
def get_final_clusters_latin_blocks(full_df):
    df = full_df.copy()
    
    # grouping together based on disambert
    grouped_df = df[['final_cluster_id','author_full_text','block_id']] \
        .groupby(['block_id','final_cluster_id']) \
        ['author_full_text'].apply(create_author_name_match).reset_index()
    
    grouped_df['len_check'] = grouped_df['author_full_text'].apply(name_list_len_check)
    
    # splitting up the data based on "good" clusters and "bad" clusters
    for_merging = grouped_df[grouped_df['len_check']]['final_cluster_id'].tolist()
    for_splitting = grouped_df[~grouped_df['len_check']]['final_cluster_id'].tolist()
    
    # getting the correct data points for each good cluster
    merged_df = df[df['final_cluster_id'].isin(for_merging)][['final_cluster_id','block_id',
                                                              'data_id','coauthors','author_full_text']] \
        .groupby(["block_id","final_cluster_id"]).agg({"author_full_text": create_author_name_match,
                                                       "coauthors": compile_cluster_coauthors,
                                                       "data_id": list}) \
        .reset_index()
    
    merged_df['first_name'] = merged_df['author_full_text'].apply(lambda x: x[0][0] if x[0] else "")
    merged_df['first_init'] = merged_df['author_full_text'].apply(lambda x: x[1][0] if x[1] else "")
    merged_df['m1_name'] = merged_df['author_full_text'].apply(lambda x: x[2][0] if x[2] else "")
    merged_df['m1_init'] = merged_df['author_full_text'].apply(lambda x: x[3][0] if x[3] else "")
    merged_df['m2_name'] = merged_df['author_full_text'].apply(lambda x: x[4][0] if x[4] else "")
    merged_df['m2_init'] = merged_df['author_full_text'].apply(lambda x: x[5][0] if x[5] else "")
    merged_df['author_name_match'] = merged_df \
        .apply(lambda x: f"{x.first_name} {x.first_init} {x.m1_name} {x.m1_init} {x.m2_name} {x.m2_init}", 
               axis=1)
    
    # getting the correct data points for each "bad" cluster
    split_df = df[df['final_cluster_id'].isin(for_splitting)][['final_cluster_id','block_id',
                                                               'author_full_text','data_id',
                                                               'coauthors']].copy()
    
    split_df['author_match_list'] = split_df['author_full_text'].apply(create_author_name_match)
    split_df['first_name'] = split_df['author_match_list'].apply(lambda x: x[0][0] if x[0] else "")
    split_df['first_init'] = split_df['author_match_list'].apply(lambda x: x[1][0] if x[1] else "")
    split_df['m1_name'] = split_df['author_match_list'].apply(lambda x: x[2][0] if x[2] else "")
    split_df['m1_init'] = split_df['author_match_list'].apply(lambda x: x[3][0] if x[3] else "")
    split_df['m2_name'] = split_df['author_match_list'].apply(lambda x: x[4][0] if x[4] else "")
    split_df['m2_init'] = split_df['author_match_list'].apply(lambda x: x[5][0] if x[5] else "")
    split_df['author_name_match'] = split_df \
        .apply(lambda x: f"{x.first_name} {x.first_init} {x.m1_name} {x.m1_init} {x.m2_name} {x.m2_init}", 
               axis=1)
    
    split_df = split_df.groupby(["block_id","final_cluster_id","author_name_match",
                                 "first_name","first_init","m1_name","m1_init","m2_name","m2_init"]) \
        .agg({"author_full_text": create_author_name_match,
              "coauthors": compile_cluster_coauthors,
              "data_id": list}) \
        .reset_index()
        
    # putting all of the data together for one final DF
    df_to_merge = pd.concat([split_df[split_df.columns], merged_df[split_df.columns]], axis=0) \
        .reset_index(drop=True) \
        .reset_index()
    
    df_to_merge.columns = ['temp_id'] + list(df_to_merge.columns[1:])
    df_to_merge['temp_cluster_id'] = df_to_merge.apply(lambda x: f"{x.block_id}_{x.temp_id}", axis=1)
    
    df_to_merge['coauthor_check'] = df_to_merge['coauthors'].apply(lambda x: True if x else False)
    
    untouched_clusters = df_to_merge[~df_to_merge['coauthor_check']].copy()
    untouched_clusters['new_cluster_id'] = untouched_clusters['temp_cluster_id'] \
                                                       .apply(lambda x: f"{x}x").to_numpy().astype('str')
    
    print(untouched_clusters.shape)
    
    df_to_merge = df_to_merge[df_to_merge['coauthor_check']].copy()
    
    # making it so names are unique for blocks
    df_to_merge['coauthors'] = df_to_merge.apply(lambda x: convert_coauthors_to_block(x.block_id, 
                                                                                      x.coauthors), 
                                                 axis=1)
    
    df_to_merge['author_name_match'] = df_to_merge.apply(lambda x: convert_name_to_block(x.block_id, 
                                                                                      x.author_name_match), 
                                                         axis=1)
    
    # getting newly merged clusters
    graph_clusters_df_init = graph_to_clusters_init(df_to_merge)
    graph_clusters_df= graph_clusters_df_init.merge(df_to_merge[['data_id','temp_cluster_id']].copy(), 
                                                how='inner', on='temp_cluster_id')
    
    df_to_merge_temp = pd.concat([graph_clusters_df[['block_id','temp_cluster_id',
                                                      'new_cluster_id','data_id']], 
                                   untouched_clusters[['block_id','temp_cluster_id',
                                                       'new_cluster_id','data_id']]], 
                                  axis=0).reset_index(drop=True)
    
    final_cluster_nums = df_to_merge_temp[['block_id','new_cluster_id']].drop_duplicates().copy()
    final_cluster_nums['final_clust'] = final_cluster_nums.sort_values(['block_id','new_cluster_id'])\
                .groupby(['block_id']) \
                .cumcount() + 1
    
    df_to_merge_final = df_to_merge_temp.merge(final_cluster_nums[['new_cluster_id','final_clust']], 
                                                how='inner', on='new_cluster_id')
    
    output_list = df.merge(df_to_merge_final[['data_id','final_clust']] \
                           .explode('data_id').drop_duplicates(), 
                       how='inner', on='data_id')['final_clust'].tolist()
    
    if len(output_list) > df.shape[0]:
        print("-----wrong num------")
        exploded_data_id_df = df_to_merge_final[['block_id','data_id','final_clust']] \
                                   .explode('data_id').drop_duplicates()
        exploded_val_counts = exploded_data_id_df['data_id'].value_counts().reset_index()
        
        
        exploded_val_counts.columns = ['data_id','counts']
        
        ids_to_redo = exploded_val_counts[exploded_val_counts['counts']>=2]['data_id'].tolist()
        
        get_block_ids = df[df['paper_author_id'].isin(ids_to_redo)].copy()
        get_block_ids['test_block_id'] = get_block_ids['author_full_text'].apply(get_block_id)
        
        blocks_to_keep = list(set(get_block_ids['test_block_id'].tolist()))
        
        exploded_ids_to_redo = exploded_data_id_df[exploded_data_id_df['data_id'].isin(ids_to_redo)].copy()
        exploded_ids_to_keep = exploded_data_id_df[~exploded_data_id_df['data_id'].isin(ids_to_redo)].copy()
        
        final_exploded_ids = pd.concat([exploded_ids_to_redo[exploded_ids_to_redo['block_id'].isin(blocks_to_keep)],
                                        exploded_ids_to_keep], 
                                       axis=0)
        
        output_list = df.merge(final_exploded_ids[['data_id','final_clust']],
                               how='inner', on='data_id')['final_clust'].tolist()
    
    return output_list

In [34]:
def get_final_clusters_non_latin_blocks(full_df):
    df = full_df.copy()
    
    # Cluster by init cluster number and author name
    df_to_merge = df.groupby(["block_id","final_cluster_id", 
                              "author_full_text"]) \
        .agg({"coauthors": compile_cluster_coauthors,
              "data_id": list}) \
        .reset_index() \
        .reset_index()
    
    df_to_merge.columns = ['temp_id'] + list(df_to_merge.columns[1:])
    df_to_merge['temp_cluster_id'] = df_to_merge.apply(lambda x: f"{x.block_id}_{x.temp_id}", axis=1)
    
    # coauthors check
    df_to_merge['coauthor_check'] = df_to_merge['coauthors'].apply(lambda x: True if x else False)
    untouched_clusters = df_to_merge[~df_to_merge['coauthor_check']].copy()
    untouched_clusters['new_cluster_id'] = untouched_clusters['temp_cluster_id'] \
                                                       .apply(lambda x: f"{x}nlx").to_numpy().astype('str')
    df_to_merge = df_to_merge[df_to_merge['coauthor_check']].copy()
    
    # Graph clustering where it must be exact string match and also a coauthor match
    df_to_merge['coauthors'] = df_to_merge.apply(lambda x: convert_coauthors_to_block(x.block_id, 
                                                                                      x.coauthors), axis=1)
    
    df_to_merge['author_name_match'] = df_to_merge.apply(lambda x: convert_name_to_block(x.block_id, 
                                                                                      x.author_full_text), 
                                                         axis=1)
    
    # Getting newly merged clusters
    graph_clusters_df_init = graph_to_clusters_init(df_to_merge)
    graph_clusters_df = graph_clusters_df_init.merge(df_to_merge[['data_id','temp_cluster_id']].copy(), 
                                                how='inner', on='temp_cluster_id')
    
    df_to_merge_temp = pd.concat([graph_clusters_df[['block_id','temp_cluster_id',
                                                      'new_cluster_id','data_id']], 
                                   untouched_clusters[['block_id','temp_cluster_id',
                                                       'new_cluster_id','data_id']]], 
                                  axis=0).reset_index(drop=True)
    
    # Getting final cluster numbers
    final_cluster_nums = df_to_merge_temp[['block_id','new_cluster_id']].drop_duplicates().copy()
    final_cluster_nums['final_clust'] = final_cluster_nums.sort_values(['block_id','new_cluster_id'])\
                .groupby(['block_id']) \
                .cumcount() + 1
    
    # Creating ID that can be combined with other DF later
    final_cluster_nums['final_clust'] = final_cluster_nums['final_clust'].apply(lambda x: f"{x}NL")
    
    df_to_merge_final = df_to_merge_temp.merge(final_cluster_nums[['new_cluster_id','final_clust']], 
                                                how='inner', on='new_cluster_id')
    
    output_list = df.merge(df_to_merge_final[['data_id','final_clust']] \
                           .explode('data_id').drop_duplicates(), 
                       how='inner', on='data_id')['final_clust'].tolist()
    
    if len(output_list) > df.shape[0]:
        print("-----wrong num------")
        exploded_data_id_df = df_to_merge_final[['block_id','data_id','final_clust']] \
                                   .explode('data_id').drop_duplicates()
        exploded_val_counts = exploded_data_id_df['data_id'].value_counts().reset_index()
        
        
        exploded_val_counts.columns = ['data_id','counts']
        
        ids_to_redo = exploded_val_counts[exploded_val_counts['counts']>=2]['data_id'].tolist()
        
        get_block_ids = df[df['paper_author_id'].isin(ids_to_redo)].copy()
        get_block_ids['test_block_id'] = get_block_ids['author_full_text'].apply(get_block_id)
        
        blocks_to_keep = list(set(get_block_ids['test_block_id'].tolist()))
        
        exploded_ids_to_redo = exploded_data_id_df[exploded_data_id_df['data_id'].isin(ids_to_redo)].copy()
        exploded_ids_to_keep = exploded_data_id_df[~exploded_data_id_df['data_id'].isin(ids_to_redo)].copy()
        
        final_exploded_ids = pd.concat([exploded_ids_to_redo[exploded_ids_to_redo['block_id'].isin(blocks_to_keep)],
                                        exploded_ids_to_keep], 
                                       axis=0)
        
        output_list = df.merge(final_exploded_ids[['data_id','final_clust']],
                               how='inner', on='data_id')['final_clust'].tolist()
    return output_list

In [35]:
def get_clusters_from_single_cluster_block_id(single_df):
    temp_single_df = single_df.copy()
    
    # Check for latin character names
    temp_single_df['latin_name_check'] = temp_single_df['author_full_text'].apply(check_latin_name)
    single_latin_df = temp_single_df[temp_single_df['latin_name_check']].copy()
    single_non_latin_df = temp_single_df[~temp_single_df['latin_name_check']].copy()
    
    if single_latin_df.shape[0] > 0:
        # latin (get author name match)
        single_latin_df['author_name_match'] = single_latin_df['author_full_text'].apply(create_author_name_match)
        single_latin_df['author_name_match'] = single_latin_df['author_name_match'].apply(create_name_match_string)

        grouped_single_latin_df = single_latin_df.groupby(["block_id","final_cluster_id", 
                                  "author_name_match"]) \
            .agg({"coauthors": compile_cluster_coauthors,
                  "data_id": list}) \
            .reset_index()

        final_latin_cluster_df = grouped_single_latin_df[['block_id','final_cluster_id','author_name_match']] \
            .drop_duplicates().copy()

        final_latin_cluster_df['final_clust'] = final_latin_cluster_df.sort_values(['block_id','final_cluster_id'])\
                    .groupby(['block_id']) \
                    .cumcount() + 1

        final_latin_cluster_df['final_clust'] = final_latin_cluster_df['final_clust'].apply(lambda x: f"{x}SCL")

        final_single_latin_df = single_latin_df.merge(final_latin_cluster_df[['final_cluster_id',
                                                                              'author_name_match',
                                                                              'final_clust']], 
                                               how='inner', 
                                               on=['final_cluster_id','author_name_match']) \
            [['block_id','data_id','author_full_text','final_clust']]
    else:
        final_single_latin_df = pd.DataFrame(columns=['block_id','data_id','author_full_text','final_clust'])
    
    if single_non_latin_df.shape[0] > 0:
        # non-latin (just group on raw author text)
        grouped_single_non_latin_df = single_non_latin_df.groupby(["block_id","final_cluster_id", 
                                  "author_full_text"]) \
            .agg({"coauthors": compile_cluster_coauthors,
                  "data_id": list}) \
            .reset_index()

        final_non_latin_cluster_df = grouped_single_non_latin_df[['block_id','final_cluster_id',
                                                                  'author_full_text']] \
            .drop_duplicates().copy()

        final_non_latin_cluster_df['final_clust'] = final_non_latin_cluster_df.sort_values(['block_id',
                                                                                            'final_cluster_id'])\
                    .groupby(['block_id']) \
                    .cumcount() + 1

        final_non_latin_cluster_df['final_clust'] = final_non_latin_cluster_df['final_clust'].apply(lambda x: 
                                                                                                    f"{x}SCNL")

        final_single_non_latin_df = single_non_latin_df.merge(final_non_latin_cluster_df[['final_cluster_id',
                                                                                  'author_full_text',
                                                                                  'final_clust']], 
                                               how='inner', 
                                               on=['final_cluster_id','author_full_text']) \
            [['block_id','data_id','author_full_text','final_clust']]
    else:
        final_single_non_latin_df = pd.DataFrame(columns=['block_id','data_id','author_full_text','final_clust'])
    
    # Creating ID that can be combined with other DF later
    final_single_df = pd.concat([final_single_latin_df, final_single_non_latin_df], axis=0)
    
    return final_single_df

In [36]:
def get_final_clusters_all_blocks(full_df):
    df = full_df.copy()
    
    # Check for latin character names
    df['latin_name_check'] = df['author_full_text'].apply(check_latin_name)
    latin_df = df[df['latin_name_check']].copy()
    non_latin_df = df[~df['latin_name_check']].copy()
    
    if latin_df.shape[0] > 0:
        # Complicated clustering for latin character names
        latin_output_list = get_final_clusters_latin_blocks(latin_df)
        latin_df['temp_final_clust'] = latin_output_list
    else:
        latin_df= pd.DataFrame(columns=[latin_df.columns+['temp_final_clust']])
    
    if non_latin_df.shape[0] > 0:
        # Simple but conservative clustering for non-latin character names
        non_latin_output_list = get_final_clusters_non_latin_blocks(non_latin_df)
        non_latin_df['temp_final_clust'] = non_latin_output_list
        pass
    else:
        non_latin_df = pd.DataFrame(columns=[non_latin_df.columns+['temp_final_clust']])
        non_latin_output_list = 0
        
    # Merging DFs back together
    full_cluster_df = pd.concat([latin_df, non_latin_df], axis=0).reset_index(drop=True)
    
    # Getting unique cluster ID number
    final_cluster_nums = full_cluster_df[['block_id','temp_final_clust']].drop_duplicates().copy()
    final_cluster_nums['final_clust'] = final_cluster_nums.sort_values(['block_id','temp_final_clust'])\
                .groupby(['block_id']) \
                .cumcount() + 1
    
    
    df_to_merge_final = full_cluster_df.merge(final_cluster_nums[['block_id','temp_final_clust','final_clust']], 
                                                how='inner', on=['block_id','temp_final_clust'])
    
    # Merge to initial df to get the correct order
    final_output_list = df.merge(df_to_merge_final[['data_id','final_clust']] \
                                 .drop_duplicates(), 
                                 how='inner', on='data_id')['final_clust'].tolist()
    
    if len(final_output_list) > df.shape[0]:
        print("-----wrong num------")
        exploded_data_id_df = df_to_merge_final[['block_id','data_id','final_clust']] \
                                   .explode('data_id').drop_duplicates()
        exploded_val_counts = exploded_data_id_df['data_id'].value_counts().reset_index()
        
        
        exploded_val_counts.columns = ['data_id','counts']
        
        ids_to_redo = exploded_val_counts[exploded_val_counts['counts']>=2]['data_id'].tolist()
        
        get_block_ids = df[df['paper_author_id'].isin(ids_to_redo)].copy()
        get_block_ids['test_block_id'] = get_block_ids['author_full_text'].apply(get_block_id)
        
        blocks_to_keep = list(set(get_block_ids['test_block_id'].tolist()))
        
        exploded_ids_to_redo = exploded_data_id_df[exploded_data_id_df['data_id'].isin(ids_to_redo)].copy()
        exploded_ids_to_keep = exploded_data_id_df[~exploded_data_id_df['data_id'].isin(ids_to_redo)].copy()
        
        final_exploded_ids = pd.concat([exploded_ids_to_redo[exploded_ids_to_redo['block_id'].isin(blocks_to_keep)],
                                        exploded_ids_to_keep], 
                                       axis=0)
        
        final_output_list = df.merge(final_exploded_ids[['data_id','final_clust']],
                               how='inner', on='data_id')['final_clust'].tolist()
    return final_output_list

In [37]:
def score_row(pub_year, journal, title, affiliation, coauthors, references):
    final_score = 0
    if pub_year:
        final_score +=1
    if journal:
        final_score +=1
    if title:
        final_score +=1
    if affiliation:
        final_score +=1
    if coauthors:
        final_score +=1
    if references:
        final_score +=1
    return final_score

In [38]:
def integrate_orcid(df_old):
    df = df_old.copy()
    df['test_clust'] = df.apply(lambda x: f"{x.block_id}_{x.final_clust}", axis=1)
    
    # merging orcid to data ids
    all_data = df.merge(orcid, how='left', on='data_id')
    all_data['orcid'] = all_data['orcid'].fillna("NONE")
    
    # grouping data to find where multiple orcids are assigned to a single cluster
    grouped_data = all_data.groupby('test_clust').agg({'orcid': set, 'data_id': set}).reset_index()
    grouped_data['orcid'] = grouped_data['orcid'].apply(lambda x: [i for i in list(x) if i!='NONE'])
    grouped_data['data_id'] = grouped_data['data_id'].apply(list)
    grouped_data['orcid_len'] = grouped_data['orcid'].apply(len)
    
    # breaking out those clusters so they can be fixed so there is only one orcid per cluster
    clusters_to_break = grouped_data[grouped_data['orcid_len']>1]['test_clust'].tolist()
    no_break_clusters_df = all_data[~all_data['test_clust'].isin(clusters_to_break)].copy()
    break_clusters_df = all_data[all_data['test_clust'].isin(clusters_to_break)].copy()

    # orcid with the most works stays in the cluster, the rest are moved to new cluster
    to_join_break = break_clusters_df[break_clusters_df['orcid']!='NONE'].groupby(['test_clust','orcid'])\
        .count().reset_index().sort_values(['data_id','orcid'], ascending=False).drop_duplicates('test_clust') \
        [['orcid','test_clust']]
    to_join_break['keep'] = 1

    mid_break = break_clusters_df.merge(to_join_break, how='left', on=['test_clust','orcid'])
    mid_break['keep'] = mid_break['keep'].fillna(0.0).astype('int')
    mid_break['final_keep'] = mid_break.apply(lambda x: 0 if ((x.orcid != 'NONE') & 
                                                              (x.keep == 0)) else 1, axis=1)
    
    post_break_keep = mid_break[mid_break['final_keep']==1].drop(['keep','final_keep'], axis=1).copy()
    post_break_not_keep = mid_break[mid_break['final_keep']==0].copy()
    
    to_join_not_keep = post_break_not_keep.groupby(['block_id','orcid']).agg({'test_clust': list, 
                                                                          'data_id': list}).reset_index()
    to_join_not_keep['test_clust'] = to_join_not_keep.apply(lambda x: f"{x.test_clust[0]}orc{x.orcid}", axis=1)

    # merge all data back together (now have 1 orcid per cluster)
    final_after_break = pd.concat([no_break_clusters_df, 
                                   post_break_keep[list(no_break_clusters_df.columns)], 
                                   to_join_not_keep.explode('data_id')\
                                       .merge(post_break_not_keep[['data_id','author_full_text','final_clust']], 
                                              how='inner', on='data_id')[list(no_break_clusters_df.columns)]], 
                                  axis=0).reset_index(drop=True)
    
    # want to see if clusters can merge together on orcid
    grouped_data_2 = final_after_break.groupby(['block_id','test_clust'])\
        .agg({'orcid': set, 'data_id': list}).reset_index()
    grouped_data_2['orcid'] = grouped_data_2['orcid'].apply(lambda x: [i for i in list(x) if i!='NONE'])
    grouped_data_2['orcid'] = grouped_data_2['orcid'].apply(lambda x: x[0] if x else "NONE")
    grouped_orc = grouped_data_2[grouped_data_2['orcid']!='NONE'] \
        .groupby(['block_id','orcid']).agg({'data_id': 'sum','test_clust':list}).reset_index()
    grouped_orc['test_clust'] = grouped_orc['test_clust'].apply(lambda x: f"{x[0]}gorc")
    
    # take new clusters that are formed and join back with rest of the data
    flat_orc = grouped_orc.explode('data_id')
    
    flat_orc_data_ids = flat_orc['data_id'].tolist()
    
    final_orc_integrated = pd.concat([final_after_break[~final_after_break['data_id'].isin(flat_orc_data_ids)] \
                                      [['block_id','orcid','data_id','test_clust']], 
                                      flat_orc], axis=0)
    
    final_orc_integrated['final_clust'] = final_orc_integrated['test_clust'].apply(lambda x: x.split("_")[-1])
    
    return final_orc_integrated[['block_id','data_id','final_clust']]

In [39]:
def get_df_from_final_cluster_and_supp(cluster_df, supp_df):
    clust_df = cluster_df.drop_duplicates().copy()
    clust_df.columns = ['block_id', 'paper_author_id','clust_num']
    data_df = supp_df.copy()
    
    # get block_id
    data_df['block_id'] = data_df['author'].apply(get_block_id)
    
    # get merged df
    merged_df = data_df[['author','paper_author_id','block_id']].drop_duplicates(subset=['paper_author_id']) \
        .merge(clust_df.drop_duplicates(subset=['paper_author_id']), how='inner', on=['paper_author_id','block_id'])
    
    merged_df['author_list'] = merged_df['author']
    merged_df['author_text'] = merged_df['author']
    merged_df['final_clust_num'] = merged_df.apply(lambda x: f"{x.block_id}_{x.clust_num}", axis=1)
    
    # getting separate DFs for reclustering and not reclustering
    val_counts = merged_df['block_id'].value_counts().reset_index()
    blocks_to_check = val_counts[(val_counts['block_id'] > 10)]['index'].tolist()
    for_reclustering = merged_df[merged_df['block_id'].isin(blocks_to_check)].copy()
    for_not_reclustering = merged_df[~merged_df['block_id'].isin(blocks_to_check)].copy()
    
    return blocks_to_check, for_reclustering, for_not_reclustering


####### ALL ABOVE IS DONE (except for variables and paths)

In [40]:
def get_reclustered_data(df_to_recluster, blocks_to_check):
    df = df_to_recluster.copy()
    
    # get blocks that are lopsided
    shorter_df = df[df['block_id'].isin(blocks_to_check)] \
        .groupby(['final_clust_num','block_id']).agg({'author_list': create_author_name_match, 
                                                      'author': 'count',
                                                      'author_text': set,
                                                      'paper_author_id': list}).reset_index().copy()
    
    block_id_is_lopsided = []
    block_id_not_lopsided = []
    reclustered_data = {}
    for block_id in blocks_to_check:
        temp_df = shorter_df[shorter_df['block_id']==block_id].copy()
        counts_array = temp_df['author'].to_numpy()
        final_clust_array = temp_df['final_clust_num'].to_numpy()
        work_author_list = temp_df['paper_author_id'].tolist()
        name_lists_list = temp_df['author_list'].tolist()

        if (np.sum(counts_array == 1) == (counts_array.shape[0] - 1)) & (counts_array.shape[0] != 1):
            block_id_is_lopsided.append(block_id)
            # get info for large cluster
            large_cluster_ind = np.argmax(counts_array)
            large_cluster_id = final_clust_array[large_cluster_ind]
            large_cluster_work_authors = work_author_list[large_cluster_ind]
            large_cluster_name_list = name_lists_list[large_cluster_ind]
            
            # get positions of single clusters
            single_cluster_inds = np.where((counts_array==1))[0].tolist()
            
            if check_latin_name(block_id):
                # go through single clusters and add to big cluster if names match
                ungrouped_single_clusters = []
                for sing_clust in single_cluster_inds:
                    if check_block_vs_block(large_cluster_name_list, name_lists_list[sing_clust]):
                        reclustered_data[final_clust_array[sing_clust]] = large_cluster_id
                    else:
                        pass

            else:
                # non-latin character names do simple group by with name string
                name_string_list = temp_df['author_text'].tolist()
                large_cluster_name_strings = list(name_string_list[large_cluster_ind])
                
                ungrouped_single_clusters = []
                for sing_clust in single_cluster_inds:
                    if list(name_string_list[sing_clust])[0] in large_cluster_name_strings:
                        reclustered_data[final_clust_array[sing_clust]] = large_cluster_id
                    else:
                        pass

        else:
            block_id_not_lopsided.append(block_id)
            
    reclustered_df = df[~df['block_id'].isin(block_id_not_lopsided)].copy()
    reclustered_df['final_clust_num'] = reclustered_df['final_clust_num'].replace(reclustered_data)
    
    return reclustered_df, df[df['block_id'].isin(block_id_not_lopsided)].copy(), block_id_is_lopsided


####### ALL ABOVE IS DONE (except for variables and paths)

In [41]:
def fix_lopsided_clusters(init_clusters, supp_data):
    # figure out which block_ids should be looked at
    blocks_check, for_recluster, no_recluster = get_df_from_final_cluster_and_supp(init_clusters, supp_data)
    
    # get reclustered data based on having a lopsided block_id (one large cluster and then single clusters)
    reclustered_data, untouched_data, lop_blocks = get_reclustered_data(for_recluster, blocks_check)
    
    # join all data back together and get into correct format
    final_recluster_data = pd.concat([reclustered_data, no_recluster, untouched_data], axis=0) \
        .reset_index(drop=True)[['block_id','paper_author_id','final_clust_num']]
    final_recluster_data.columns = ['block_id','data_id','final_clust_num']
    final_recluster_data['final_clust'] = final_recluster_data['final_clust_num'].apply(lambda x: x.split("_")[-1])
    
    return final_recluster_data[['block_id','data_id','final_clust']]

### Go through many files

In [42]:
def get_file_final_clusters(data_file_name, cluster_file_name, file_int, num_author_clusters):
    df = pd.read_json(f"s3://author-disambiguation/{data_file_name}",
                  orient='records', lines=True)
    
    cluster_df = pd.read_csv(f"s3://author-disambiguation/{cluster_file_name}")
    cluster_df['final_cluster_id'] = cluster_df.apply(lambda x: f"{x.block_id}_{x.cluster_id}", axis=1)
    print(cluster_df.shape[0])
    
    print("-------- loaded data")
    
    total_df = cluster_df.merge(df, how='inner', left_on='data_id', right_on='paper_author_id')

    total_df['author_full_text'] = total_df['author']
    total_df['row_score'] = total_df.apply(lambda x: score_row(x.pub_year, x.journal, x.title, 
                                                               x.affiliation, x.coauthors, x.references), axis=1)
    
    total_df = total_df.sort_values('row_score', ascending=False) \
        .drop_duplicates(subset=['block_id','data_id']) \
        .reset_index(drop=True) \
        .sort_values(['block_id','cluster_id']).copy()
    
    print(total_df.shape[0])
    
    val_counts = total_df.drop_duplicates(subset=['block_id','cluster_id'])['block_id'] \
        .value_counts().reset_index().copy()
    multiple_clusters = val_counts[val_counts['block_id'] > 1]['index'].tolist()
    single_cluster = val_counts[val_counts['block_id'] == 1]['index'].tolist()
    
    final_new_df = total_df[total_df['block_id'].isin(multiple_clusters)].copy()
    temp_single_df = total_df[total_df['block_id'].isin(single_cluster)].copy()
    
    # Getting single name for each single cluster
    final_single_df = get_clusters_from_single_cluster_block_id(temp_single_df)
    
    print("-------- separate dataframes created")
    
    print("-------- getting init clusters")
    
    final_test_clusters = get_final_clusters_all_blocks(final_new_df)
    
    
    final_new_df['final_clust'] = final_test_clusters
    print(final_new_df.shape[0])
    print(final_single_df.shape[0])
    
    final_merged_df = pd.concat([final_new_df[final_single_df.columns], final_single_df], axis=0)
    
    final_merged_df['final_clust'] = final_merged_df['final_clust'].astype('str')
    
    print("-------- doing orcid integration")
    
    final_merged_orc_df = integrate_orcid(final_merged_df)
    
    print("-------- fixing lopsided clusters")
    
    final_merged_orc_lops_df = fix_lopsided_clusters(final_merged_orc_df, df)
    
    num_author_clusters += final_merged_orc_lops_df[['block_id','final_clust']].drop_duplicates().shape[0]
    
    print(f"-------- NUM AUTHORS = {num_author_clusters} --------")
    
    print(f"--------  saving file")
    
    final_merged_orc_lops_df.to_parquet(f"./all_data_new_clusters_orcid/final_clustering_data_{file_int}.parquet")
    _ = write_file_to_s3(final_merged_orc_lops_df, file_int, mapping_dict[int(file_int)])
    
    return num_author_clusters

In [43]:
def write_file_to_s3(df, file_int, node):
    datetime_str = datetime.now().strftime("%Y_%m_%d")
    clusters = df.copy()
    clusters['cluster_id'] = clusters.apply(lambda x: f"{x.block_id}_{x.final_clust}", axis=1)
    
    to_join = clusters.groupby('cluster_id')['data_id'].apply(list).reset_index()\
        .rename(columns={"data_id":"matched_papers"}).reset_index()

    to_join.columns = ['cluster_id','full_id','matched_papers']

    to_write = to_join[['cluster_id','matched_papers']].explode('matched_papers').drop_duplicates().copy()

    to_write['work'] = to_write['matched_papers'].apply(lambda x: x.split("_")[0][1:])
    to_write['seq_no'] = to_write['matched_papers'].apply(lambda x: x.split("_")[1])

    to_write[['cluster_id','work','seq_no']] \
    .to_csv(f"s3://author-name-disambiguation/V1/data/003_COMPLETED_CLUSTERS/NODE_{str(node)}/{datetime_str}/init_clusters_{str(file_int)}.csv.gz", 
                                                    compression='gzip', header=None, index=None)
    return True

In [44]:
# list of not clustered files (filenames index): [96]

In [45]:
%%time
num_author_clusters = 4299408

for i in range(25,300):
    data_file_name = filenames[i][0]
    cluster_file_name = filenames[i][1]
    file_int = data_file_name.split("partition_")[-1].split(".json")[0]
    
    print("")
    print(f"{i} - {file_int}")
    # graph_clusters_df, untouched_clusters, df_to_merge, df_to_merge_temp, df_to_merge_final
    num_author_clusters = get_file_final_clusters(data_file_name, cluster_file_name, 
                                                  file_int, num_author_clusters)


25 - 120
1165420
-------- loaded data
1165366
-------- separate dataframes created
-------- getting init clusters
(18909, 16)
990436
174930
-------- doing orcid integration
-------- fixing lopsided clusters
-------- NUM AUTHORS = 4474238 --------
--------  saving file

26 - 121
1165244
-------- loaded data
1165196
-------- separate dataframes created
-------- getting init clusters
(18798, 16)
983277
181919
-------- doing orcid integration
-------- fixing lopsided clusters
-------- NUM AUTHORS = 4645119 --------
--------  saving file

27 - 122
1133150
-------- loaded data
1133096
-------- separate dataframes created
-------- getting init clusters
(19181, 16)
948785
184311
-------- doing orcid integration
-------- fixing lopsided clusters
-------- NUM AUTHORS = 4816769 --------
--------  saving file

28 - 123
1190236
-------- loaded data
1190166
-------- separate dataframes created
-------- getting init clusters
(19281, 16)
1013559
176607
-------- doing orcid integration
-------- fixing

In [46]:
%%time
num_author_clusters = num_author_clusters+34764848
for i in range(497,501):
    data_file_name = filenames[i][0]
    cluster_file_name = filenames[i][1]
    file_int = data_file_name.split("partition_")[-1].split(".json")[0]
    
    print("")
    print(f"{i} - {file_int}")
    # graph_clusters_df, untouched_clusters, df_to_merge, df_to_merge_temp, df_to_merge_final
    num_author_clusters = get_file_final_clusters(data_file_name, cluster_file_name, 
                                                  file_int, num_author_clusters)


497 - 96
1078839
-------- loaded data
1078793
-------- separate dataframes created
-------- getting init clusters
(18511, 16)
894229
184564
-------- doing orcid integration
-------- fixing lopsided clusters
-------- NUM AUTHORS = 87594096 --------
--------  saving file

498 - 97
1028329
-------- loaded data
1028287
-------- separate dataframes created
-------- getting init clusters
(16392, 16)
843154
185133
-------- doing orcid integration
-------- fixing lopsided clusters
-------- NUM AUTHORS = 87752746 --------
--------  saving file

499 - 98
1233529
-------- loaded data
1233468
-------- separate dataframes created
-------- getting init clusters
(17507, 16)
1047538
185930
-------- doing orcid integration
-------- fixing lopsided clusters
-------- NUM AUTHORS = 87923891 --------
--------  saving file

500 - 99
1193924
-------- loaded data
1193878
-------- separate dataframes created
-------- getting init clusters
(20293, 16)
1014061
179817
-------- doing orcid integration
-------- fi

In [43]:
def write_to_correct_location(file_int):
    final_merged_df = pd.read_parquet(f"./all_data_new_clusters_86MIL/final_clustering_data_{file_int}.parquet")
    _ = write_file_to_s3(final_merged_df, file_int, mapping_dict[int(file_int)])
    return True

In [44]:
for i in range(501):
    print(i)
    _ = write_to_correct_location(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
## renaming files in S3

In [33]:
def rename_file(s3_res, bucket, old_key):
    s3_res.Object(bucket, f"{old_key}.csv.gz").copy_from(CopySource=f"{bucket}/{old_key}")
    s3_res.Object(bucket, old_key).delete()
    return True

In [34]:
s3_client = boto3.client("s3")
bucket_name = "author-name-disambiguation"
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix="V1/data/003_COMPLETED_CLUSTERS/",
                              PaginationConfig={"PageSize": 50})

final_cluster_filenames = []

for page in response:
    files = page.get("Contents")
    for file in files:
        if 'init_clusters' in file['Key']:
            final_cluster_filenames.append(file['Key'])

In [35]:
len(final_cluster_filenames)

1

In [36]:
final_cluster_filenames

['V1/data/003_COMPLETED_CLUSTERS/NODE_1/2023_01_15/init_clusters_0.csv.gz']

In [37]:
s3 = boto3.resource('s3')
my_bucket = 'author-name-disambiguation'
for i in final_cluster_filenames[:1]:
    print(i)
    rename_file(s3, my_bucket, i)

V1/data/003_COMPLETED_CLUSTERS/NODE_1/2023_01_15/init_clusters_0.csv.gz


### Saving Supp Data for Live Deployment

In [10]:
def score_row(pub_year, journal, title, affiliation, coauthors, references):
    final_score = 0
    if pub_year:
        final_score +=1
    if journal:
        final_score +=1
    if title:
        final_score +=1
    if affiliation:
        final_score +=1
    if coauthors:
        final_score +=1
    if references:
        final_score +=1
    return final_score

In [11]:
def get_live_deployment_data(data_file_name, cluster_file_name, file_int):
    date_str = datetime.now().strftime("%Y_%m_%d_%H")
    bucket_and_path = "s3://author-disambiguation/V1/vectorized_data"
    df = pd.read_json(f"s3://author-disambiguation/{data_file_name}",
                  orient='records', lines=True)
    
    cluster_df = pd.read_csv(f"s3://author-disambiguation/{cluster_file_name}")
    
    total_df = cluster_df.merge(df, how='inner', left_on='data_id', right_on='paper_author_id')

    total_df['row_score'] = total_df.apply(lambda x: score_row(x.pub_year, x.journal, x.title, 
                                                               x.affiliation, x.coauthors, x.references), axis=1)
    
    total_df = total_df.sort_values('row_score', ascending=False) \
        .drop_duplicates(subset=['block_id','data_id']) \
        .reset_index(drop=True) \
        .sort_values(['block_id','cluster_id']).copy()
    
    total_df[['data_id','block_id','author','coauthors']] \
    .to_parquet(
        f"{bucket_and_path}/partition_{file_int}/{date_str}_supp_data_{file_int}.parquet")

In [13]:
for i in range(159,170):
    data_file_name = filenames[i][0]
    cluster_file_name = filenames[i][1]
    file_int = data_file_name.split("partition_")[-1].split(".json")[0]
    
    print(f"{i} - {file_int}")
    # graph_clusters_df, untouched_clusters, df_to_merge, df_to_merge_temp, df_to_merge_final
    _ = get_live_deployment_data(data_file_name, cluster_file_name, file_int)

159 - 241
160 - 242
161 - 243
162 - 244
163 - 245
164 - 246
165 - 247
166 - 248
167 - 249
168 - 25
169 - 250


In [1]:
import pandas as pd
import glob

In [21]:
def get_block_ids_for_partition(cluster_file_name, file_int):
    cluster_df = pd.read_csv(f"s3://author-disambiguation/{cluster_file_name}")
    
    cluster_df['partition'] = file_int
    
    cluster_df[['block_id','partition']].to_parquet(f"./block_ids_to_partition/partition_{file_int}.parquet")

In [13]:
test_file_1 = pd.read_csv(f"s3://author-disambiguation/V1/vectorized_data/partition_129/cluster_file_partition_129")

In [14]:
test_file_2 = pd.read_csv(f"s3://author-disambiguation/V1/vectorized_data/partition_176/cluster_file_partition_176")

In [17]:
test_file_1[test_file_1['block_id']=='p_kumar']

Unnamed: 0,block_id,data_id,cluster_id
776760,p_kumar,W2761821127_1,0


In [18]:
test_file_2[test_file_2['block_id']=='p_kumar']

Unnamed: 0,block_id,data_id,cluster_id
790584,p_kumar,W1603021249_2,0


In [12]:
cluster_filenames[0]

'V1/vectorized_data/partition_0/cluster_file_partition_0'

In [2]:
def get_all_filenames():
    all_files = []
    for data_file in glob.glob(f"./block_ids_to_partition/part*"):
        all_files.append(data_file)
    return all_files

In [24]:
# for i in range(501):
#     cluster_file_name = filenames[i][1]
#     file_int = cluster_file_name.split("partition_")[-1]
    
#     print(f"{i} - {file_int}")
#     _ = get_block_ids_for_partition(cluster_file_name, file_int)

0 - 0
1 - 1
2 - 10
3 - 100
4 - 101
5 - 102
6 - 103
7 - 104
8 - 105
9 - 106
10 - 107
11 - 108
12 - 109
13 - 11
14 - 110
15 - 111
16 - 112
17 - 113
18 - 114
19 - 115
20 - 116
21 - 117
22 - 118
23 - 119
24 - 12
25 - 120
26 - 121
27 - 122
28 - 123
29 - 124
30 - 125
31 - 126
32 - 127
33 - 128
34 - 129
35 - 13
36 - 130
37 - 131
38 - 132
39 - 133
40 - 134
41 - 135
42 - 136
43 - 137
44 - 138
45 - 139
46 - 14
47 - 140
48 - 141
49 - 142
50 - 143
51 - 144
52 - 145
53 - 146
54 - 147
55 - 148
56 - 149
57 - 15
58 - 150
59 - 151
60 - 152
61 - 153
62 - 154
63 - 155
64 - 156
65 - 157
66 - 158
67 - 159
68 - 16
69 - 160
70 - 161
71 - 162
72 - 163
73 - 164
74 - 165
75 - 166
76 - 167
77 - 168
78 - 169
79 - 17
80 - 170
81 - 171
82 - 172
83 - 173
84 - 174
85 - 175
86 - 176
87 - 177
88 - 178
89 - 179
90 - 18
91 - 180
92 - 181
93 - 182
94 - 183
95 - 184
96 - 185
97 - 186
98 - 187
99 - 188
100 - 189
101 - 19
102 - 190
103 - 191
104 - 192
105 - 193
106 - 194
107 - 195
108 - 196
109 - 197
110 - 198
111 - 199
112 

In [7]:
all_files = get_all_filenames()

In [8]:
len(all_files)

501

In [29]:
full_df = pd.DataFrame()
for block_file in all_files[:100]:
    print(block_file)
    temp_df = pd.read_parquet(block_file)
    full_df = pd.concat([full_df, temp_df], axis=0)

./block_ids_to_partition/partition_0.parquet
./block_ids_to_partition/partition_1.parquet
./block_ids_to_partition/partition_10.parquet
./block_ids_to_partition/partition_100.parquet
./block_ids_to_partition/partition_101.parquet
./block_ids_to_partition/partition_102.parquet
./block_ids_to_partition/partition_103.parquet
./block_ids_to_partition/partition_104.parquet
./block_ids_to_partition/partition_105.parquet
./block_ids_to_partition/partition_106.parquet
./block_ids_to_partition/partition_107.parquet
./block_ids_to_partition/partition_108.parquet
./block_ids_to_partition/partition_109.parquet
./block_ids_to_partition/partition_11.parquet
./block_ids_to_partition/partition_110.parquet
./block_ids_to_partition/partition_111.parquet
./block_ids_to_partition/partition_112.parquet
./block_ids_to_partition/partition_113.parquet
./block_ids_to_partition/partition_114.parquet
./block_ids_to_partition/partition_115.parquet
./block_ids_to_partition/partition_116.parquet
./block_ids_to_part

In [30]:
full_df.to_parquet("./block_ids_to_partition/full_file_1.parquet")

In [31]:
full_df.shape

(121992338, 2)

In [9]:
full_df = pd.DataFrame()
for block_file in all_files[100:200]:
    print(block_file)
    temp_df = pd.read_parquet(block_file)
    full_df = pd.concat([full_df, temp_df], axis=0)

./block_ids_to_partition/partition_189.parquet
./block_ids_to_partition/partition_19.parquet
./block_ids_to_partition/partition_190.parquet
./block_ids_to_partition/partition_191.parquet
./block_ids_to_partition/partition_192.parquet
./block_ids_to_partition/partition_193.parquet
./block_ids_to_partition/partition_194.parquet
./block_ids_to_partition/partition_195.parquet
./block_ids_to_partition/partition_196.parquet
./block_ids_to_partition/partition_197.parquet
./block_ids_to_partition/partition_198.parquet
./block_ids_to_partition/partition_199.parquet
./block_ids_to_partition/partition_2.parquet
./block_ids_to_partition/partition_20.parquet
./block_ids_to_partition/partition_200.parquet
./block_ids_to_partition/partition_201.parquet
./block_ids_to_partition/partition_202.parquet
./block_ids_to_partition/partition_203.parquet
./block_ids_to_partition/partition_204.parquet
./block_ids_to_partition/partition_205.parquet
./block_ids_to_partition/partition_206.parquet
./block_ids_to_pa

In [10]:
full_df.to_parquet("./block_ids_to_partition/full_file_2.parquet")

In [11]:
full_df = pd.DataFrame()
for block_file in all_files[200:300]:
    print(block_file)
    temp_df = pd.read_parquet(block_file)
    full_df = pd.concat([full_df, temp_df], axis=0)

./block_ids_to_partition/partition_279.parquet
./block_ids_to_partition/partition_28.parquet
./block_ids_to_partition/partition_280.parquet
./block_ids_to_partition/partition_281.parquet
./block_ids_to_partition/partition_282.parquet
./block_ids_to_partition/partition_283.parquet
./block_ids_to_partition/partition_284.parquet
./block_ids_to_partition/partition_285.parquet
./block_ids_to_partition/partition_286.parquet
./block_ids_to_partition/partition_287.parquet
./block_ids_to_partition/partition_288.parquet
./block_ids_to_partition/partition_289.parquet
./block_ids_to_partition/partition_29.parquet
./block_ids_to_partition/partition_290.parquet
./block_ids_to_partition/partition_291.parquet
./block_ids_to_partition/partition_292.parquet
./block_ids_to_partition/partition_293.parquet
./block_ids_to_partition/partition_294.parquet
./block_ids_to_partition/partition_295.parquet
./block_ids_to_partition/partition_296.parquet
./block_ids_to_partition/partition_297.parquet
./block_ids_to_

In [12]:
full_df.to_parquet("./block_ids_to_partition/full_file_3.parquet")

In [13]:
full_df = pd.DataFrame()
for block_file in all_files[300:400]:
    print(block_file)
    temp_df = pd.read_parquet(block_file)
    full_df = pd.concat([full_df, temp_df], axis=0)

./block_ids_to_partition/partition_369.parquet
./block_ids_to_partition/partition_37.parquet
./block_ids_to_partition/partition_370.parquet
./block_ids_to_partition/partition_371.parquet
./block_ids_to_partition/partition_372.parquet
./block_ids_to_partition/partition_373.parquet
./block_ids_to_partition/partition_38.parquet
./block_ids_to_partition/partition_374.parquet
./block_ids_to_partition/partition_375.parquet
./block_ids_to_partition/partition_376.parquet
./block_ids_to_partition/partition_377.parquet
./block_ids_to_partition/partition_378.parquet
./block_ids_to_partition/partition_379.parquet
./block_ids_to_partition/partition_380.parquet
./block_ids_to_partition/partition_381.parquet
./block_ids_to_partition/partition_382.parquet
./block_ids_to_partition/partition_383.parquet
./block_ids_to_partition/partition_384.parquet
./block_ids_to_partition/partition_385.parquet
./block_ids_to_partition/partition_386.parquet
./block_ids_to_partition/partition_387.parquet
./block_ids_to_

In [14]:
full_df.to_parquet("./block_ids_to_partition/full_file_4.parquet")

In [15]:
full_df = pd.DataFrame()
for block_file in all_files[400:501]:
    print(block_file)
    temp_df = pd.read_parquet(block_file)
    full_df = pd.concat([full_df, temp_df], axis=0)

./block_ids_to_partition/partition_459.parquet
./block_ids_to_partition/partition_46.parquet
./block_ids_to_partition/partition_460.parquet
./block_ids_to_partition/partition_461.parquet
./block_ids_to_partition/partition_462.parquet
./block_ids_to_partition/partition_463.parquet
./block_ids_to_partition/partition_464.parquet
./block_ids_to_partition/partition_465.parquet
./block_ids_to_partition/partition_466.parquet
./block_ids_to_partition/partition_467.parquet
./block_ids_to_partition/partition_468.parquet
./block_ids_to_partition/partition_469.parquet
./block_ids_to_partition/partition_47.parquet
./block_ids_to_partition/partition_470.parquet
./block_ids_to_partition/partition_471.parquet
./block_ids_to_partition/partition_472.parquet
./block_ids_to_partition/partition_473.parquet
./block_ids_to_partition/partition_474.parquet
./block_ids_to_partition/partition_475.parquet
./block_ids_to_partition/partition_476.parquet
./block_ids_to_partition/partition_477.parquet
./block_ids_to_

In [16]:
full_df.to_parquet("./block_ids_to_partition/full_file_5.parquet")

In [24]:
for i in range(1,6):
    test = pd.read_parquet(f"./block_ids_to_partition/full_file_{i}.parquet")
    test['count_col'] = 1
    test.groupby(['block_id','partition']).count().reset_index().to_parquet(f"./block_ids_to_partition/small_file_{i}.parquet")

In [1]:
import pandas as pd

In [28]:
final_df = pd.DataFrame()
for i in range(1,6):
    test = pd.read_parquet(f"./block_ids_to_partition/small_file_{i}.parquet")
    final_df = pd.concat([final_df, test], axis=0)

In [32]:
final_df.sort_values('count_col', ascending=False).drop_duplicates('block_id') \
.to_parquet("./block_ids_to_partition/block_id_partition_mapping.parquet")

In [2]:
final_df = pd.read_parquet("./block_ids_to_partition/block_id_partition_mapping.parquet")

In [14]:
part_to_node = pd.read_parquet("s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/partition_id_node_mapping.parquet")
part_to_node.columns = ['partition', 'node']
part_to_node['partition'] = part_to_node['partition'].astype('str')

In [15]:
part_to_node.sample()

Unnamed: 0,partition,node
466,466,1


In [16]:
final_df.shape

(25954765, 3)

In [18]:
final_df.merge(part_to_node, how='inner', on='partition')\
[['block_id','node']].to_parquet("./block_ids_to_partition/block_id_node_mapping.parquet")

### Transferring Files from one env to another

In [None]:
s3_client = boto3.client("s3")
bucket_name = "author-name-disambiguation"
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix="V1/vectorized_data/",
                              PaginationConfig={"PageSize": 50})
data_filenames = []

for page in response:
    files = page.get("Contents")
    for file in files:
        if 'supp_data' in file['Key']:
            data_filenames.append(file['Key'])

In [4]:
len(data_filenames)

501

In [5]:
data_filenames[0]

'V1/vectorized_data/partition_0/2023_02_16_20_supp_data_0.parquet'

In [6]:
for i in data_filenames:
    os.system(f"aws s3 cp s3://{bucket_name}/{i} /home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/")

Completed 256.0 KiB/109.8 MiB (1.6 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/109.8 MiB (3.2 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/109.8 MiB (4.8 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/109.8 MiB (6.4 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/109.8 MiB (7.9 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/109.8 MiB (9.5 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/109.8 MiB (10.9 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/109.8 MiB (12.4 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/109.8 MiB (13.9 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/109.8 MiB (15.4 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/109.8 MiB (16.8 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/109.8 MiB (18.2 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/109.8 MiB (19.5 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/109.8 MiB (20.8 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/109.8 MiB (22.1 MiB/s) with 1 file(s) remain

Completed 35.0 MiB/109.8 MiB (98.5 MiB/s) with 1 file(s) remainingCompleted 35.2 MiB/109.8 MiB (98.7 MiB/s) with 1 file(s) remainingCompleted 35.5 MiB/109.8 MiB (99.0 MiB/s) with 1 file(s) remainingCompleted 35.8 MiB/109.8 MiB (99.5 MiB/s) with 1 file(s) remainingCompleted 36.0 MiB/109.8 MiB (99.9 MiB/s) with 1 file(s) remainingCompleted 36.2 MiB/109.8 MiB (100.5 MiB/s) with 1 file(s) remainingCompleted 36.5 MiB/109.8 MiB (100.9 MiB/s) with 1 file(s) remainingCompleted 36.8 MiB/109.8 MiB (101.4 MiB/s) with 1 file(s) remainingCompleted 37.0 MiB/109.8 MiB (101.9 MiB/s) with 1 file(s) remainingCompleted 37.2 MiB/109.8 MiB (102.3 MiB/s) with 1 file(s) remainingCompleted 37.5 MiB/109.8 MiB (102.6 MiB/s) with 1 file(s) remainingCompleted 37.8 MiB/109.8 MiB (102.7 MiB/s) with 1 file(s) remainingCompleted 38.0 MiB/109.8 MiB (102.9 MiB/s) with 1 file(s) remainingCompleted 38.2 MiB/109.8 MiB (103.1 MiB/s) with 1 file(s) remainingCompleted 38.5 MiB/109.8 MiB (103.3 MiB/s) with 1 fil

download: s3://author-disambiguation/V1/vectorized_data/partition_0/2023_02_16_20_supp_data_0.parquet to all_data_supp_data_live_deployment/2023_02_16_20_supp_data_0.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_1/2023_02_16_20_supp_data_1.parquet to all_data_supp_data_live_deployment/2023_02_16_20_supp_data_1.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_10/2023_02_16_20_supp_data_10.parquet to all_data_supp_data_live_deployment/2023_02_16_20_supp_data_10.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_100/2023_02_16_20_supp_data_100.parquet to all_data_supp_data_live_deployment/2023_02_16_20_supp_data_100.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_101/2023_02_16_20_supp_data_101.parquet to all_data_supp_data_live_deployment/2023_02_16_20_supp_data_101.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_102/2023_02_16_20_supp_data_102.parquet to all_da

Completed 256.0 KiB/133.6 MiB (1.3 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/133.6 MiB (2.6 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/133.6 MiB (3.8 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/133.6 MiB (5.1 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/133.6 MiB (6.3 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/133.6 MiB (7.5 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/133.6 MiB (8.7 MiB/s) with 1 file(s) remaining  Completed 2.0 MiB/133.6 MiB (10.0 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/133.6 MiB (11.1 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/133.6 MiB (12.3 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/133.6 MiB (13.5 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/133.6 MiB (14.7 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/133.6 MiB (15.9 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/133.6 MiB (17.0 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/133.6 MiB (18.1 MiB/s) with 1 file(s) remain

Completed 35.8 MiB/133.6 MiB (91.1 MiB/s) with 1 file(s) remainingCompleted 36.0 MiB/133.6 MiB (91.3 MiB/s) with 1 file(s) remainingCompleted 36.2 MiB/133.6 MiB (91.5 MiB/s) with 1 file(s) remainingCompleted 36.5 MiB/133.6 MiB (92.1 MiB/s) with 1 file(s) remainingCompleted 36.8 MiB/133.6 MiB (92.4 MiB/s) with 1 file(s) remainingCompleted 37.0 MiB/133.6 MiB (92.8 MiB/s) with 1 file(s) remainingCompleted 37.2 MiB/133.6 MiB (93.2 MiB/s) with 1 file(s) remainingCompleted 37.5 MiB/133.6 MiB (93.5 MiB/s) with 1 file(s) remainingCompleted 37.8 MiB/133.6 MiB (93.9 MiB/s) with 1 file(s) remainingCompleted 38.0 MiB/133.6 MiB (94.3 MiB/s) with 1 file(s) remainingCompleted 38.2 MiB/133.6 MiB (94.5 MiB/s) with 1 file(s) remainingCompleted 38.5 MiB/133.6 MiB (94.8 MiB/s) with 1 file(s) remainingCompleted 38.8 MiB/133.6 MiB (95.3 MiB/s) with 1 file(s) remainingCompleted 39.0 MiB/133.6 MiB (95.6 MiB/s) with 1 file(s) remainingCompleted 39.2 MiB/133.6 MiB (95.8 MiB/s) with 1 file(s) remai

Completed 70.0 MiB/133.6 MiB (118.0 MiB/s) with 1 file(s) remainingCompleted 70.2 MiB/133.6 MiB (118.1 MiB/s) with 1 file(s) remainingCompleted 70.5 MiB/133.6 MiB (118.2 MiB/s) with 1 file(s) remainingCompleted 70.8 MiB/133.6 MiB (118.3 MiB/s) with 1 file(s) remainingCompleted 71.0 MiB/133.6 MiB (118.4 MiB/s) with 1 file(s) remainingCompleted 71.2 MiB/133.6 MiB (118.5 MiB/s) with 1 file(s) remainingCompleted 71.5 MiB/133.6 MiB (118.6 MiB/s) with 1 file(s) remainingCompleted 71.8 MiB/133.6 MiB (118.5 MiB/s) with 1 file(s) remainingCompleted 72.0 MiB/133.6 MiB (118.3 MiB/s) with 1 file(s) remainingCompleted 72.2 MiB/133.6 MiB (118.5 MiB/s) with 1 file(s) remainingCompleted 72.5 MiB/133.6 MiB (118.5 MiB/s) with 1 file(s) remainingCompleted 72.8 MiB/133.6 MiB (118.7 MiB/s) with 1 file(s) remainingCompleted 73.0 MiB/133.6 MiB (113.8 MiB/s) with 1 file(s) remainingCompleted 73.2 MiB/133.6 MiB (114.1 MiB/s) with 1 file(s) remainingCompleted 73.5 MiB/133.6 MiB (114.2 MiB/s) with 

download: s3://author-disambiguation/V1/vectorized_data/partition_14/2023_02_16_21_supp_data_14.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_14.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_140/2023_02_16_21_supp_data_140.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_140.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_141/2023_02_16_21_supp_data_141.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_141.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_142/2023_02_16_21_supp_data_142.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_142.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_143/2023_02_16_21_supp_data_143.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_143.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_144/2023_02_16_21_supp_data_144.parqu

Completed 256.0 KiB/171.8 MiB (1.4 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/171.8 MiB (4.1 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/171.8 MiB (5.4 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/171.8 MiB (6.6 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/171.8 MiB (7.9 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/171.8 MiB (9.2 MiB/s) with 1 file(s) remaining  Completed 2.0 MiB/171.8 MiB (10.4 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/171.8 MiB (11.7 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/171.8 MiB (12.9 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/171.8 MiB (14.0 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/171.8 MiB (15.2 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/171.8 MiB (16.4 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/171.8 MiB (17.6 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/171.8 MiB (18.7 MiB/s) with 1 file(s) remain

Completed 36.2 MiB/171.8 MiB (94.8 MiB/s) with 1 file(s) remainingCompleted 36.5 MiB/171.8 MiB (95.4 MiB/s) with 1 file(s) remainingCompleted 36.8 MiB/171.8 MiB (95.8 MiB/s) with 1 file(s) remainingCompleted 37.0 MiB/171.8 MiB (96.3 MiB/s) with 1 file(s) remainingCompleted 37.2 MiB/171.8 MiB (96.8 MiB/s) with 1 file(s) remainingCompleted 37.5 MiB/171.8 MiB (97.2 MiB/s) with 1 file(s) remainingCompleted 37.8 MiB/171.8 MiB (97.6 MiB/s) with 1 file(s) remainingCompleted 38.0 MiB/171.8 MiB (98.0 MiB/s) with 1 file(s) remainingCompleted 38.2 MiB/171.8 MiB (98.4 MiB/s) with 1 file(s) remainingCompleted 38.5 MiB/171.8 MiB (98.8 MiB/s) with 1 file(s) remainingCompleted 38.8 MiB/171.8 MiB (99.2 MiB/s) with 1 file(s) remainingCompleted 39.0 MiB/171.8 MiB (99.8 MiB/s) with 1 file(s) remainingCompleted 39.2 MiB/171.8 MiB (100.1 MiB/s) with 1 file(s) remainingCompleted 39.5 MiB/171.8 MiB (100.6 MiB/s) with 1 file(s) remainingCompleted 39.8 MiB/171.8 MiB (101.0 MiB/s) with 1 file(s) re

download: s3://author-disambiguation/V1/vectorized_data/partition_181/2023_02_16_22_supp_data_181.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_181.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_182/2023_02_16_22_supp_data_182.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_182.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_183/2023_02_16_22_supp_data_183.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_183.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_184/2023_02_16_22_supp_data_184.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_184.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_185/2023_02_16_22_supp_data_185.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_185.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_186/2023_02_16_22_supp_data_186.pa

Completed 256.0 KiB/155.8 MiB (1.7 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/155.8 MiB (3.3 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/155.8 MiB (4.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/155.8 MiB (6.5 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/155.8 MiB (8.0 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/155.8 MiB (8.8 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/155.8 MiB (10.2 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/155.8 MiB (11.6 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/155.8 MiB (12.9 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/155.8 MiB (14.3 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/155.8 MiB (15.7 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/155.8 MiB (17.0 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/155.8 MiB (18.2 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/155.8 MiB (19.4 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/155.8 MiB (20.4 MiB/s) with 1 file(s) remain

Completed 34.5 MiB/155.8 MiB (98.0 MiB/s) with 1 file(s) remainingCompleted 34.8 MiB/155.8 MiB (98.5 MiB/s) with 1 file(s) remainingCompleted 35.0 MiB/155.8 MiB (99.0 MiB/s) with 1 file(s) remainingCompleted 35.2 MiB/155.8 MiB (99.3 MiB/s) with 1 file(s) remainingCompleted 35.5 MiB/155.8 MiB (100.0 MiB/s) with 1 file(s) remainingCompleted 35.8 MiB/155.8 MiB (100.3 MiB/s) with 1 file(s) remainingCompleted 36.0 MiB/155.8 MiB (100.8 MiB/s) with 1 file(s) remainingCompleted 36.2 MiB/155.8 MiB (101.3 MiB/s) with 1 file(s) remainingCompleted 36.5 MiB/155.8 MiB (101.8 MiB/s) with 1 file(s) remainingCompleted 36.8 MiB/155.8 MiB (102.3 MiB/s) with 1 file(s) remainingCompleted 37.0 MiB/155.8 MiB (102.8 MiB/s) with 1 file(s) remainingCompleted 37.2 MiB/155.8 MiB (103.3 MiB/s) with 1 file(s) remainingCompleted 37.5 MiB/155.8 MiB (103.8 MiB/s) with 1 file(s) remainingCompleted 37.8 MiB/155.8 MiB (104.3 MiB/s) with 1 file(s) remainingCompleted 38.0 MiB/155.8 MiB (104.5 MiB/s) with 1 fi

Completed 69.8 MiB/155.8 MiB (125.6 MiB/s) with 1 file(s) remainingCompleted 70.0 MiB/155.8 MiB (125.8 MiB/s) with 1 file(s) remainingCompleted 70.2 MiB/155.8 MiB (126.1 MiB/s) with 1 file(s) remainingCompleted 70.5 MiB/155.8 MiB (126.4 MiB/s) with 1 file(s) remainingCompleted 70.8 MiB/155.8 MiB (126.7 MiB/s) with 1 file(s) remainingCompleted 71.0 MiB/155.8 MiB (127.0 MiB/s) with 1 file(s) remainingCompleted 71.2 MiB/155.8 MiB (127.4 MiB/s) with 1 file(s) remainingCompleted 71.5 MiB/155.8 MiB (127.6 MiB/s) with 1 file(s) remainingCompleted 71.8 MiB/155.8 MiB (127.9 MiB/s) with 1 file(s) remainingCompleted 72.0 MiB/155.8 MiB (128.1 MiB/s) with 1 file(s) remainingCompleted 72.2 MiB/155.8 MiB (128.5 MiB/s) with 1 file(s) remainingCompleted 72.5 MiB/155.8 MiB (128.7 MiB/s) with 1 file(s) remainingCompleted 72.8 MiB/155.8 MiB (128.9 MiB/s) with 1 file(s) remainingCompleted 73.0 MiB/155.8 MiB (129.2 MiB/s) with 1 file(s) remainingCompleted 73.2 MiB/155.8 MiB (129.5 MiB/s) with 

Completed 102.2 MiB/155.8 MiB (134.4 MiB/s) with 1 file(s) remainingCompleted 102.5 MiB/155.8 MiB (134.6 MiB/s) with 1 file(s) remainingCompleted 102.8 MiB/155.8 MiB (134.7 MiB/s) with 1 file(s) remainingCompleted 103.0 MiB/155.8 MiB (135.0 MiB/s) with 1 file(s) remainingCompleted 103.2 MiB/155.8 MiB (135.1 MiB/s) with 1 file(s) remainingCompleted 103.5 MiB/155.8 MiB (135.2 MiB/s) with 1 file(s) remainingCompleted 103.8 MiB/155.8 MiB (135.5 MiB/s) with 1 file(s) remainingCompleted 104.0 MiB/155.8 MiB (133.8 MiB/s) with 1 file(s) remainingCompleted 104.2 MiB/155.8 MiB (133.9 MiB/s) with 1 file(s) remainingCompleted 104.5 MiB/155.8 MiB (134.0 MiB/s) with 1 file(s) remainingCompleted 104.8 MiB/155.8 MiB (134.2 MiB/s) with 1 file(s) remainingCompleted 105.0 MiB/155.8 MiB (134.3 MiB/s) with 1 file(s) remainingCompleted 105.2 MiB/155.8 MiB (134.4 MiB/s) with 1 file(s) remainingCompleted 105.5 MiB/155.8 MiB (134.6 MiB/s) with 1 file(s) remainingCompleted 105.8 MiB/155.8 MiB (134

download: s3://author-disambiguation/V1/vectorized_data/partition_222/2023_02_17_00_supp_data_222.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_222.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_223/2023_02_17_00_supp_data_223.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_223.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_224/2023_02_17_00_supp_data_224.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_224.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_225/2023_02_17_00_supp_data_225.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_225.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_226/2023_02_17_00_supp_data_226.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_226.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_227/2023_02_17_00_supp_data_227.pa

Completed 256.0 KiB/152.8 MiB (1.1 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/152.8 MiB (2.2 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/152.8 MiB (3.3 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/152.8 MiB (4.3 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/152.8 MiB (5.1 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/152.8 MiB (6.2 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/152.8 MiB (7.2 MiB/s) with 1 file(s) remaining  Completed 2.0 MiB/152.8 MiB (8.1 MiB/s) with 1 file(s) remaining  Completed 2.2 MiB/152.8 MiB (9.1 MiB/s) with 1 file(s) remaining  Completed 2.5 MiB/152.8 MiB (10.1 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/152.8 MiB (11.1 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/152.8 MiB (12.0 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/152.8 MiB (12.9 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/152.8 MiB (13.8 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/152.8 MiB (14.7 MiB/s) with 1 file(s) remain

Completed 37.2 MiB/152.8 MiB (87.0 MiB/s) with 1 file(s) remainingCompleted 37.5 MiB/152.8 MiB (87.2 MiB/s) with 1 file(s) remainingCompleted 37.8 MiB/152.8 MiB (87.4 MiB/s) with 1 file(s) remainingCompleted 38.0 MiB/152.8 MiB (87.8 MiB/s) with 1 file(s) remainingCompleted 38.2 MiB/152.8 MiB (85.5 MiB/s) with 1 file(s) remainingCompleted 38.5 MiB/152.8 MiB (85.9 MiB/s) with 1 file(s) remainingCompleted 38.8 MiB/152.8 MiB (86.3 MiB/s) with 1 file(s) remainingCompleted 39.0 MiB/152.8 MiB (86.6 MiB/s) with 1 file(s) remainingCompleted 39.2 MiB/152.8 MiB (87.0 MiB/s) with 1 file(s) remainingCompleted 39.5 MiB/152.8 MiB (87.3 MiB/s) with 1 file(s) remainingCompleted 39.8 MiB/152.8 MiB (87.9 MiB/s) with 1 file(s) remainingCompleted 40.0 MiB/152.8 MiB (88.2 MiB/s) with 1 file(s) remainingCompleted 40.2 MiB/152.8 MiB (88.5 MiB/s) with 1 file(s) remainingCompleted 40.5 MiB/152.8 MiB (88.9 MiB/s) with 1 file(s) remainingCompleted 40.8 MiB/152.8 MiB (89.3 MiB/s) with 1 file(s) remai

download: s3://author-disambiguation/V1/vectorized_data/partition_264/2023_02_17_01_supp_data_264.parquet to all_data_supp_data_live_deployment/2023_02_17_01_supp_data_264.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_265/2023_02_17_00_supp_data_265.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_265.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_266/2023_02_17_00_supp_data_266.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_266.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_267/2023_02_17_00_supp_data_267.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_267.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_268/2023_02_17_00_supp_data_268.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_268.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_269/2023_02_17_00_supp_data_269.pa

Completed 256.0 KiB/181.5 MiB (1.8 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/181.5 MiB (3.5 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/181.5 MiB (5.2 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/181.5 MiB (6.6 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/181.5 MiB (8.1 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/181.5 MiB (9.6 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/181.5 MiB (11.2 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/181.5 MiB (12.7 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/181.5 MiB (14.0 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/181.5 MiB (15.5 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/181.5 MiB (16.9 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/181.5 MiB (18.4 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/181.5 MiB (19.8 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/181.5 MiB (21.2 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/181.5 MiB (22.6 MiB/s) with 1 file(s) remain

Completed 40.2 MiB/181.5 MiB (117.1 MiB/s) with 1 file(s) remainingCompleted 40.5 MiB/181.5 MiB (117.3 MiB/s) with 1 file(s) remainingCompleted 40.8 MiB/181.5 MiB (117.5 MiB/s) with 1 file(s) remainingCompleted 41.0 MiB/181.5 MiB (117.9 MiB/s) with 1 file(s) remainingCompleted 41.2 MiB/181.5 MiB (118.1 MiB/s) with 1 file(s) remainingCompleted 41.5 MiB/181.5 MiB (118.3 MiB/s) with 1 file(s) remainingCompleted 41.8 MiB/181.5 MiB (118.6 MiB/s) with 1 file(s) remainingCompleted 42.0 MiB/181.5 MiB (118.9 MiB/s) with 1 file(s) remainingCompleted 42.2 MiB/181.5 MiB (119.2 MiB/s) with 1 file(s) remainingCompleted 42.5 MiB/181.5 MiB (119.5 MiB/s) with 1 file(s) remainingCompleted 42.8 MiB/181.5 MiB (119.8 MiB/s) with 1 file(s) remainingCompleted 43.0 MiB/181.5 MiB (120.2 MiB/s) with 1 file(s) remainingCompleted 43.2 MiB/181.5 MiB (120.6 MiB/s) with 1 file(s) remainingCompleted 43.5 MiB/181.5 MiB (121.0 MiB/s) with 1 file(s) remainingCompleted 43.8 MiB/181.5 MiB (121.4 MiB/s) with 

download: s3://author-disambiguation/V1/vectorized_data/partition_305/2023_02_16_21_supp_data_305.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_305.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_306/2023_02_16_21_supp_data_306.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_306.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_307/2023_02_16_21_supp_data_307.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_307.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_308/2023_02_16_21_supp_data_308.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_308.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_309/2023_02_16_21_supp_data_309.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_309.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_31/2023_02_16_21_supp_data_31.parq

download: s3://author-disambiguation/V1/vectorized_data/partition_347/2023_02_16_22_supp_data_347.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_347.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_348/2023_02_16_22_supp_data_348.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_348.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_349/2023_02_16_22_supp_data_349.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_349.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_35/2023_02_16_22_supp_data_35.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_35.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_350/2023_02_16_22_supp_data_350.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_350.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_351/2023_02_16_22_supp_data_351.parqu

download: s3://author-disambiguation/V1/vectorized_data/partition_389/2023_02_17_00_supp_data_389.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_389.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_39/2023_02_17_00_supp_data_39.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_39.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_390/2023_02_17_00_supp_data_390.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_390.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_391/2023_02_17_00_supp_data_391.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_391.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_392/2023_02_17_00_supp_data_392.parquet to all_data_supp_data_live_deployment/2023_02_17_00_supp_data_392.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_393/2023_02_17_00_supp_data_393.parqu

Completed 256.0 KiB/142.6 MiB (1.6 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/142.6 MiB (3.1 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/142.6 MiB (4.0 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/142.6 MiB (5.3 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/142.6 MiB (6.4 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/142.6 MiB (7.7 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/142.6 MiB (8.9 MiB/s) with 1 file(s) remaining  Completed 2.0 MiB/142.6 MiB (10.1 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/142.6 MiB (11.3 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/142.6 MiB (12.6 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/142.6 MiB (13.7 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/142.6 MiB (14.7 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/142.6 MiB (15.8 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/142.6 MiB (17.0 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/142.6 MiB (18.1 MiB/s) with 1 file(s) remain

Completed 32.0 MiB/142.6 MiB (88.6 MiB/s) with 1 file(s) remainingCompleted 32.2 MiB/142.6 MiB (89.2 MiB/s) with 1 file(s) remainingCompleted 32.5 MiB/142.6 MiB (89.5 MiB/s) with 1 file(s) remainingCompleted 32.8 MiB/142.6 MiB (90.0 MiB/s) with 1 file(s) remainingCompleted 33.0 MiB/142.6 MiB (90.6 MiB/s) with 1 file(s) remainingCompleted 33.2 MiB/142.6 MiB (91.0 MiB/s) with 1 file(s) remainingCompleted 33.5 MiB/142.6 MiB (91.6 MiB/s) with 1 file(s) remainingCompleted 33.8 MiB/142.6 MiB (92.1 MiB/s) with 1 file(s) remainingCompleted 34.0 MiB/142.6 MiB (92.5 MiB/s) with 1 file(s) remainingCompleted 34.2 MiB/142.6 MiB (93.0 MiB/s) with 1 file(s) remainingCompleted 34.5 MiB/142.6 MiB (93.5 MiB/s) with 1 file(s) remainingCompleted 34.8 MiB/142.6 MiB (93.9 MiB/s) with 1 file(s) remainingCompleted 35.0 MiB/142.6 MiB (94.5 MiB/s) with 1 file(s) remainingCompleted 35.2 MiB/142.6 MiB (94.7 MiB/s) with 1 file(s) remainingCompleted 35.5 MiB/142.6 MiB (95.3 MiB/s) with 1 file(s) remai

download: s3://author-disambiguation/V1/vectorized_data/partition_43/2023_02_16_21_supp_data_43.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_43.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_430/2023_02_16_21_supp_data_430.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_430.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_431/2023_02_16_21_supp_data_431.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_431.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_432/2023_02_16_21_supp_data_432.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_432.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_433/2023_02_16_21_supp_data_433.parquet to all_data_supp_data_live_deployment/2023_02_16_21_supp_data_433.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_434/2023_02_16_21_supp_data_434.parqu

Completed 256.0 KiB/120.1 MiB (1.6 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/120.1 MiB (3.1 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/120.1 MiB (4.5 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/120.1 MiB (5.9 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/120.1 MiB (7.2 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/120.1 MiB (8.6 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/120.1 MiB (9.9 MiB/s) with 1 file(s) remaining  Completed 2.0 MiB/120.1 MiB (11.3 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/120.1 MiB (12.6 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/120.1 MiB (14.0 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/120.1 MiB (15.3 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/120.1 MiB (16.6 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/120.1 MiB (17.9 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/120.1 MiB (19.1 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/120.1 MiB (20.4 MiB/s) with 1 file(s) remain

Completed 34.8 MiB/120.1 MiB (96.8 MiB/s) with 1 file(s) remainingCompleted 35.0 MiB/120.1 MiB (97.0 MiB/s) with 1 file(s) remainingCompleted 35.2 MiB/120.1 MiB (96.9 MiB/s) with 1 file(s) remainingCompleted 35.5 MiB/120.1 MiB (95.4 MiB/s) with 1 file(s) remainingCompleted 35.8 MiB/120.1 MiB (95.7 MiB/s) with 1 file(s) remainingCompleted 36.0 MiB/120.1 MiB (95.9 MiB/s) with 1 file(s) remainingCompleted 36.2 MiB/120.1 MiB (96.2 MiB/s) with 1 file(s) remainingCompleted 36.5 MiB/120.1 MiB (96.4 MiB/s) with 1 file(s) remainingCompleted 36.8 MiB/120.1 MiB (96.7 MiB/s) with 1 file(s) remainingCompleted 37.0 MiB/120.1 MiB (97.0 MiB/s) with 1 file(s) remainingCompleted 37.2 MiB/120.1 MiB (97.3 MiB/s) with 1 file(s) remainingCompleted 37.5 MiB/120.1 MiB (97.7 MiB/s) with 1 file(s) remainingCompleted 37.8 MiB/120.1 MiB (98.2 MiB/s) with 1 file(s) remainingCompleted 38.0 MiB/120.1 MiB (98.8 MiB/s) with 1 file(s) remainingCompleted 38.2 MiB/120.1 MiB (99.1 MiB/s) with 1 file(s) remai

download: s3://author-disambiguation/V1/vectorized_data/partition_471/2023_02_16_22_supp_data_471.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_471.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_472/2023_02_16_22_supp_data_472.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_472.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_473/2023_02_16_22_supp_data_473.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_473.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_474/2023_02_16_22_supp_data_474.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_474.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_475/2023_02_16_22_supp_data_475.parquet to all_data_supp_data_live_deployment/2023_02_16_22_supp_data_475.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_476/2023_02_16_22_supp_data_476.pa

download: s3://author-disambiguation/V1/vectorized_data/partition_62/2023_02_16_23_supp_data_62.parquet to all_data_supp_data_live_deployment/2023_02_16_23_supp_data_62.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_63/2023_02_16_23_supp_data_63.parquet to all_data_supp_data_live_deployment/2023_02_16_23_supp_data_63.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_64/2023_02_16_23_supp_data_64.parquet to all_data_supp_data_live_deployment/2023_02_16_23_supp_data_64.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_65/2023_02_16_23_supp_data_65.parquet to all_data_supp_data_live_deployment/2023_02_16_23_supp_data_65.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_66/2023_02_16_23_supp_data_66.parquet to all_data_supp_data_live_deployment/2023_02_16_23_supp_data_66.parquet
download: s3://author-disambiguation/V1/vectorized_data/partition_67/2023_02_16_23_supp_data_67.parquet to all_data

In [2]:
local_filepaths = glob.glob(f"/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/*")

In [3]:
local_filepaths[:2]

['/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_130.parquet',
 '/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_20_supp_data_0.parquet']

In [4]:
local_filenames = [x.split("/")[-1] for x in local_filepaths]
file_int = [int(x.split("_")[-1].split(".")[0]) for x in local_filenames]

In [5]:
local_filenames[:2]

['2023_02_16_21_supp_data_130.parquet', '2023_02_16_20_supp_data_0.parquet']

In [6]:
file_int[:2]

[130, 0]

In [7]:
node_mapping = \
pd.read_parquet("s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/partition_id_node_mapping.parquet")

In [8]:
node_mapping_dict = node_mapping.set_index('partition_id').to_dict(orient='index')

In [10]:
for local_file, file_i in zip(local_filepaths, file_int):
    print(local_file)
    bucket_name = "author-name-disambiguation"
    prefix = f"V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_{node_mapping_dict[file_i]['node']}/"
    os.system(f"aws s3 cp {local_file} s3://{bucket_name}/{prefix}partition_{file_i}/001_data_files/ ")

/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_130.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_130.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_130/001_data_files/2023_02_16_21_supp_data_130.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_20_supp_data_0.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_20_supp_data_0.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_0/001_data_files/2023_02_16_20_supp_data_0.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_13.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_13.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_13/001_data_files/2023_02_16_2

Completed 256.0 KiB/126.5 MiB (2.3 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/126.5 MiB (4.3 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/126.5 MiB (6.1 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/126.5 MiB (8.0 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/126.5 MiB (9.7 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/126.5 MiB (11.4 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/126.5 MiB (13.1 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/126.5 MiB (14.7 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/126.5 MiB (16.3 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/126.5 MiB (17.9 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/126.5 MiB (19.4 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/126.5 MiB (21.0 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/126.5 MiB (22.5 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/126.5 MiB (23.9 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/126.5 MiB (25.4 MiB/s) with 1 file(s) remain

Completed 44.0 MiB/126.5 MiB (142.1 MiB/s) with 1 file(s) remainingCompleted 44.2 MiB/126.5 MiB (142.3 MiB/s) with 1 file(s) remainingCompleted 44.5 MiB/126.5 MiB (142.5 MiB/s) with 1 file(s) remainingCompleted 44.8 MiB/126.5 MiB (143.1 MiB/s) with 1 file(s) remainingCompleted 45.0 MiB/126.5 MiB (143.8 MiB/s) with 1 file(s) remainingCompleted 45.2 MiB/126.5 MiB (144.4 MiB/s) with 1 file(s) remainingCompleted 45.5 MiB/126.5 MiB (145.0 MiB/s) with 1 file(s) remainingCompleted 45.8 MiB/126.5 MiB (144.5 MiB/s) with 1 file(s) remainingCompleted 46.0 MiB/126.5 MiB (144.9 MiB/s) with 1 file(s) remainingCompleted 46.2 MiB/126.5 MiB (145.5 MiB/s) with 1 file(s) remainingCompleted 46.5 MiB/126.5 MiB (145.2 MiB/s) with 1 file(s) remainingCompleted 46.8 MiB/126.5 MiB (146.0 MiB/s) with 1 file(s) remainingCompleted 47.0 MiB/126.5 MiB (146.2 MiB/s) with 1 file(s) remainingCompleted 47.2 MiB/126.5 MiB (146.9 MiB/s) with 1 file(s) remainingCompleted 47.5 MiB/126.5 MiB (147.2 MiB/s) with 

Completed 88.2 MiB/126.5 MiB (164.4 MiB/s) with 1 file(s) remainingCompleted 88.5 MiB/126.5 MiB (162.9 MiB/s) with 1 file(s) remainingCompleted 88.8 MiB/126.5 MiB (162.6 MiB/s) with 1 file(s) remainingCompleted 89.0 MiB/126.5 MiB (162.6 MiB/s) with 1 file(s) remainingCompleted 89.2 MiB/126.5 MiB (162.9 MiB/s) with 1 file(s) remainingCompleted 89.5 MiB/126.5 MiB (163.1 MiB/s) with 1 file(s) remainingCompleted 89.8 MiB/126.5 MiB (163.3 MiB/s) with 1 file(s) remainingCompleted 90.0 MiB/126.5 MiB (163.6 MiB/s) with 1 file(s) remainingCompleted 90.2 MiB/126.5 MiB (163.7 MiB/s) with 1 file(s) remainingCompleted 90.5 MiB/126.5 MiB (164.0 MiB/s) with 1 file(s) remainingCompleted 90.8 MiB/126.5 MiB (164.1 MiB/s) with 1 file(s) remainingCompleted 91.0 MiB/126.5 MiB (164.2 MiB/s) with 1 file(s) remainingCompleted 91.2 MiB/126.5 MiB (164.6 MiB/s) with 1 file(s) remainingCompleted 91.5 MiB/126.5 MiB (164.7 MiB/s) with 1 file(s) remainingCompleted 91.8 MiB/126.5 MiB (164.9 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_143.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_143/001_data_files/2023_02_16_21_supp_data_143.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_20_supp_data_109.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_20_supp_data_109.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_109/001_data_files/2023_02_16_20_supp_data_109.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_144.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_144.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_144/001_data_files/2023_02_16_21_supp_data_144.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/114.1 MiB (1.9 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/114.1 MiB (3.8 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/114.1 MiB (5.6 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/114.1 MiB (7.3 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/114.1 MiB (9.0 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/114.1 MiB (10.7 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/114.1 MiB (12.3 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/114.1 MiB (13.9 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/114.1 MiB (15.5 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/114.1 MiB (17.0 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/114.1 MiB (18.6 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/114.1 MiB (20.1 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/114.1 MiB (21.6 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/114.1 MiB (22.9 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/114.1 MiB (24.4 MiB/s) with 1 file(s) remain

Completed 47.5 MiB/114.1 MiB (143.7 MiB/s) with 1 file(s) remainingCompleted 47.8 MiB/114.1 MiB (143.8 MiB/s) with 1 file(s) remainingCompleted 48.0 MiB/114.1 MiB (143.1 MiB/s) with 1 file(s) remainingCompleted 48.2 MiB/114.1 MiB (143.1 MiB/s) with 1 file(s) remainingCompleted 48.5 MiB/114.1 MiB (143.7 MiB/s) with 1 file(s) remainingCompleted 48.8 MiB/114.1 MiB (144.3 MiB/s) with 1 file(s) remainingCompleted 49.0 MiB/114.1 MiB (144.6 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/114.1 MiB (144.9 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/114.1 MiB (145.6 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/114.1 MiB (145.6 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/114.1 MiB (146.1 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/114.1 MiB (145.7 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/114.1 MiB (146.1 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/114.1 MiB (146.8 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/114.1 MiB (147.3 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_154.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_154/001_data_files/2023_02_16_21_supp_data_154.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_20_supp_data_12.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_20_supp_data_12.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_12/001_data_files/2023_02_16_20_supp_data_12.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_155.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_155.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_155/001_data_files/2023_02_16_22_supp_data_155.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_

Completed 256.0 KiB/135.8 MiB (1.3 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/135.8 MiB (2.5 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/135.8 MiB (3.7 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/135.8 MiB (4.8 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/135.8 MiB (6.0 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/135.8 MiB (7.1 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/135.8 MiB (8.3 MiB/s) with 1 file(s) remaining  Completed 2.0 MiB/135.8 MiB (9.5 MiB/s) with 1 file(s) remaining  Completed 2.2 MiB/135.8 MiB (10.6 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/135.8 MiB (11.7 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/135.8 MiB (12.8 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/135.8 MiB (13.9 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/135.8 MiB (15.0 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/135.8 MiB (16.1 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/135.8 MiB (17.2 MiB/s) with 1 file(s) remain

Completed 46.5 MiB/135.8 MiB (119.2 MiB/s) with 1 file(s) remainingCompleted 46.8 MiB/135.8 MiB (119.7 MiB/s) with 1 file(s) remainingCompleted 47.0 MiB/135.8 MiB (119.8 MiB/s) with 1 file(s) remainingCompleted 47.2 MiB/135.8 MiB (120.2 MiB/s) with 1 file(s) remainingCompleted 47.5 MiB/135.8 MiB (120.7 MiB/s) with 1 file(s) remainingCompleted 47.8 MiB/135.8 MiB (121.1 MiB/s) with 1 file(s) remainingCompleted 48.0 MiB/135.8 MiB (121.4 MiB/s) with 1 file(s) remainingCompleted 48.2 MiB/135.8 MiB (121.9 MiB/s) with 1 file(s) remainingCompleted 48.5 MiB/135.8 MiB (121.9 MiB/s) with 1 file(s) remainingCompleted 48.8 MiB/135.8 MiB (122.5 MiB/s) with 1 file(s) remainingCompleted 49.0 MiB/135.8 MiB (123.0 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/135.8 MiB (123.1 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/135.8 MiB (123.3 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/135.8 MiB (123.6 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/135.8 MiB (123.7 MiB/s) with 

Completed 84.2 MiB/135.8 MiB (142.5 MiB/s) with 1 file(s) remainingCompleted 84.5 MiB/135.8 MiB (142.7 MiB/s) with 1 file(s) remainingCompleted 84.8 MiB/135.8 MiB (142.6 MiB/s) with 1 file(s) remainingCompleted 85.0 MiB/135.8 MiB (142.6 MiB/s) with 1 file(s) remainingCompleted 85.2 MiB/135.8 MiB (142.3 MiB/s) with 1 file(s) remainingCompleted 85.5 MiB/135.8 MiB (142.5 MiB/s) with 1 file(s) remainingCompleted 85.8 MiB/135.8 MiB (142.8 MiB/s) with 1 file(s) remainingCompleted 86.0 MiB/135.8 MiB (142.3 MiB/s) with 1 file(s) remainingCompleted 86.2 MiB/135.8 MiB (142.5 MiB/s) with 1 file(s) remainingCompleted 86.5 MiB/135.8 MiB (142.7 MiB/s) with 1 file(s) remainingCompleted 86.8 MiB/135.8 MiB (142.9 MiB/s) with 1 file(s) remainingCompleted 87.0 MiB/135.8 MiB (143.1 MiB/s) with 1 file(s) remainingCompleted 87.2 MiB/135.8 MiB (143.2 MiB/s) with 1 file(s) remainingCompleted 87.5 MiB/135.8 MiB (143.4 MiB/s) with 1 file(s) remainingCompleted 87.8 MiB/135.8 MiB (143.6 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_136.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_136/001_data_files/2023_02_16_21_supp_data_136.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_165.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_165.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_165/001_data_files/2023_02_16_22_supp_data_165.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_137.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_137.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_137/001_data_files/2023_02_16_21_supp_data_137.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/203.0 MiB (1.9 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/203.0 MiB (3.5 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/203.0 MiB (5.1 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/203.0 MiB (6.7 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/203.0 MiB (8.1 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/203.0 MiB (9.6 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/203.0 MiB (11.0 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/203.0 MiB (12.3 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/203.0 MiB (13.7 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/203.0 MiB (15.2 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/203.0 MiB (16.6 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/203.0 MiB (18.0 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/203.0 MiB (19.5 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/203.0 MiB (20.9 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/203.0 MiB (22.0 MiB/s) with 1 file(s) remain

Completed 44.5 MiB/203.0 MiB (132.8 MiB/s) with 1 file(s) remainingCompleted 44.8 MiB/203.0 MiB (133.3 MiB/s) with 1 file(s) remainingCompleted 45.0 MiB/203.0 MiB (133.5 MiB/s) with 1 file(s) remainingCompleted 45.2 MiB/203.0 MiB (134.2 MiB/s) with 1 file(s) remainingCompleted 45.5 MiB/203.0 MiB (134.4 MiB/s) with 1 file(s) remainingCompleted 45.8 MiB/203.0 MiB (134.5 MiB/s) with 1 file(s) remainingCompleted 46.0 MiB/203.0 MiB (135.0 MiB/s) with 1 file(s) remainingCompleted 46.2 MiB/203.0 MiB (135.7 MiB/s) with 1 file(s) remainingCompleted 46.5 MiB/203.0 MiB (135.9 MiB/s) with 1 file(s) remainingCompleted 46.8 MiB/203.0 MiB (135.9 MiB/s) with 1 file(s) remainingCompleted 47.0 MiB/203.0 MiB (136.6 MiB/s) with 1 file(s) remainingCompleted 47.2 MiB/203.0 MiB (137.0 MiB/s) with 1 file(s) remainingCompleted 47.5 MiB/203.0 MiB (137.2 MiB/s) with 1 file(s) remainingCompleted 47.8 MiB/203.0 MiB (137.3 MiB/s) with 1 file(s) remainingCompleted 48.0 MiB/203.0 MiB (137.8 MiB/s) with 

Completed 88.2 MiB/203.0 MiB (148.2 MiB/s) with 1 file(s) remainingCompleted 88.5 MiB/203.0 MiB (147.6 MiB/s) with 1 file(s) remainingCompleted 88.8 MiB/203.0 MiB (147.2 MiB/s) with 1 file(s) remainingCompleted 89.0 MiB/203.0 MiB (147.3 MiB/s) with 1 file(s) remainingCompleted 89.2 MiB/203.0 MiB (147.6 MiB/s) with 1 file(s) remainingCompleted 89.5 MiB/203.0 MiB (147.4 MiB/s) with 1 file(s) remainingCompleted 89.8 MiB/203.0 MiB (147.7 MiB/s) with 1 file(s) remainingCompleted 90.0 MiB/203.0 MiB (147.9 MiB/s) with 1 file(s) remainingCompleted 90.2 MiB/203.0 MiB (147.6 MiB/s) with 1 file(s) remainingCompleted 90.5 MiB/203.0 MiB (147.4 MiB/s) with 1 file(s) remainingCompleted 90.8 MiB/203.0 MiB (147.3 MiB/s) with 1 file(s) remainingCompleted 91.0 MiB/203.0 MiB (147.4 MiB/s) with 1 file(s) remainingCompleted 91.2 MiB/203.0 MiB (147.8 MiB/s) with 1 file(s) remainingCompleted 91.5 MiB/203.0 MiB (148.1 MiB/s) with 1 file(s) remainingCompleted 91.8 MiB/203.0 MiB (147.4 MiB/s) with 

Completed 133.8 MiB/203.0 MiB (168.1 MiB/s) with 1 file(s) remainingCompleted 134.0 MiB/203.0 MiB (168.2 MiB/s) with 1 file(s) remainingCompleted 134.2 MiB/203.0 MiB (168.5 MiB/s) with 1 file(s) remainingCompleted 134.5 MiB/203.0 MiB (168.2 MiB/s) with 1 file(s) remainingCompleted 134.8 MiB/203.0 MiB (168.5 MiB/s) with 1 file(s) remainingCompleted 135.0 MiB/203.0 MiB (168.5 MiB/s) with 1 file(s) remainingCompleted 135.2 MiB/203.0 MiB (168.7 MiB/s) with 1 file(s) remainingCompleted 135.5 MiB/203.0 MiB (169.0 MiB/s) with 1 file(s) remainingCompleted 135.8 MiB/203.0 MiB (169.1 MiB/s) with 1 file(s) remainingCompleted 136.0 MiB/203.0 MiB (169.4 MiB/s) with 1 file(s) remainingCompleted 136.2 MiB/203.0 MiB (169.4 MiB/s) with 1 file(s) remainingCompleted 136.5 MiB/203.0 MiB (169.5 MiB/s) with 1 file(s) remainingCompleted 136.8 MiB/203.0 MiB (169.6 MiB/s) with 1 file(s) remainingCompleted 137.0 MiB/203.0 MiB (169.4 MiB/s) with 1 file(s) remainingCompleted 137.2 MiB/203.0 MiB (169

Completed 168.2 MiB/203.0 MiB (165.1 MiB/s) with 1 file(s) remainingCompleted 168.5 MiB/203.0 MiB (165.1 MiB/s) with 1 file(s) remainingCompleted 168.8 MiB/203.0 MiB (165.2 MiB/s) with 1 file(s) remainingCompleted 169.0 MiB/203.0 MiB (165.0 MiB/s) with 1 file(s) remainingCompleted 169.2 MiB/203.0 MiB (165.1 MiB/s) with 1 file(s) remainingCompleted 169.5 MiB/203.0 MiB (165.2 MiB/s) with 1 file(s) remainingCompleted 169.8 MiB/203.0 MiB (165.3 MiB/s) with 1 file(s) remainingCompleted 170.0 MiB/203.0 MiB (165.4 MiB/s) with 1 file(s) remainingCompleted 170.2 MiB/203.0 MiB (165.5 MiB/s) with 1 file(s) remainingCompleted 170.5 MiB/203.0 MiB (165.6 MiB/s) with 1 file(s) remainingCompleted 170.8 MiB/203.0 MiB (165.7 MiB/s) with 1 file(s) remainingCompleted 171.0 MiB/203.0 MiB (165.8 MiB/s) with 1 file(s) remainingCompleted 171.2 MiB/203.0 MiB (166.0 MiB/s) with 1 file(s) remainingCompleted 171.5 MiB/203.0 MiB (165.9 MiB/s) with 1 file(s) remainingCompleted 171.8 MiB/203.0 MiB (166

upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_185.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_185/001_data_files/2023_02_16_22_supp_data_185.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_186.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_186.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_186/001_data_files/2023_02_16_22_supp_data_186.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_187.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_187.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_187/001_data_files/2023_02_16_22_supp_data_187.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/214.1 MiB (2.0 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/214.1 MiB (4.0 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/214.1 MiB (5.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/214.1 MiB (7.7 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/214.1 MiB (9.5 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/214.1 MiB (11.3 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/214.1 MiB (13.1 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/214.1 MiB (14.8 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/214.1 MiB (16.5 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/214.1 MiB (18.2 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/214.1 MiB (19.8 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/214.1 MiB (21.4 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/214.1 MiB (23.0 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/214.1 MiB (24.6 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/214.1 MiB (26.2 MiB/s) with 1 file(s) remain

Completed 49.2 MiB/214.1 MiB (151.7 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/214.1 MiB (151.7 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/214.1 MiB (152.2 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/214.1 MiB (151.2 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/214.1 MiB (151.8 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/214.1 MiB (152.4 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/214.1 MiB (153.1 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/214.1 MiB (153.5 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/214.1 MiB (153.7 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/214.1 MiB (154.4 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/214.1 MiB (154.5 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/214.1 MiB (154.2 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/214.1 MiB (154.8 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/214.1 MiB (154.6 MiB/s) with 1 file(s) remainingCompleted 52.8 MiB/214.1 MiB (155.2 MiB/s) with 

Completed 90.0 MiB/214.1 MiB (169.5 MiB/s) with 1 file(s) remainingCompleted 90.2 MiB/214.1 MiB (169.7 MiB/s) with 1 file(s) remainingCompleted 90.5 MiB/214.1 MiB (169.9 MiB/s) with 1 file(s) remainingCompleted 90.8 MiB/214.1 MiB (170.2 MiB/s) with 1 file(s) remainingCompleted 91.0 MiB/214.1 MiB (170.5 MiB/s) with 1 file(s) remainingCompleted 91.2 MiB/214.1 MiB (170.6 MiB/s) with 1 file(s) remainingCompleted 91.5 MiB/214.1 MiB (170.3 MiB/s) with 1 file(s) remainingCompleted 91.8 MiB/214.1 MiB (170.5 MiB/s) with 1 file(s) remainingCompleted 92.0 MiB/214.1 MiB (170.7 MiB/s) with 1 file(s) remainingCompleted 92.2 MiB/214.1 MiB (170.9 MiB/s) with 1 file(s) remainingCompleted 92.5 MiB/214.1 MiB (171.1 MiB/s) with 1 file(s) remainingCompleted 92.8 MiB/214.1 MiB (171.4 MiB/s) with 1 file(s) remainingCompleted 93.0 MiB/214.1 MiB (171.7 MiB/s) with 1 file(s) remainingCompleted 93.2 MiB/214.1 MiB (171.9 MiB/s) with 1 file(s) remainingCompleted 93.5 MiB/214.1 MiB (172.1 MiB/s) with 

Completed 134.2 MiB/214.1 MiB (183.3 MiB/s) with 1 file(s) remainingCompleted 134.5 MiB/214.1 MiB (183.1 MiB/s) with 1 file(s) remainingCompleted 134.8 MiB/214.1 MiB (183.4 MiB/s) with 1 file(s) remainingCompleted 135.0 MiB/214.1 MiB (183.8 MiB/s) with 1 file(s) remainingCompleted 135.2 MiB/214.1 MiB (184.1 MiB/s) with 1 file(s) remainingCompleted 135.5 MiB/214.1 MiB (184.2 MiB/s) with 1 file(s) remainingCompleted 135.8 MiB/214.1 MiB (184.1 MiB/s) with 1 file(s) remainingCompleted 136.0 MiB/214.1 MiB (184.4 MiB/s) with 1 file(s) remainingCompleted 136.2 MiB/214.1 MiB (184.5 MiB/s) with 1 file(s) remainingCompleted 136.5 MiB/214.1 MiB (183.9 MiB/s) with 1 file(s) remainingCompleted 136.8 MiB/214.1 MiB (184.1 MiB/s) with 1 file(s) remainingCompleted 137.0 MiB/214.1 MiB (184.3 MiB/s) with 1 file(s) remainingCompleted 137.2 MiB/214.1 MiB (184.4 MiB/s) with 1 file(s) remainingCompleted 137.5 MiB/214.1 MiB (184.6 MiB/s) with 1 file(s) remainingCompleted 137.8 MiB/214.1 MiB (184

Completed 176.2 MiB/214.1 MiB (185.7 MiB/s) with 1 file(s) remainingCompleted 176.5 MiB/214.1 MiB (185.6 MiB/s) with 1 file(s) remainingCompleted 176.8 MiB/214.1 MiB (185.3 MiB/s) with 1 file(s) remainingCompleted 177.0 MiB/214.1 MiB (185.3 MiB/s) with 1 file(s) remainingCompleted 177.2 MiB/214.1 MiB (185.1 MiB/s) with 1 file(s) remainingCompleted 177.5 MiB/214.1 MiB (185.0 MiB/s) with 1 file(s) remainingCompleted 177.8 MiB/214.1 MiB (185.0 MiB/s) with 1 file(s) remainingCompleted 178.0 MiB/214.1 MiB (184.7 MiB/s) with 1 file(s) remainingCompleted 178.2 MiB/214.1 MiB (184.7 MiB/s) with 1 file(s) remainingCompleted 178.5 MiB/214.1 MiB (185.0 MiB/s) with 1 file(s) remainingCompleted 178.8 MiB/214.1 MiB (185.0 MiB/s) with 1 file(s) remainingCompleted 179.0 MiB/214.1 MiB (185.1 MiB/s) with 1 file(s) remainingCompleted 179.2 MiB/214.1 MiB (184.2 MiB/s) with 1 file(s) remainingCompleted 179.5 MiB/214.1 MiB (184.4 MiB/s) with 1 file(s) remainingCompleted 179.8 MiB/214.1 MiB (184

upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_206.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_206/001_data_files/2023_02_16_23_supp_data_206.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_supp_data_207.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_207.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_207/001_data_files/2023_02_16_23_supp_data_207.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_supp_data_208.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_208.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_208/001_data_files/2023_02_16_23_supp_data_208.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/174.9 MiB (2.0 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/174.9 MiB (3.9 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/174.9 MiB (5.8 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/174.9 MiB (7.6 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/174.9 MiB (8.8 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/174.9 MiB (10.5 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/174.9 MiB (12.2 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/174.9 MiB (13.8 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/174.9 MiB (15.3 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/174.9 MiB (16.8 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/174.9 MiB (18.4 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/174.9 MiB (19.9 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/174.9 MiB (21.5 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/174.9 MiB (23.0 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/174.9 MiB (24.6 MiB/s) with 1 file(s) remain

Completed 48.5 MiB/174.9 MiB (147.7 MiB/s) with 1 file(s) remainingCompleted 48.8 MiB/174.9 MiB (148.2 MiB/s) with 1 file(s) remainingCompleted 49.0 MiB/174.9 MiB (148.9 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/174.9 MiB (149.4 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/174.9 MiB (150.0 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/174.9 MiB (150.0 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/174.9 MiB (149.9 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/174.9 MiB (150.3 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/174.9 MiB (150.4 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/174.9 MiB (150.4 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/174.9 MiB (150.1 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/174.9 MiB (150.5 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/174.9 MiB (151.2 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/174.9 MiB (151.3 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/174.9 MiB (152.0 MiB/s) with 

Completed 91.2 MiB/174.9 MiB (171.4 MiB/s) with 1 file(s) remainingCompleted 91.5 MiB/174.9 MiB (171.7 MiB/s) with 1 file(s) remainingCompleted 91.8 MiB/174.9 MiB (172.0 MiB/s) with 1 file(s) remainingCompleted 92.0 MiB/174.9 MiB (171.9 MiB/s) with 1 file(s) remainingCompleted 92.2 MiB/174.9 MiB (172.3 MiB/s) with 1 file(s) remainingCompleted 92.5 MiB/174.9 MiB (172.7 MiB/s) with 1 file(s) remainingCompleted 92.8 MiB/174.9 MiB (172.4 MiB/s) with 1 file(s) remainingCompleted 93.0 MiB/174.9 MiB (172.7 MiB/s) with 1 file(s) remainingCompleted 93.2 MiB/174.9 MiB (173.1 MiB/s) with 1 file(s) remainingCompleted 93.5 MiB/174.9 MiB (172.5 MiB/s) with 1 file(s) remainingCompleted 93.8 MiB/174.9 MiB (172.9 MiB/s) with 1 file(s) remainingCompleted 94.0 MiB/174.9 MiB (173.3 MiB/s) with 1 file(s) remainingCompleted 94.2 MiB/174.9 MiB (170.9 MiB/s) with 1 file(s) remainingCompleted 94.5 MiB/174.9 MiB (171.3 MiB/s) with 1 file(s) remainingCompleted 94.8 MiB/174.9 MiB (171.5 MiB/s) with 

Completed 135.0 MiB/174.9 MiB (183.7 MiB/s) with 1 file(s) remainingCompleted 135.2 MiB/174.9 MiB (183.9 MiB/s) with 1 file(s) remainingCompleted 135.5 MiB/174.9 MiB (183.8 MiB/s) with 1 file(s) remainingCompleted 135.8 MiB/174.9 MiB (184.1 MiB/s) with 1 file(s) remainingCompleted 136.0 MiB/174.9 MiB (184.3 MiB/s) with 1 file(s) remainingCompleted 136.2 MiB/174.9 MiB (184.1 MiB/s) with 1 file(s) remainingCompleted 136.5 MiB/174.9 MiB (184.2 MiB/s) with 1 file(s) remainingCompleted 136.8 MiB/174.9 MiB (184.5 MiB/s) with 1 file(s) remainingCompleted 137.0 MiB/174.9 MiB (184.9 MiB/s) with 1 file(s) remainingCompleted 137.2 MiB/174.9 MiB (185.0 MiB/s) with 1 file(s) remainingCompleted 137.5 MiB/174.9 MiB (185.2 MiB/s) with 1 file(s) remainingCompleted 137.8 MiB/174.9 MiB (184.8 MiB/s) with 1 file(s) remainingCompleted 138.0 MiB/174.9 MiB (184.7 MiB/s) with 1 file(s) remainingCompleted 138.2 MiB/174.9 MiB (184.8 MiB/s) with 1 file(s) remainingCompleted 138.5 MiB/174.9 MiB (185

upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_228.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_228/001_data_files/2023_02_17_00_supp_data_228.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_17_00_supp_data_229.parquet
upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_229.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_229/001_data_files/2023_02_17_00_supp_data_229.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_17_00_supp_data_23.parquet
upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_23.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_23/001_data_files/2023_02_17_00_supp_data_23.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_

Completed 256.0 KiB/170.3 MiB (2.3 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/170.3 MiB (4.4 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/170.3 MiB (6.5 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/170.3 MiB (8.6 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/170.3 MiB (10.6 MiB/s) with 1 file(s) remaining Completed 1.5 MiB/170.3 MiB (12.6 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/170.3 MiB (14.6 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/170.3 MiB (16.6 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/170.3 MiB (18.5 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/170.3 MiB (20.1 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/170.3 MiB (21.1 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/170.3 MiB (22.9 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/170.3 MiB (24.7 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/170.3 MiB (26.6 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/170.3 MiB (28.4 MiB/s) with 1 file(s) remain

Completed 49.0 MiB/170.3 MiB (155.4 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/170.3 MiB (156.1 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/170.3 MiB (156.7 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/170.3 MiB (157.2 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/170.3 MiB (157.6 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/170.3 MiB (158.3 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/170.3 MiB (157.6 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/170.3 MiB (158.3 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/170.3 MiB (158.7 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/170.3 MiB (156.5 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/170.3 MiB (157.1 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/170.3 MiB (157.5 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/170.3 MiB (158.0 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/170.3 MiB (158.7 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/170.3 MiB (159.5 MiB/s) with 

Completed 96.2 MiB/170.3 MiB (179.7 MiB/s) with 1 file(s) remainingCompleted 96.5 MiB/170.3 MiB (180.0 MiB/s) with 1 file(s) remainingCompleted 96.8 MiB/170.3 MiB (180.1 MiB/s) with 1 file(s) remainingCompleted 97.0 MiB/170.3 MiB (180.3 MiB/s) with 1 file(s) remainingCompleted 97.2 MiB/170.3 MiB (180.3 MiB/s) with 1 file(s) remainingCompleted 97.5 MiB/170.3 MiB (180.3 MiB/s) with 1 file(s) remainingCompleted 97.8 MiB/170.3 MiB (180.5 MiB/s) with 1 file(s) remainingCompleted 98.0 MiB/170.3 MiB (180.6 MiB/s) with 1 file(s) remainingCompleted 98.2 MiB/170.3 MiB (179.9 MiB/s) with 1 file(s) remainingCompleted 98.5 MiB/170.3 MiB (180.2 MiB/s) with 1 file(s) remainingCompleted 98.8 MiB/170.3 MiB (180.3 MiB/s) with 1 file(s) remainingCompleted 99.0 MiB/170.3 MiB (180.3 MiB/s) with 1 file(s) remainingCompleted 99.2 MiB/170.3 MiB (180.5 MiB/s) with 1 file(s) remainingCompleted 99.5 MiB/170.3 MiB (180.9 MiB/s) with 1 file(s) remainingCompleted 99.8 MiB/170.3 MiB (180.7 MiB/s) with 

Completed 140.0 MiB/170.3 MiB (189.7 MiB/s) with 1 file(s) remainingCompleted 140.2 MiB/170.3 MiB (190.1 MiB/s) with 1 file(s) remainingCompleted 140.5 MiB/170.3 MiB (190.2 MiB/s) with 1 file(s) remainingCompleted 140.8 MiB/170.3 MiB (190.5 MiB/s) with 1 file(s) remainingCompleted 141.0 MiB/170.3 MiB (190.7 MiB/s) with 1 file(s) remainingCompleted 141.2 MiB/170.3 MiB (190.7 MiB/s) with 1 file(s) remainingCompleted 141.5 MiB/170.3 MiB (191.0 MiB/s) with 1 file(s) remainingCompleted 141.8 MiB/170.3 MiB (190.7 MiB/s) with 1 file(s) remainingCompleted 142.0 MiB/170.3 MiB (191.0 MiB/s) with 1 file(s) remainingCompleted 142.2 MiB/170.3 MiB (190.9 MiB/s) with 1 file(s) remainingCompleted 142.5 MiB/170.3 MiB (191.0 MiB/s) with 1 file(s) remainingCompleted 142.8 MiB/170.3 MiB (191.2 MiB/s) with 1 file(s) remainingCompleted 143.0 MiB/170.3 MiB (191.4 MiB/s) with 1 file(s) remainingCompleted 143.2 MiB/170.3 MiB (191.6 MiB/s) with 1 file(s) remainingCompleted 143.5 MiB/170.3 MiB (191

upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_25.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_25/001_data_files/2023_02_17_00_supp_data_25.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_17_00_supp_data_250.parquet
upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_250.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_250/001_data_files/2023_02_17_00_supp_data_250.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_17_00_supp_data_251.parquet
upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_251.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_251/001_data_files/2023_02_17_00_supp_data_251.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023

Completed 256.0 KiB/120.3 MiB (2.0 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/120.3 MiB (3.8 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/120.3 MiB (5.7 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/120.3 MiB (7.5 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/120.3 MiB (9.3 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/120.3 MiB (11.1 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/120.3 MiB (12.8 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/120.3 MiB (14.5 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/120.3 MiB (16.2 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/120.3 MiB (17.9 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/120.3 MiB (19.2 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/120.3 MiB (20.9 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/120.3 MiB (22.5 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/120.3 MiB (24.1 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/120.3 MiB (25.7 MiB/s) with 1 file(s) remain

Completed 49.5 MiB/120.3 MiB (150.2 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/120.3 MiB (150.8 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/120.3 MiB (151.5 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/120.3 MiB (151.4 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/120.3 MiB (151.9 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/120.3 MiB (151.3 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/120.3 MiB (151.9 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/120.3 MiB (151.5 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/120.3 MiB (152.1 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/120.3 MiB (152.6 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/120.3 MiB (153.1 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/120.3 MiB (153.5 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/120.3 MiB (153.9 MiB/s) with 1 file(s) remainingCompleted 52.8 MiB/120.3 MiB (154.3 MiB/s) with 1 file(s) remainingCompleted 53.0 MiB/120.3 MiB (153.9 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_271.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_271/001_data_files/2023_02_17_00_supp_data_271.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_17_00_supp_data_272.parquet
upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_272.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_272/001_data_files/2023_02_17_00_supp_data_272.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_17_00_supp_data_273.parquet
upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_273.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_273/001_data_files/2023_02_17_00_supp_data_273.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/156.4 MiB (1.6 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/156.4 MiB (3.2 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/156.4 MiB (4.8 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/156.4 MiB (6.4 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/156.4 MiB (8.0 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/156.4 MiB (9.5 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/156.4 MiB (11.1 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/156.4 MiB (12.6 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/156.4 MiB (14.1 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/156.4 MiB (15.6 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/156.4 MiB (17.1 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/156.4 MiB (18.5 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/156.4 MiB (19.9 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/156.4 MiB (21.2 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/156.4 MiB (22.4 MiB/s) with 1 file(s) remain

Completed 49.8 MiB/156.4 MiB (139.7 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/156.4 MiB (140.2 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/156.4 MiB (140.8 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/156.4 MiB (141.1 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/156.4 MiB (140.7 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/156.4 MiB (141.2 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/156.4 MiB (141.8 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/156.4 MiB (142.3 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/156.4 MiB (142.1 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/156.4 MiB (141.8 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/156.4 MiB (142.1 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/156.4 MiB (142.6 MiB/s) with 1 file(s) remainingCompleted 52.8 MiB/156.4 MiB (143.1 MiB/s) with 1 file(s) remainingCompleted 53.0 MiB/156.4 MiB (142.2 MiB/s) with 1 file(s) remainingCompleted 53.2 MiB/156.4 MiB (142.9 MiB/s) with 

Completed 96.2 MiB/156.4 MiB (160.4 MiB/s) with 1 file(s) remainingCompleted 96.5 MiB/156.4 MiB (160.6 MiB/s) with 1 file(s) remainingCompleted 96.8 MiB/156.4 MiB (160.6 MiB/s) with 1 file(s) remainingCompleted 97.0 MiB/156.4 MiB (160.2 MiB/s) with 1 file(s) remainingCompleted 97.2 MiB/156.4 MiB (160.4 MiB/s) with 1 file(s) remainingCompleted 97.5 MiB/156.4 MiB (160.6 MiB/s) with 1 file(s) remainingCompleted 97.8 MiB/156.4 MiB (160.8 MiB/s) with 1 file(s) remainingCompleted 98.0 MiB/156.4 MiB (161.0 MiB/s) with 1 file(s) remainingCompleted 98.2 MiB/156.4 MiB (161.2 MiB/s) with 1 file(s) remainingCompleted 98.5 MiB/156.4 MiB (161.2 MiB/s) with 1 file(s) remainingCompleted 98.8 MiB/156.4 MiB (161.3 MiB/s) with 1 file(s) remainingCompleted 99.0 MiB/156.4 MiB (161.6 MiB/s) with 1 file(s) remainingCompleted 99.2 MiB/156.4 MiB (161.0 MiB/s) with 1 file(s) remainingCompleted 99.5 MiB/156.4 MiB (160.8 MiB/s) with 1 file(s) remainingCompleted 99.8 MiB/156.4 MiB (161.0 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_293.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_293/001_data_files/2023_02_16_21_supp_data_293.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_294.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_294.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_294/001_data_files/2023_02_16_21_supp_data_294.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_295.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_295.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_295/001_data_files/2023_02_16_21_supp_data_295.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/151.5 MiB (1.8 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/151.5 MiB (3.6 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/151.5 MiB (5.4 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/151.5 MiB (7.2 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/151.5 MiB (8.9 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/151.5 MiB (10.6 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/151.5 MiB (12.3 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/151.5 MiB (14.0 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/151.5 MiB (15.6 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/151.5 MiB (17.3 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/151.5 MiB (18.9 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/151.5 MiB (20.5 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/151.5 MiB (22.1 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/151.5 MiB (23.5 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/151.5 MiB (25.0 MiB/s) with 1 file(s) remain

Completed 49.0 MiB/151.5 MiB (145.4 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/151.5 MiB (146.0 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/151.5 MiB (146.3 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/151.5 MiB (146.3 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/151.5 MiB (146.6 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/151.5 MiB (147.2 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/151.5 MiB (146.2 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/151.5 MiB (146.6 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/151.5 MiB (146.8 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/151.5 MiB (146.8 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/151.5 MiB (147.5 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/151.5 MiB (147.9 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/151.5 MiB (148.1 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/151.5 MiB (148.8 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/151.5 MiB (148.9 MiB/s) with 

Completed 96.2 MiB/151.5 MiB (170.3 MiB/s) with 1 file(s) remainingCompleted 96.5 MiB/151.5 MiB (170.3 MiB/s) with 1 file(s) remainingCompleted 96.8 MiB/151.5 MiB (170.5 MiB/s) with 1 file(s) remainingCompleted 97.0 MiB/151.5 MiB (170.6 MiB/s) with 1 file(s) remainingCompleted 97.2 MiB/151.5 MiB (170.4 MiB/s) with 1 file(s) remainingCompleted 97.5 MiB/151.5 MiB (170.6 MiB/s) with 1 file(s) remainingCompleted 97.8 MiB/151.5 MiB (170.7 MiB/s) with 1 file(s) remainingCompleted 98.0 MiB/151.5 MiB (171.1 MiB/s) with 1 file(s) remainingCompleted 98.2 MiB/151.5 MiB (171.0 MiB/s) with 1 file(s) remainingCompleted 98.5 MiB/151.5 MiB (170.2 MiB/s) with 1 file(s) remainingCompleted 98.8 MiB/151.5 MiB (170.6 MiB/s) with 1 file(s) remainingCompleted 99.0 MiB/151.5 MiB (170.8 MiB/s) with 1 file(s) remainingCompleted 99.2 MiB/151.5 MiB (170.2 MiB/s) with 1 file(s) remainingCompleted 99.5 MiB/151.5 MiB (170.4 MiB/s) with 1 file(s) remainingCompleted 99.8 MiB/151.5 MiB (170.6 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_314.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_314/001_data_files/2023_02_16_21_supp_data_314.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_315.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_315.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_315/001_data_files/2023_02_16_21_supp_data_315.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_316.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_316.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_316/001_data_files/2023_02_16_22_supp_data_316.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/174.2 MiB (1.7 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/174.2 MiB (3.3 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/174.2 MiB (4.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/174.2 MiB (6.5 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/174.2 MiB (8.0 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/174.2 MiB (9.5 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/174.2 MiB (11.0 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/174.2 MiB (12.5 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/174.2 MiB (14.0 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/174.2 MiB (15.4 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/174.2 MiB (16.8 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/174.2 MiB (18.2 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/174.2 MiB (19.5 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/174.2 MiB (21.0 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/174.2 MiB (22.4 MiB/s) with 1 file(s) remain

Completed 49.2 MiB/174.2 MiB (140.9 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/174.2 MiB (141.5 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/174.2 MiB (141.8 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/174.2 MiB (142.2 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/174.2 MiB (142.5 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/174.2 MiB (142.7 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/174.2 MiB (143.2 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/174.2 MiB (142.5 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/174.2 MiB (143.0 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/174.2 MiB (143.6 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/174.2 MiB (144.2 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/174.2 MiB (144.6 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/174.2 MiB (145.1 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/174.2 MiB (144.8 MiB/s) with 1 file(s) remainingCompleted 52.8 MiB/174.2 MiB (145.3 MiB/s) with 

Completed 88.2 MiB/174.2 MiB (158.8 MiB/s) with 1 file(s) remainingCompleted 88.5 MiB/174.2 MiB (159.0 MiB/s) with 1 file(s) remainingCompleted 88.8 MiB/174.2 MiB (159.0 MiB/s) with 1 file(s) remainingCompleted 89.0 MiB/174.2 MiB (158.4 MiB/s) with 1 file(s) remainingCompleted 89.2 MiB/174.2 MiB (158.5 MiB/s) with 1 file(s) remainingCompleted 89.5 MiB/174.2 MiB (158.5 MiB/s) with 1 file(s) remainingCompleted 89.8 MiB/174.2 MiB (158.2 MiB/s) with 1 file(s) remainingCompleted 90.0 MiB/174.2 MiB (158.3 MiB/s) with 1 file(s) remainingCompleted 90.2 MiB/174.2 MiB (158.5 MiB/s) with 1 file(s) remainingCompleted 90.5 MiB/174.2 MiB (158.2 MiB/s) with 1 file(s) remainingCompleted 90.8 MiB/174.2 MiB (158.2 MiB/s) with 1 file(s) remainingCompleted 91.0 MiB/174.2 MiB (158.3 MiB/s) with 1 file(s) remainingCompleted 91.2 MiB/174.2 MiB (158.4 MiB/s) with 1 file(s) remainingCompleted 91.5 MiB/174.2 MiB (158.6 MiB/s) with 1 file(s) remainingCompleted 91.8 MiB/174.2 MiB (158.8 MiB/s) with 

Completed 129.0 MiB/174.2 MiB (170.6 MiB/s) with 1 file(s) remainingCompleted 129.2 MiB/174.2 MiB (170.6 MiB/s) with 1 file(s) remainingCompleted 129.5 MiB/174.2 MiB (170.7 MiB/s) with 1 file(s) remainingCompleted 129.8 MiB/174.2 MiB (170.7 MiB/s) with 1 file(s) remainingCompleted 130.0 MiB/174.2 MiB (171.0 MiB/s) with 1 file(s) remainingCompleted 130.2 MiB/174.2 MiB (171.0 MiB/s) with 1 file(s) remainingCompleted 130.5 MiB/174.2 MiB (170.9 MiB/s) with 1 file(s) remainingCompleted 130.8 MiB/174.2 MiB (171.2 MiB/s) with 1 file(s) remainingCompleted 131.0 MiB/174.2 MiB (171.4 MiB/s) with 1 file(s) remainingCompleted 131.2 MiB/174.2 MiB (171.4 MiB/s) with 1 file(s) remainingCompleted 131.5 MiB/174.2 MiB (171.6 MiB/s) with 1 file(s) remainingCompleted 131.8 MiB/174.2 MiB (171.8 MiB/s) with 1 file(s) remainingCompleted 132.0 MiB/174.2 MiB (171.9 MiB/s) with 1 file(s) remainingCompleted 132.2 MiB/174.2 MiB (171.6 MiB/s) with 1 file(s) remainingCompleted 132.5 MiB/174.2 MiB (171

upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_336.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_336/001_data_files/2023_02_16_22_supp_data_336.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_337.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_337.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_337/001_data_files/2023_02_16_22_supp_data_337.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_338.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_338.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_338/001_data_files/2023_02_16_22_supp_data_338.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/128.1 MiB (2.0 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/128.1 MiB (4.1 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/128.1 MiB (6.0 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/128.1 MiB (7.9 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/128.1 MiB (9.8 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/128.1 MiB (11.5 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/128.1 MiB (13.0 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/128.1 MiB (14.8 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/128.1 MiB (16.5 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/128.1 MiB (18.3 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/128.1 MiB (20.0 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/128.1 MiB (21.6 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/128.1 MiB (23.3 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/128.1 MiB (24.9 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/128.1 MiB (26.4 MiB/s) with 1 file(s) remain

Completed 50.5 MiB/128.1 MiB (155.4 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/128.1 MiB (155.8 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/128.1 MiB (156.4 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/128.1 MiB (156.6 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/128.1 MiB (157.0 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/128.1 MiB (157.5 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/128.1 MiB (158.0 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/128.1 MiB (158.3 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/128.1 MiB (159.0 MiB/s) with 1 file(s) remainingCompleted 52.8 MiB/128.1 MiB (158.0 MiB/s) with 1 file(s) remainingCompleted 53.0 MiB/128.1 MiB (158.3 MiB/s) with 1 file(s) remainingCompleted 53.2 MiB/128.1 MiB (159.0 MiB/s) with 1 file(s) remainingCompleted 53.5 MiB/128.1 MiB (159.4 MiB/s) with 1 file(s) remainingCompleted 53.8 MiB/128.1 MiB (159.6 MiB/s) with 1 file(s) remainingCompleted 54.0 MiB/128.1 MiB (160.3 MiB/s) with 

Completed 88.2 MiB/128.1 MiB (155.8 MiB/s) with 1 file(s) remainingCompleted 88.5 MiB/128.1 MiB (156.1 MiB/s) with 1 file(s) remainingCompleted 88.8 MiB/128.1 MiB (155.6 MiB/s) with 1 file(s) remainingCompleted 89.0 MiB/128.1 MiB (155.7 MiB/s) with 1 file(s) remainingCompleted 89.2 MiB/128.1 MiB (155.8 MiB/s) with 1 file(s) remainingCompleted 89.5 MiB/128.1 MiB (156.2 MiB/s) with 1 file(s) remainingCompleted 89.8 MiB/128.1 MiB (156.5 MiB/s) with 1 file(s) remainingCompleted 90.0 MiB/128.1 MiB (156.8 MiB/s) with 1 file(s) remainingCompleted 90.2 MiB/128.1 MiB (156.9 MiB/s) with 1 file(s) remainingCompleted 90.5 MiB/128.1 MiB (157.2 MiB/s) with 1 file(s) remainingCompleted 90.8 MiB/128.1 MiB (157.4 MiB/s) with 1 file(s) remainingCompleted 91.0 MiB/128.1 MiB (157.7 MiB/s) with 1 file(s) remainingCompleted 91.2 MiB/128.1 MiB (157.9 MiB/s) with 1 file(s) remainingCompleted 91.5 MiB/128.1 MiB (158.1 MiB/s) with 1 file(s) remainingCompleted 91.8 MiB/128.1 MiB (158.2 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_358.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_358/001_data_files/2023_02_16_23_supp_data_358.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_supp_data_359.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_359.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_359/001_data_files/2023_02_16_23_supp_data_359.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_supp_data_36.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_36.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_36/001_data_files/2023_02_16_23_supp_data_36.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_

Completed 256.0 KiB/145.1 MiB (2.1 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/145.1 MiB (4.0 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/145.1 MiB (5.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/145.1 MiB (7.8 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/145.1 MiB (9.7 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/145.1 MiB (11.5 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/145.1 MiB (13.3 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/145.1 MiB (15.0 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/145.1 MiB (16.8 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/145.1 MiB (18.4 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/145.1 MiB (20.0 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/145.1 MiB (21.5 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/145.1 MiB (23.0 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/145.1 MiB (24.7 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/145.1 MiB (26.2 MiB/s) with 1 file(s) remain

Completed 49.0 MiB/145.1 MiB (150.2 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/145.1 MiB (150.4 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/145.1 MiB (150.6 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/145.1 MiB (151.3 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/145.1 MiB (151.8 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/145.1 MiB (152.3 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/145.1 MiB (152.9 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/145.1 MiB (153.2 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/145.1 MiB (153.5 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/145.1 MiB (152.8 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/145.1 MiB (153.5 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/145.1 MiB (154.1 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/145.1 MiB (154.4 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/145.1 MiB (155.0 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/145.1 MiB (155.1 MiB/s) with 

Completed 96.8 MiB/145.1 MiB (181.2 MiB/s) with 1 file(s) remainingCompleted 97.0 MiB/145.1 MiB (181.4 MiB/s) with 1 file(s) remainingCompleted 97.2 MiB/145.1 MiB (181.7 MiB/s) with 1 file(s) remainingCompleted 97.5 MiB/145.1 MiB (181.2 MiB/s) with 1 file(s) remainingCompleted 97.8 MiB/145.1 MiB (181.4 MiB/s) with 1 file(s) remainingCompleted 98.0 MiB/145.1 MiB (181.6 MiB/s) with 1 file(s) remainingCompleted 98.2 MiB/145.1 MiB (180.5 MiB/s) with 1 file(s) remainingCompleted 98.5 MiB/145.1 MiB (180.8 MiB/s) with 1 file(s) remainingCompleted 98.8 MiB/145.1 MiB (181.0 MiB/s) with 1 file(s) remainingCompleted 99.0 MiB/145.1 MiB (181.3 MiB/s) with 1 file(s) remainingCompleted 99.2 MiB/145.1 MiB (181.6 MiB/s) with 1 file(s) remainingCompleted 99.5 MiB/145.1 MiB (181.8 MiB/s) with 1 file(s) remainingCompleted 99.8 MiB/145.1 MiB (182.0 MiB/s) with 1 file(s) remainingCompleted 100.0 MiB/145.1 MiB (182.2 MiB/s) with 1 file(s) remainingCompleted 100.2 MiB/145.1 MiB (182.5 MiB/s) wit

upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_38.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_38/001_data_files/2023_02_16_23_supp_data_38.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_supp_data_380.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_380.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_380/001_data_files/2023_02_16_23_supp_data_380.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_supp_data_381.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_381.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_381/001_data_files/2023_02_16_23_supp_data_381.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023

Completed 256.0 KiB/134.4 MiB (1.8 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/134.4 MiB (3.4 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/134.4 MiB (4.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/134.4 MiB (6.4 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/134.4 MiB (8.0 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/134.4 MiB (9.5 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/134.4 MiB (11.0 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/134.4 MiB (12.5 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/134.4 MiB (14.0 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/134.4 MiB (15.4 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/134.4 MiB (17.0 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/134.4 MiB (18.3 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/134.4 MiB (19.7 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/134.4 MiB (21.1 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/134.4 MiB (22.4 MiB/s) with 1 file(s) remain

Completed 46.2 MiB/134.4 MiB (135.8 MiB/s) with 1 file(s) remainingCompleted 46.5 MiB/134.4 MiB (136.1 MiB/s) with 1 file(s) remainingCompleted 46.8 MiB/134.4 MiB (136.6 MiB/s) with 1 file(s) remainingCompleted 47.0 MiB/134.4 MiB (137.3 MiB/s) with 1 file(s) remainingCompleted 47.2 MiB/134.4 MiB (137.8 MiB/s) with 1 file(s) remainingCompleted 47.5 MiB/134.4 MiB (137.1 MiB/s) with 1 file(s) remainingCompleted 47.8 MiB/134.4 MiB (137.5 MiB/s) with 1 file(s) remainingCompleted 48.0 MiB/134.4 MiB (138.2 MiB/s) with 1 file(s) remainingCompleted 48.2 MiB/134.4 MiB (138.4 MiB/s) with 1 file(s) remainingCompleted 48.5 MiB/134.4 MiB (138.6 MiB/s) with 1 file(s) remainingCompleted 48.8 MiB/134.4 MiB (139.1 MiB/s) with 1 file(s) remainingCompleted 49.0 MiB/134.4 MiB (139.5 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/134.4 MiB (140.1 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/134.4 MiB (140.5 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/134.4 MiB (140.7 MiB/s) with 

Completed 80.2 MiB/134.4 MiB (141.3 MiB/s) with 1 file(s) remainingCompleted 80.5 MiB/134.4 MiB (141.7 MiB/s) with 1 file(s) remainingCompleted 80.8 MiB/134.4 MiB (141.7 MiB/s) with 1 file(s) remainingCompleted 81.0 MiB/134.4 MiB (141.7 MiB/s) with 1 file(s) remainingCompleted 81.2 MiB/134.4 MiB (142.0 MiB/s) with 1 file(s) remainingCompleted 81.5 MiB/134.4 MiB (141.8 MiB/s) with 1 file(s) remainingCompleted 81.8 MiB/134.4 MiB (142.1 MiB/s) with 1 file(s) remainingCompleted 82.0 MiB/134.4 MiB (142.0 MiB/s) with 1 file(s) remainingCompleted 82.2 MiB/134.4 MiB (142.4 MiB/s) with 1 file(s) remainingCompleted 82.5 MiB/134.4 MiB (142.7 MiB/s) with 1 file(s) remainingCompleted 82.8 MiB/134.4 MiB (142.6 MiB/s) with 1 file(s) remainingCompleted 83.0 MiB/134.4 MiB (142.8 MiB/s) with 1 file(s) remainingCompleted 83.2 MiB/134.4 MiB (142.9 MiB/s) with 1 file(s) remainingCompleted 83.5 MiB/134.4 MiB (143.0 MiB/s) with 1 file(s) remainingCompleted 83.8 MiB/134.4 MiB (143.4 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_400.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_400/001_data_files/2023_02_17_00_supp_data_400.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_17_00_supp_data_401.parquet
upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_401.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_401/001_data_files/2023_02_17_00_supp_data_401.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_17_00_supp_data_402.parquet
upload: all_data_supp_data_live_deployment/2023_02_17_00_supp_data_402.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_402/001_data_files/2023_02_17_00_supp_data_402.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/132.5 MiB (2.0 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/132.5 MiB (4.0 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/132.5 MiB (5.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/132.5 MiB (7.8 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/132.5 MiB (9.3 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/132.5 MiB (11.0 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/132.5 MiB (12.8 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/132.5 MiB (14.5 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/132.5 MiB (16.1 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/132.5 MiB (17.7 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/132.5 MiB (19.3 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/132.5 MiB (20.9 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/132.5 MiB (22.4 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/132.5 MiB (23.9 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/132.5 MiB (25.5 MiB/s) with 1 file(s) remain

Completed 50.0 MiB/132.5 MiB (153.5 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/132.5 MiB (154.0 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/132.5 MiB (153.6 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/132.5 MiB (153.7 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/132.5 MiB (154.2 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/132.5 MiB (154.8 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/132.5 MiB (155.3 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/132.5 MiB (154.9 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/132.5 MiB (155.2 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/132.5 MiB (155.7 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/132.5 MiB (156.4 MiB/s) with 1 file(s) remainingCompleted 52.8 MiB/132.5 MiB (157.1 MiB/s) with 1 file(s) remainingCompleted 53.0 MiB/132.5 MiB (156.1 MiB/s) with 1 file(s) remainingCompleted 53.2 MiB/132.5 MiB (156.8 MiB/s) with 1 file(s) remainingCompleted 53.5 MiB/132.5 MiB (156.8 MiB/s) with 

Completed 88.2 MiB/132.5 MiB (159.7 MiB/s) with 1 file(s) remainingCompleted 88.5 MiB/132.5 MiB (159.6 MiB/s) with 1 file(s) remainingCompleted 88.8 MiB/132.5 MiB (159.4 MiB/s) with 1 file(s) remainingCompleted 89.0 MiB/132.5 MiB (158.0 MiB/s) with 1 file(s) remainingCompleted 89.2 MiB/132.5 MiB (158.2 MiB/s) with 1 file(s) remainingCompleted 89.5 MiB/132.5 MiB (158.1 MiB/s) with 1 file(s) remainingCompleted 89.8 MiB/132.5 MiB (158.3 MiB/s) with 1 file(s) remainingCompleted 90.0 MiB/132.5 MiB (158.7 MiB/s) with 1 file(s) remainingCompleted 90.2 MiB/132.5 MiB (158.7 MiB/s) with 1 file(s) remainingCompleted 90.5 MiB/132.5 MiB (159.0 MiB/s) with 1 file(s) remainingCompleted 90.8 MiB/132.5 MiB (159.1 MiB/s) with 1 file(s) remainingCompleted 91.0 MiB/132.5 MiB (159.1 MiB/s) with 1 file(s) remainingCompleted 91.2 MiB/132.5 MiB (159.3 MiB/s) with 1 file(s) remainingCompleted 91.5 MiB/132.5 MiB (159.5 MiB/s) with 1 file(s) remainingCompleted 91.8 MiB/132.5 MiB (159.9 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_422.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_422/001_data_files/2023_02_16_21_supp_data_422.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_423.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_423.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_423/001_data_files/2023_02_16_21_supp_data_423.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_424.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_424.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_424/001_data_files/2023_02_16_21_supp_data_424.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/157.9 MiB (1.9 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/157.9 MiB (3.8 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/157.9 MiB (5.4 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/157.9 MiB (7.1 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/157.9 MiB (8.6 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/157.9 MiB (10.3 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/157.9 MiB (11.7 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/157.9 MiB (13.4 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/157.9 MiB (14.9 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/157.9 MiB (16.5 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/157.9 MiB (18.1 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/157.9 MiB (19.5 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/157.9 MiB (21.0 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/157.9 MiB (22.5 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/157.9 MiB (23.9 MiB/s) with 1 file(s) remain

Completed 48.2 MiB/157.9 MiB (145.2 MiB/s) with 1 file(s) remainingCompleted 48.5 MiB/157.9 MiB (145.1 MiB/s) with 1 file(s) remainingCompleted 48.8 MiB/157.9 MiB (145.8 MiB/s) with 1 file(s) remainingCompleted 49.0 MiB/157.9 MiB (146.1 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/157.9 MiB (145.8 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/157.9 MiB (146.5 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/157.9 MiB (146.6 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/157.9 MiB (147.1 MiB/s) with 1 file(s) remainingCompleted 50.2 MiB/157.9 MiB (146.6 MiB/s) with 1 file(s) remainingCompleted 50.5 MiB/157.9 MiB (147.3 MiB/s) with 1 file(s) remainingCompleted 50.8 MiB/157.9 MiB (148.0 MiB/s) with 1 file(s) remainingCompleted 51.0 MiB/157.9 MiB (148.3 MiB/s) with 1 file(s) remainingCompleted 51.2 MiB/157.9 MiB (148.8 MiB/s) with 1 file(s) remainingCompleted 51.5 MiB/157.9 MiB (147.1 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/157.9 MiB (147.7 MiB/s) with 

Completed 94.2 MiB/157.9 MiB (176.4 MiB/s) with 1 file(s) remainingCompleted 94.5 MiB/157.9 MiB (176.6 MiB/s) with 1 file(s) remainingCompleted 94.8 MiB/157.9 MiB (176.9 MiB/s) with 1 file(s) remainingCompleted 95.0 MiB/157.9 MiB (177.1 MiB/s) with 1 file(s) remainingCompleted 95.2 MiB/157.9 MiB (177.4 MiB/s) with 1 file(s) remainingCompleted 95.5 MiB/157.9 MiB (177.6 MiB/s) with 1 file(s) remainingCompleted 95.8 MiB/157.9 MiB (177.8 MiB/s) with 1 file(s) remainingCompleted 96.0 MiB/157.9 MiB (178.0 MiB/s) with 1 file(s) remainingCompleted 96.2 MiB/157.9 MiB (169.4 MiB/s) with 1 file(s) remainingCompleted 96.5 MiB/157.9 MiB (168.6 MiB/s) with 1 file(s) remainingCompleted 96.8 MiB/157.9 MiB (168.9 MiB/s) with 1 file(s) remainingCompleted 97.0 MiB/157.9 MiB (169.0 MiB/s) with 1 file(s) remainingCompleted 97.2 MiB/157.9 MiB (169.2 MiB/s) with 1 file(s) remainingCompleted 97.5 MiB/157.9 MiB (169.6 MiB/s) with 1 file(s) remainingCompleted 97.8 MiB/157.9 MiB (169.7 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_444.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_444/001_data_files/2023_02_16_21_supp_data_444.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_445.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_445.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_445/001_data_files/2023_02_16_21_supp_data_445.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_21_supp_data_446.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_21_supp_data_446.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_446/001_data_files/2023_02_16_21_supp_data_446.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/150.1 MiB (1.7 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/150.1 MiB (3.3 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/150.1 MiB (4.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/150.1 MiB (6.5 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/150.1 MiB (8.0 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/150.1 MiB (9.6 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/150.1 MiB (11.1 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/150.1 MiB (12.3 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/150.1 MiB (13.8 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/150.1 MiB (15.2 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/150.1 MiB (16.6 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/150.1 MiB (18.0 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/150.1 MiB (19.5 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/150.1 MiB (20.9 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/150.1 MiB (22.3 MiB/s) with 1 file(s) remain

Completed 51.5 MiB/150.1 MiB (147.2 MiB/s) with 1 file(s) remainingCompleted 51.8 MiB/150.1 MiB (147.8 MiB/s) with 1 file(s) remainingCompleted 52.0 MiB/150.1 MiB (148.3 MiB/s) with 1 file(s) remainingCompleted 52.2 MiB/150.1 MiB (148.5 MiB/s) with 1 file(s) remainingCompleted 52.5 MiB/150.1 MiB (148.3 MiB/s) with 1 file(s) remainingCompleted 52.8 MiB/150.1 MiB (148.9 MiB/s) with 1 file(s) remainingCompleted 53.0 MiB/150.1 MiB (149.0 MiB/s) with 1 file(s) remainingCompleted 53.2 MiB/150.1 MiB (149.3 MiB/s) with 1 file(s) remainingCompleted 53.5 MiB/150.1 MiB (149.5 MiB/s) with 1 file(s) remainingCompleted 53.8 MiB/150.1 MiB (149.9 MiB/s) with 1 file(s) remainingCompleted 54.0 MiB/150.1 MiB (150.5 MiB/s) with 1 file(s) remainingCompleted 54.2 MiB/150.1 MiB (151.0 MiB/s) with 1 file(s) remainingCompleted 54.5 MiB/150.1 MiB (151.3 MiB/s) with 1 file(s) remainingCompleted 54.8 MiB/150.1 MiB (151.8 MiB/s) with 1 file(s) remainingCompleted 55.0 MiB/150.1 MiB (152.0 MiB/s) with 

Completed 96.2 MiB/150.1 MiB (174.6 MiB/s) with 1 file(s) remainingCompleted 96.5 MiB/150.1 MiB (174.7 MiB/s) with 1 file(s) remainingCompleted 96.8 MiB/150.1 MiB (174.8 MiB/s) with 1 file(s) remainingCompleted 97.0 MiB/150.1 MiB (174.6 MiB/s) with 1 file(s) remainingCompleted 97.2 MiB/150.1 MiB (174.5 MiB/s) with 1 file(s) remainingCompleted 97.5 MiB/150.1 MiB (172.3 MiB/s) with 1 file(s) remainingCompleted 97.8 MiB/150.1 MiB (172.0 MiB/s) with 1 file(s) remainingCompleted 98.0 MiB/150.1 MiB (172.3 MiB/s) with 1 file(s) remainingCompleted 98.2 MiB/150.1 MiB (171.8 MiB/s) with 1 file(s) remainingCompleted 98.5 MiB/150.1 MiB (172.0 MiB/s) with 1 file(s) remainingCompleted 98.8 MiB/150.1 MiB (171.9 MiB/s) with 1 file(s) remainingCompleted 99.0 MiB/150.1 MiB (172.2 MiB/s) with 1 file(s) remainingCompleted 99.2 MiB/150.1 MiB (172.3 MiB/s) with 1 file(s) remainingCompleted 99.5 MiB/150.1 MiB (172.5 MiB/s) with 1 file(s) remainingCompleted 99.8 MiB/150.1 MiB (172.8 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_466.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_466/001_data_files/2023_02_16_22_supp_data_466.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_467.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_467.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_467/001_data_files/2023_02_16_22_supp_data_467.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_468.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_468.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_468/001_data_files/2023_02_16_22_supp_data_468.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2

Completed 256.0 KiB/137.1 MiB (1.8 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/137.1 MiB (3.4 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/137.1 MiB (4.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/137.1 MiB (6.4 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/137.1 MiB (7.8 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/137.1 MiB (9.1 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/137.1 MiB (10.5 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/137.1 MiB (12.0 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/137.1 MiB (13.4 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/137.1 MiB (14.8 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/137.1 MiB (16.2 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/137.1 MiB (17.6 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/137.1 MiB (19.0 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/137.1 MiB (20.2 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/137.1 MiB (21.6 MiB/s) with 1 file(s) remain

Completed 46.5 MiB/137.1 MiB (135.9 MiB/s) with 1 file(s) remainingCompleted 46.8 MiB/137.1 MiB (136.5 MiB/s) with 1 file(s) remainingCompleted 47.0 MiB/137.1 MiB (137.0 MiB/s) with 1 file(s) remainingCompleted 47.2 MiB/137.1 MiB (137.6 MiB/s) with 1 file(s) remainingCompleted 47.5 MiB/137.1 MiB (138.1 MiB/s) with 1 file(s) remainingCompleted 47.8 MiB/137.1 MiB (138.4 MiB/s) with 1 file(s) remainingCompleted 48.0 MiB/137.1 MiB (138.8 MiB/s) with 1 file(s) remainingCompleted 48.2 MiB/137.1 MiB (138.9 MiB/s) with 1 file(s) remainingCompleted 48.5 MiB/137.1 MiB (138.4 MiB/s) with 1 file(s) remainingCompleted 48.8 MiB/137.1 MiB (138.7 MiB/s) with 1 file(s) remainingCompleted 49.0 MiB/137.1 MiB (139.4 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/137.1 MiB (139.8 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/137.1 MiB (140.3 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/137.1 MiB (140.9 MiB/s) with 1 file(s) remainingCompleted 50.0 MiB/137.1 MiB (141.5 MiB/s) with 

Completed 83.8 MiB/137.1 MiB (151.4 MiB/s) with 1 file(s) remainingCompleted 84.0 MiB/137.1 MiB (151.6 MiB/s) with 1 file(s) remainingCompleted 84.2 MiB/137.1 MiB (152.0 MiB/s) with 1 file(s) remainingCompleted 84.5 MiB/137.1 MiB (152.1 MiB/s) with 1 file(s) remainingCompleted 84.8 MiB/137.1 MiB (151.2 MiB/s) with 1 file(s) remainingCompleted 85.0 MiB/137.1 MiB (151.1 MiB/s) with 1 file(s) remainingCompleted 85.2 MiB/137.1 MiB (151.3 MiB/s) with 1 file(s) remainingCompleted 85.5 MiB/137.1 MiB (151.3 MiB/s) with 1 file(s) remainingCompleted 85.8 MiB/137.1 MiB (151.8 MiB/s) with 1 file(s) remainingCompleted 86.0 MiB/137.1 MiB (152.0 MiB/s) with 1 file(s) remainingCompleted 86.2 MiB/137.1 MiB (151.9 MiB/s) with 1 file(s) remainingCompleted 86.5 MiB/137.1 MiB (152.4 MiB/s) with 1 file(s) remainingCompleted 86.8 MiB/137.1 MiB (152.7 MiB/s) with 1 file(s) remainingCompleted 87.0 MiB/137.1 MiB (152.4 MiB/s) with 1 file(s) remainingCompleted 87.2 MiB/137.1 MiB (152.7 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_488.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_488/001_data_files/2023_02_16_22_supp_data_488.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_489.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_489.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_489/001_data_files/2023_02_16_22_supp_data_489.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_22_supp_data_49.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_22_supp_data_49.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_49/001_data_files/2023_02_16_22_supp_data_49.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_

Completed 256.0 KiB/131.8 MiB (2.2 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/131.8 MiB (4.4 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/131.8 MiB (6.3 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/131.8 MiB (8.3 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/131.8 MiB (10.2 MiB/s) with 1 file(s) remaining Completed 1.5 MiB/131.8 MiB (12.2 MiB/s) with 1 file(s) remaining Completed 1.8 MiB/131.8 MiB (14.1 MiB/s) with 1 file(s) remaining Completed 2.0 MiB/131.8 MiB (15.9 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/131.8 MiB (17.4 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/131.8 MiB (19.1 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/131.8 MiB (20.8 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/131.8 MiB (22.5 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/131.8 MiB (24.0 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/131.8 MiB (25.7 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/131.8 MiB (27.3 MiB/s) with 1 file(s) remain

Completed 46.2 MiB/131.8 MiB (146.8 MiB/s) with 1 file(s) remainingCompleted 46.5 MiB/131.8 MiB (147.5 MiB/s) with 1 file(s) remainingCompleted 46.8 MiB/131.8 MiB (148.2 MiB/s) with 1 file(s) remainingCompleted 47.0 MiB/131.8 MiB (148.7 MiB/s) with 1 file(s) remainingCompleted 47.2 MiB/131.8 MiB (149.1 MiB/s) with 1 file(s) remainingCompleted 47.5 MiB/131.8 MiB (149.3 MiB/s) with 1 file(s) remainingCompleted 47.8 MiB/131.8 MiB (149.8 MiB/s) with 1 file(s) remainingCompleted 48.0 MiB/131.8 MiB (149.8 MiB/s) with 1 file(s) remainingCompleted 48.2 MiB/131.8 MiB (150.2 MiB/s) with 1 file(s) remainingCompleted 48.5 MiB/131.8 MiB (149.4 MiB/s) with 1 file(s) remainingCompleted 48.8 MiB/131.8 MiB (150.0 MiB/s) with 1 file(s) remainingCompleted 49.0 MiB/131.8 MiB (150.6 MiB/s) with 1 file(s) remainingCompleted 49.2 MiB/131.8 MiB (151.1 MiB/s) with 1 file(s) remainingCompleted 49.5 MiB/131.8 MiB (151.5 MiB/s) with 1 file(s) remainingCompleted 49.8 MiB/131.8 MiB (152.2 MiB/s) with 

Completed 88.2 MiB/131.8 MiB (153.3 MiB/s) with 1 file(s) remainingCompleted 88.5 MiB/131.8 MiB (151.5 MiB/s) with 1 file(s) remainingCompleted 88.8 MiB/131.8 MiB (151.7 MiB/s) with 1 file(s) remainingCompleted 89.0 MiB/131.8 MiB (152.0 MiB/s) with 1 file(s) remainingCompleted 89.2 MiB/131.8 MiB (152.2 MiB/s) with 1 file(s) remainingCompleted 89.5 MiB/131.8 MiB (152.3 MiB/s) with 1 file(s) remainingCompleted 89.8 MiB/131.8 MiB (152.4 MiB/s) with 1 file(s) remainingCompleted 90.0 MiB/131.8 MiB (152.7 MiB/s) with 1 file(s) remainingCompleted 90.2 MiB/131.8 MiB (153.0 MiB/s) with 1 file(s) remainingCompleted 90.5 MiB/131.8 MiB (153.1 MiB/s) with 1 file(s) remainingCompleted 90.8 MiB/131.8 MiB (153.3 MiB/s) with 1 file(s) remainingCompleted 91.0 MiB/131.8 MiB (153.5 MiB/s) with 1 file(s) remainingCompleted 91.2 MiB/131.8 MiB (153.5 MiB/s) with 1 file(s) remainingCompleted 91.5 MiB/131.8 MiB (153.8 MiB/s) with 1 file(s) remainingCompleted 91.8 MiB/131.8 MiB (153.7 MiB/s) with 

upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_59.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_59/001_data_files/2023_02_16_23_supp_data_59.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_supp_data_6.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_6.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_6/001_data_files/2023_02_16_23_supp_data_6.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_supp_data_60.parquet
upload: all_data_supp_data_live_deployment/2023_02_16_23_supp_data_60.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_60/001_data_files/2023_02_16_23_supp_data_60.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_supp_data_live_deployment/2023_02_16_23_su

In [11]:
local_filepaths = glob.glob(f"/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/*")

In [12]:
local_filenames = [x.split("/")[-1] for x in local_filepaths]
file_int = [int(x.split("_")[-1].split(".")[0]) for x in local_filenames]

In [13]:
local_filepaths[:2]

['/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_124.parquet',
 '/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_169.parquet']

In [14]:
file_int[:2]

[124, 169]

In [15]:
for local_file, file_i in zip(local_filepaths, file_int):
    print(local_file)
    bucket_name = "author-name-disambiguation"
    prefix = f"V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_{node_mapping_dict[file_i]['node']}/"
    os.system(f"aws s3 cp {local_file} s3://{bucket_name}/{prefix}partition_{file_i}/002_cluster_files/ ")

/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_124.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_124.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_124/002_cluster_files/final_clustering_data_124.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_169.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_169.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_169/002_cluster_files/final_clustering_data_169.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_127.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_127.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_127/002_cluster_files/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_106.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_106/002_cluster_files/final_clustering_data_106.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_111.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_111.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_111/002_cluster_files/final_clustering_data_111.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_141.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_141.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_141/002_cluster_files/final_clustering_data_141.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_118.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_118/002_cluster_files/final_clustering_data_118.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_16.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_16.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_16/002_cluster_files/final_clustering_data_16.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_119.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_119.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_119/002_cluster_files/final_clustering_data_119.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_13

upload: all_data_new_clusters_orcid_lops/final_clustering_data_161.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_161/002_cluster_files/final_clustering_data_161.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_158.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_158.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_158/002_cluster_files/final_clustering_data_158.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_233.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_233.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_233/002_cluster_files/final_clustering_data_233.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_176.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_176/002_cluster_files/final_clustering_data_176.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_175.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_175.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_175/002_cluster_files/final_clustering_data_175.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_178.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_178.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_178/002_cluster_files/final_clustering_data_178.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_24.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_24/002_cluster_files/final_clustering_data_24.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_202.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_202.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_202/002_cluster_files/final_clustering_data_202.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_204.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_204.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_204/002_cluster_files/final_clustering_data_204.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_2

upload: all_data_new_clusters_orcid_lops/final_clustering_data_242.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_242/002_cluster_files/final_clustering_data_242.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_207.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_207.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_207/002_cluster_files/final_clustering_data_207.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_210.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_210.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_210/002_cluster_files/final_clustering_data_210.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_3.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_3/002_cluster_files/final_clustering_data_3.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_244.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_244.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_244/002_cluster_files/final_clustering_data_244.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_277.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_277.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_277/002_cluster_files/final_clustering_data_277.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_288.

upload: all_data_new_clusters_orcid_lops/final_clustering_data_258.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_258/002_cluster_files/final_clustering_data_258.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_255.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_255.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_255/002_cluster_files/final_clustering_data_255.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_257.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_257.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_257/002_cluster_files/final_clustering_data_257.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_27.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_27/002_cluster_files/final_clustering_data_27.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_268.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_268.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_268/002_cluster_files/final_clustering_data_268.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_283.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_283.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_283/002_cluster_files/final_clustering_data_283.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_2

upload: all_data_new_clusters_orcid_lops/final_clustering_data_312.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_312/002_cluster_files/final_clustering_data_312.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_314.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_314.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_314/002_cluster_files/final_clustering_data_314.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_313.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_313.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_313/002_cluster_files/final_clustering_data_313.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_325.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_325/002_cluster_files/final_clustering_data_325.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_326.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_326.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_326/002_cluster_files/final_clustering_data_326.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_322.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_322.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_322/002_cluster_files/final_clustering_data_322.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_35.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_35/002_cluster_files/final_clustering_data_35.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_356.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_356.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_356/002_cluster_files/final_clustering_data_356.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_337.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_337.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_337/002_cluster_files/final_clustering_data_337.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_3

upload: all_data_new_clusters_orcid_lops/final_clustering_data_383.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_383/002_cluster_files/final_clustering_data_383.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_415.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_415.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_415/002_cluster_files/final_clustering_data_415.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_419.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_419.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_419/002_cluster_files/final_clustering_data_419.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_418.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_418/002_cluster_files/final_clustering_data_418.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_423.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_423.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_423/002_cluster_files/final_clustering_data_423.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_393.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_393.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_393/002_cluster_files/final_clustering_data_393.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_405.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_405/002_cluster_files/final_clustering_data_405.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_439.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_439.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_439/002_cluster_files/final_clustering_data_439.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_413.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_413.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_413/002_cluster_files/final_clustering_data_413.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_45.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_45/002_cluster_files/final_clustering_data_45.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_451.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_451.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_451/002_cluster_files/final_clustering_data_451.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_452.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_452.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_452/002_cluster_files/final_clustering_data_452.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_4

upload: all_data_new_clusters_orcid_lops/final_clustering_data_495.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_495/002_cluster_files/final_clustering_data_495.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_462.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_462.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_462/002_cluster_files/final_clustering_data_462.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_465.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_465.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_465/002_cluster_files/final_clustering_data_465.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_dat

upload: all_data_new_clusters_orcid_lops/final_clustering_data_48.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_48/002_cluster_files/final_clustering_data_48.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_481.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_481.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_481/002_cluster_files/final_clustering_data_481.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_499.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_499.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_3/partition_499/002_cluster_files/final_clustering_data_499.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_4

upload: all_data_new_clusters_orcid_lops/final_clustering_data_97.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_97/002_cluster_files/final_clustering_data_97.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_74.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_74.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_74/002_cluster_files/final_clustering_data_74.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_76.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_76.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_76/002_cluster_files/final_clustering_data_76.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_75.parque

upload: all_data_new_clusters_orcid_lops/final_clustering_data_99.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_2/partition_99/002_cluster_files/final_clustering_data_99.parquet
/home/ec2-user/WorkFolder/author_disambiguation/all_data_new_clusters_orcid_lops/final_clustering_data_95.parquet
upload: all_data_new_clusters_orcid_lops/final_clustering_data_95.parquet to s3://author-name-disambiguation/V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_95/002_cluster_files/final_clustering_data_95.parquet


### Rename Files

In [16]:
s3_client = boto3.client("s3")
bucket_name = "author-name-disambiguation"
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix="V1/data/000_SUPPORT_FILES/000_NODE_DATA/",
                              PaginationConfig={"PageSize": 50})
data_filenames = []

for page in response:
    files = page.get("Contents")
    for file in files:
        if 'final_clustering_data' in file['Key']:
            data_filenames.append(file['Key'])

In [17]:
len(data_filenames)

501

In [18]:
data_filenames[0]

'V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_0/002_cluster_files/final_clustering_data_0.parquet'

In [19]:
old_file_keys = [x for x in data_filenames]
new_file_keys = [f"{'/'.join(x.split('/')[:-1])}/2023_02_16_20_init_clusters.parquet" for x in data_filenames]

In [20]:
old_file_keys[0]

'V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_0/002_cluster_files/final_clustering_data_0.parquet'

In [21]:
new_file_keys[0]

'V1/data/000_SUPPORT_FILES/000_NODE_DATA/NODE_1/partition_0/002_cluster_files/2023_02_16_20_init_clusters.parquet'

In [23]:
s3 = boto3.resource('s3')
for new_file_key, old_file_key in zip(new_file_keys, old_file_keys):
#     s3.Object(bucket_name, new_file_key).copy_from(CopySource=f'{bucket_name}/{old_file_key}')
    s3.Object(bucket_name, old_file_key).delete()