In [1]:
import pandas as pd
import numpy as np
import json
import os
import boto3
import glob
import sys
import time
import random
import logging
from unidecode import unidecode
from datetime import datetime
from nameparser import HumanName

In [2]:
# Logging setup: every record is appended to log_file.log (relative to the working directory)
# as comma-separated fields: timestamp, source location, level, module and function.
# NOTE(review): the trailing "Z" in the timestamp suggests UTC, but nothing here switches the
# formatter away from its default time source -- confirm the host clock is UTC.
logging.basicConfig(
    filename='log_file.log',
    format='%(asctime)s.%(msecs)03dZ,%(pathname)s:%(lineno)d,%(levelname)s,%(module)s,%(funcName)s: %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Logger used by every function in this notebook; DEBUG (numeric value 10) is the threshold.
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.DEBUG)

### Get input data

In [3]:
def get_new_data_paths_s3(bucket_name, data_prefix):
    """Collect the keys of all ``.parquet`` objects stored under a prefix of an S3 bucket.

    Args:
        bucket_name: Name of the S3 bucket to list.
        data_prefix: Key prefix used to filter the listing.

    Returns:
        list: Object keys ending in ``.parquet``, in the order the listing yields them.
        Each matching key is also written to the log.
    """
    bucket = boto3.resource('s3').Bucket(bucket_name)
    objects_under_prefix = bucket.objects.filter(Prefix=data_prefix).all()

    parquet_keys = []
    for s3_object in objects_under_prefix:
        object_key = s3_object.key
        if not object_key.endswith('.parquet'):
            continue
        _logger.info(f"Found new data file: {object_key}")
        parquet_keys.append(object_key)
    return parquet_keys

####### ALL ABOVE IS DONE (except for variables and paths)

In [4]:
def get_new_data_file(bucket_name, data_prefix):
    """Load the first ``.parquet`` file found under an S3 prefix.

    Args:
        bucket_name: Name of the S3 bucket holding the new data.
        data_prefix: Key prefix to search for ``.parquet`` objects.

    Returns:
        tuple: ``(DataFrame, key)`` where ``key`` is the S3 object key that was read.

    Raises:
        IndexError: If no ``.parquet`` object exists under the prefix. The exception
            type is the one the bare ``[0]`` lookup used to raise; it now carries a
            message naming the location that was searched.
    """
    new_file_keys = get_new_data_paths_s3(bucket_name, data_prefix)
    if not new_file_keys:
        raise IndexError(f"No .parquet files found under s3://{bucket_name}/{data_prefix}")

    # Only the first listed file is processed per call.
    new_file_key = new_file_keys[0]
    new_file = pd.read_parquet(f"s3://{bucket_name}/{new_file_key}")
    return new_file, new_file_key

####### ALL ABOVE IS DONE (except for variables and paths)

In [5]:
def check_latin_name(text):
    """Report whether the string form of ``text`` can be encoded as Latin-1.

    Args:
        text: Any value; it is converted with ``str()`` before the check.

    Returns:
        bool: ``True`` when every character fits in Latin-1, ``False`` otherwise.
    """
    try:
        str(text).encode('latin-1')
    except UnicodeEncodeError:
        # Only an encoding failure means "not Latin"; unrelated errors are no longer
        # swallowed and misreported as a non-Latin name.
        return False
    return True

####### ALL ABOVE IS DONE (except for variables and paths)

In [6]:
def human_name(name_text):
    """Split a raw author name into ASCII first / middle-1 / middle-2 / last parts.

    Dots and hyphens are turned into spaces, runs of spaces are collapsed (three
    passes), and the result is parsed with ``HumanName``. The first word of the
    parsed middle name becomes middle-1 and any remaining words become middle-2.
    An all-uppercase first name of two or three letters with no middle name is
    treated as a run of initials and spread over first / middle-1 / middle-2.

    Args:
        name_text: The author name as a string.

    Returns:
        list: ``[first, middle_1, middle_2, last]``, each passed through ``unidecode``.
    """
    def _drop_punctuation(text):
        # Remove any dot or hyphen characters from the text.
        return "".join(ch for ch in text if ch not in (".", "-"))

    cleaned = name_text.strip()
    for separator in (".", "-"):
        cleaned = cleaned.replace(separator, " ")
    for _ in range(3):
        cleaned = cleaned.replace("  ", " ")

    parsed = HumanName(cleaned)
    first_name = _drop_punctuation(parsed.first)
    last_name = parsed.last

    # First word of the middle name is middle-1; everything after it is middle-2.
    middle_parts = parsed.middle.strip().split(" ")
    middle_name_1 = _drop_punctuation(middle_parts[0].strip())
    middle_name_2 = _drop_punctuation(" ".join(middle_parts[1:]).strip())

    # Uppercase 2- or 3-letter "first names" with no middle name are read as initials.
    if first_name.isupper() and not middle_name_1:
        if len(first_name) == 3 and not middle_name_2:
            first_name, middle_name_1, middle_name_2 = first_name
        elif len(first_name) == 2:
            first_name, middle_name_1 = first_name

    name_parts = (first_name, middle_name_1, middle_name_2, last_name)
    return [unidecode(part) for part in name_parts]

####### ALL ABOVE IS DONE (except for variables and paths)

In [7]:
def create_author_name_match(author_names):
    """Build the name/initial feature lists used to compare author name variants.

    Each usable name (a non-empty string that is Latin-1 encodable and has more than
    one space-separated word) is parsed with ``human_name``. Names lacking a first or
    last part contribute nothing. Otherwise the first name is recorded, then middle-1
    if present, then middle-2 if middle-1 was present as well. A part longer than one
    character is stored both as a full name and as its initial; a single-character
    part is stored as an initial only. Everything is lower-cased.

    Args:
        author_names: A single name string, a list of names, or any object exposing
            ``tolist()`` (for example a pandas Series).

    Returns:
        list: Six de-duplicated lists in the order
        ``[first_names, first_initials, middle_1_names, middle_1_initials,
        middle_2_names, middle_2_initials]``. Element order inside each list is
        arbitrary because duplicates are removed through a ``set``.
    """
    if not isinstance(author_names, list):
        if isinstance(author_names, str):
            author_names = [author_names]
        else:
            author_names = author_names.tolist()

    first_names, first_initials = [], []
    middle_1_names, middle_1_initials = [], []
    middle_2_names, middle_2_initials = [], []

    for author_name in author_names:
        # Skip missing values. Non-string entries (e.g. a float NaN) used to crash on
        # ``.split``; they carry no name information, so they are skipped as well.
        if not isinstance(author_name, str) or not author_name:
            continue
        if not (check_latin_name(author_name) and len(author_name.split(" ")) > 1):
            continue

        first, middle_1, middle_2, last = human_name(author_name)
        if not (first and last):
            continue

        _append_name_and_initial(first, first_names, first_initials)
        if middle_1:
            _append_name_and_initial(middle_1, middle_1_names, middle_1_initials)
            # middle-2 only counts when middle-1 exists.
            if middle_2:
                _append_name_and_initial(middle_2, middle_2_names, middle_2_initials)

    return [list(set(first_names)), list(set(first_initials)), list(set(middle_1_names)),
            list(set(middle_1_initials)), list(set(middle_2_names)), list(set(middle_2_initials))]


def _append_name_and_initial(part, names, initials):
    # Record a non-empty name part: full form only if longer than one character, initial always.
    lowered = str(part).lower()
    if len(lowered) > 1:
        names.append(lowered)
    initials.append(lowered[0])

####### ALL ABOVE IS DONE (except for variables and paths)

In [8]:
def match_block_names(block_1_names, block_1_initials, block_2_names, block_2_initials):
    """Decide whether one name slot (first, middle-1 or middle-2) agrees between two blocks.

    Rules, in order:
      * both sides have full names: the full names must be the same;
      * only one side has full names: if the side without full names has initials,
        the initials must be the same, otherwise there is nothing to contradict;
      * neither side has full names: if both have initials they must be the same;
      * anything else is treated as compatible.

    Collections are compared as sets. The inputs are produced with ``list(set(...))``,
    whose element order is arbitrary, so an order-sensitive list comparison could
    reject two blocks holding exactly the same names.

    Args:
        block_1_names: Full names for the slot in block 1.
        block_1_initials: Initials for the slot in block 1.
        block_2_names: Full names for the slot in block 2.
        block_2_initials: Initials for the slot in block 2.

    Returns:
        bool: ``True`` when the slot does not rule out a match.
    """
    def _same_members(left, right):
        # Order-insensitive equality of two collections of strings.
        return set(left) == set(right)

    if block_1_names and block_2_names:
        return _same_members(block_1_names, block_2_names)

    if block_1_names or block_2_names:
        # Exactly one side has full names; look at the initials of the other side.
        initials_of_side_without_names = block_2_initials if block_1_names else block_1_initials
        if not initials_of_side_without_names:
            return True
        return _same_members(block_1_initials, block_2_initials)

    if block_1_initials and block_2_initials:
        return _same_members(block_1_initials, block_2_initials)

    return True
    

####### ALL ABOVE IS DONE (except for variables and paths)

In [9]:
def check_block_vs_block(block_1_names_list, block_2_names_list):
    """Compare two six-element name feature lists slot by slot.

    Positions 0/1 hold first names and initials, 2/3 hold middle-1 names and
    initials, 4/5 hold middle-2 names and initials. The slots are checked in that
    order with ``match_block_names`` and the comparison stops at the first slot
    that fails.

    Args:
        block_1_names_list: Feature list for the first block.
        block_2_names_list: Feature list for the second block.

    Returns:
        bool: ``True`` only when all three slots match.
    """
    for names_idx in (0, 2, 4):
        initials_idx = names_idx + 1
        slot_matches = match_block_names(block_1_names_list[names_idx],
                                         block_1_names_list[initials_idx],
                                         block_2_names_list[names_idx],
                                         block_2_names_list[initials_idx])
        if not slot_matches:
            return False
    return True
    

####### ALL ABOVE IS DONE (except for variables and paths)

In [10]:
def compile_cluster_coauthors(list_of_coauthors):
    """Normalise the coauthor names found in the first entry of a coauthor collection.

    The input is converted to a list (via ``tolist()`` when it is not one already),
    then its FIRST element is converted with ``tolist()`` and used as the coauthor
    collection. Names are lower-cased, dots are removed, duplicates are dropped and
    names of five characters or fewer are discarded.

    NOTE(review): only element ``[0]`` is used, so when several rows are aggregated
    only the first row's coauthors contribute -- confirm this is intended.

    Args:
        list_of_coauthors: A list or an object exposing ``tolist()`` whose first
            element also exposes ``tolist()`` (for example a Series of arrays).

    Returns:
        list: Normalised coauthor names in arbitrary order, or ``[]`` whenever the
        input cannot be unpacked as described or contains no usable names.
    """
    if not isinstance(list_of_coauthors, list):
        try:
            list_of_coauthors = list_of_coauthors.tolist()
        except Exception:
            # Best effort: anything that cannot be converted yields no coauthors.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            return []

    if isinstance(list_of_coauthors, list):
        try:
            list_of_coauthors = list_of_coauthors[0].tolist()
        except Exception:
            return []

    if not list_of_coauthors:
        return []

    non_empty_entries = [x for x in list_of_coauthors if x]
    if not non_empty_entries:
        return []

    # Entries are either plain names or nested lists of names; flatten the latter.
    if isinstance(non_empty_entries[0], list):
        raw_names = [x for y in non_empty_entries for x in y]
    else:
        raw_names = non_empty_entries

    normalised_names = list(set([x.lower().replace('.', '') for x in raw_names]))
    return [x for x in normalised_names if len(x) > 5]
    
####### ALL ABOVE IS DONE (except for variables and paths)

In [11]:
def get_orcid_id_from_set(orcid_set):
    """Return the first ORCID in the collection that is not the ``'NONE'`` placeholder.

    Args:
        orcid_set: Iterable of ORCID strings, possibly containing ``'NONE'``.

    Returns:
        The first non-placeholder value in iteration order, or ``'NONE'`` when there
        is none.
    """
    for candidate in list(orcid_set):
        if candidate != 'NONE':
            return candidate
    return 'NONE'

In [12]:
def score_row(coauthors):
    """Score a row by how many coauthors it lists (used to keep the richest duplicate).

    Besides ``list``, NumPy arrays and tuples are counted. Other functions in this
    notebook call ``.tolist()`` on coauthor values, i.e. they arrive as array-like
    objects, and a ``list``-only check scored every such row as 0.

    Args:
        coauthors: The row's coauthor collection, or any other value when missing.

    Returns:
        int: Number of coauthors, or 0 when the value is not a list/tuple/array.
    """
    if isinstance(coauthors, np.ndarray):
        return int(coauthors.size)
    if isinstance(coauthors, (list, tuple)):
        return len(coauthors)
    return 0

In [13]:
def new_data_and_transformed(bucket_name, new_data_prefix, block_id_partition_file, node_id):
    """Load the next new-data file and attach each row's partition.

    Args:
        bucket_name: S3 bucket holding the new data.
        new_data_prefix: Key prefix under which the new data file is looked up.
        block_id_partition_file: Path of the parquet file mapping ``block_id`` to ``partition``.
        node_id: Node identifier; only node ``"1"`` assigns unmapped block ids to
            partition 501.

    Returns:
        tuple: ``(data, block_partition_list, key)`` where ``data`` is the new data
        with a ``partition`` column, ``block_partition_list`` is a list of
        ``[partition, [block_id, ...]]`` pairs and ``key`` is the S3 key that was read.
    """
    # New data comes from the IN PROGRESS folder on S3.
    incoming_df, incoming_key = get_new_data_file(bucket_name, new_data_prefix)

    # Left join keeps rows whose block_id has no partition yet.
    partition_lookup = pd.read_parquet(block_id_partition_file)
    incoming_df = incoming_df.merge(partition_lookup, how='left', on='block_id').copy()

    # Only the node that takes in new block_ids sends unmapped rows to partition 501.
    if node_id == "1":
        filled_partitions = incoming_df['partition'].fillna(501)
        incoming_df['partition'] = filled_partitions.astype('int')

    # One entry per partition with the distinct block ids it received.
    blocks_by_partition = incoming_df.groupby('partition')['block_id'].apply(set).reset_index()
    partition_ids = blocks_by_partition['partition'].tolist()
    block_id_sets = blocks_by_partition['block_id'].tolist()
    block_partition_list = [[pid, list(ids)] for pid, ids in zip(partition_ids, block_id_sets)]

    return incoming_df, block_partition_list, incoming_key

####### ALL ABOVE IS DONE (except for variables and paths)

In [14]:
def get_all_data_from_files(all_files, file_type='data'):
    """Read a set of parquet files into one DataFrame tagged with a ``file_date`` column.

    For ``file_type`` ``'data'`` or ``'new_data'`` the frames are simply stacked.
    For any other value the files are treated as cluster files: a
    ``final_clust_num`` column (``"{block_id}_{final_clust}"``) is added and only
    the row with the greatest ``file_date`` is kept per ``(block_id, data_id)``.

    Args:
        all_files: Iterable of parquet file paths using ``/`` as separator.
        file_type: ``'data'``, ``'new_data'`` or anything else for cluster files.

    Returns:
        DataFrame: The combined rows; an empty DataFrame when ``all_files`` is empty
        (an empty cluster file list used to fail on the sort).
    """
    is_data_file = file_type in ['data', 'new_data']

    frames = []
    for file_name in all_files:
        temp_df = pd.read_parquet(file_name)
        base_name = file_name.split("/")[-1]
        if is_data_file:
            # NOTE(review): this strips 5 characters while the cluster branch strips 8;
            # the value is only used as a sort key here -- confirm that is intended.
            temp_df['file_date'] = base_name[:-5]
        else:
            temp_df['file_date'] = base_name[:-8]
            temp_df['final_clust_num'] = temp_df.apply(lambda x: f"{x.block_id}_{x.final_clust}", axis=1)
        frames.append(temp_df)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    df = pd.concat(frames, axis=0)
    if not is_data_file:
        df = df.sort_values('file_date', ascending=False).drop_duplicates(subset=['block_id','data_id']).copy()
    return df

In [15]:
def get_all_filenames(local_data_path, partition_id, sub_directory='001_data_files'):
    """List every file in one sub-directory of a partition folder, sorted by path.

    ``glob`` returns matches in arbitrary order, while ``get_merged_files_for_clustering``
    treats the LAST entry as the newest data file, so the result is sorted. File names
    written by this notebook start with a zero-padded ``%Y_%m_%d_%H`` timestamp, which
    makes lexicographic order chronological.
    NOTE(review): confirm files created by other processes follow the same naming.

    Args:
        local_data_path: Root folder containing the ``partition_<id>`` folders.
        partition_id: Partition identifier used in the folder name.
        sub_directory: Sub-folder of the partition to list.

    Returns:
        list: Sorted paths of all entries in the folder (empty if it does not exist).
    """
    return sorted(glob.glob(f"{local_data_path}/partition_{partition_id}/{sub_directory}/*"))

In [16]:
def get_data_file_and_dedup(data_files, file_type='data'):
    """Load data files and keep a single row per ``data_id``.

    Rows are ranked by ``file_date`` and then by ``row_score`` (both descending) and
    the top-ranked row of each ``data_id`` is kept. For ``file_type='new_data'`` the
    columns are first renamed positionally to the eight expected names.

    Args:
        data_files: Paths of the parquet data files to read.
        file_type: ``'new_data'`` triggers the column rename; any other value does not.

    Returns:
        DataFrame: De-duplicated rows sorted by ``block_id``, with a ``row_score`` column.
    """
    loaded = get_all_data_from_files(data_files, file_type)
    if file_type == 'new_data':
        loaded.columns = ['block_id', 'data_id', 'orcid', 'author', 'coauthors','partition', 'count_col', 'file_date']

    # The score reflects how much coauthor data a row carries.
    loaded['row_score'] = loaded.apply(lambda row: score_row(row.coauthors), axis=1)

    ranked = loaded.sort_values(['file_date','row_score'], ascending=False)
    one_row_per_data_id = ranked.drop_duplicates(subset=['data_id']).reset_index(drop=True)
    return one_row_per_data_id.sort_values(['block_id']).copy()

In [17]:
def get_merged_files_for_clustering(data_files, cluster_files, block_ids):
    """Prepare the previously clustered rows and the new rows for a partition.

    All data files except the last are treated as already-clustered data and joined
    to the cluster assignments on ``data_id``; the last data file is treated as the
    new data. Missing ORCIDs are replaced by ``'NONE'`` in both results.

    Args:
        data_files: Data file paths; the final element is the new data file.
        cluster_files: Cluster file paths for the partition.
        block_ids: Block ids whose existing rows should be returned.

    Returns:
        tuple: ``(blocks_to_check, new_data)`` -- existing clustered rows restricted to
        ``block_ids``, and the de-duplicated new rows.
    """
    previous_files, latest_file = data_files[:-1], data_files[-1:]

    historic_rows = get_data_file_and_dedup(previous_files)
    cluster_assignments = get_all_data_from_files(cluster_files, file_type='cluster')

    # block_id is taken from the cluster files, so drop it from the data side before joining.
    clustered_rows = historic_rows.drop('block_id', axis=1).merge(cluster_assignments, how='inner', on='data_id')
    clustered_rows['orcid'] = clustered_rows['orcid'].fillna("NONE")

    in_requested_blocks = clustered_rows['block_id'].isin(block_ids)
    blocks_to_check = clustered_rows[in_requested_blocks].copy()

    new_data = get_data_file_and_dedup(latest_file, 'new_data')
    new_data['orcid'] = new_data['orcid'].fillna('NONE')
    return blocks_to_check, new_data

In [18]:
def assign_new_data_to_clusters(old_df, to_cluster, block_id, date_str):
    """Assign the new rows of one block to existing clusters, or create new clusters.

    When the block already has clustered rows, new rows go through up to three
    matching stages, each handling only what the previous stage left unassigned:
      1. exact ORCID match against a cluster's ORCID;
      2. ``author_coauthor_match`` (name, shared coauthor and ORCID compatibility);
      3. ``merge_data_into_lop_cluster``, only if ``check_for_lop_cluster`` is true.
    Rows still unassigned become single-row clusters whose id ends in ``"SC"``.
    When the block has no clustered rows, every new row gets its own cluster whose
    id ends in ``"NCD"``.

    Args:
        old_df: Previously clustered rows; reads ``block_id``, ``data_id``,
            ``final_clust_num``, ``author``, ``orcid`` and ``coauthors``.
        to_cluster: New rows; reads ``block_id``, ``data_id``, ``author``,
            ``orcid`` and ``coauthors``.
        block_id: The block to process.
        date_str: Run identifier embedded in newly created cluster ids.

    Returns:
        DataFrame with columns ``block_id``, ``data_id`` and ``final_clust``. In the
        matching path it also contains the existing rows of every cluster that
        received new data.
    """
    # filter by block_id
    df = old_df[old_df['block_id']==block_id].copy()
    new_df = to_cluster[to_cluster['block_id']==block_id].copy()
    
    if df.shape[0] > 0:
    
        # making sure there are two columns of author text for the groupby
        # (one is aggregated as raw names, the other into name-match features)
        df['author_full_text'] = df['author']
        new_df['author_full_text'] = new_df['author']

        # columns to keep for cluster data
        cluster_cols = ['block_id','data_id','final_clust_num']

        # get features for new data
        new_data = new_df.groupby(['block_id','data_id']) \
            .agg({"author": list,
                  "orcid": list,
                  "author_full_text": create_author_name_match,
                  "coauthors": compile_cluster_coauthors}) \
            .reset_index()

        # each (block_id, data_id) group is reduced to its first author / orcid value
        new_data['author'] = new_data['author'].apply(lambda x: x[0])
        new_data['orcid'] = new_data['orcid'].apply(lambda x: x[0])

        # group by cluster to get features to match to
        grouped_df = df.groupby(['block_id','final_clust_num']) \
            .agg({"author": set,
                  "orcid": set,
                  "author_full_text": create_author_name_match,
                  "coauthors": compile_cluster_coauthors, 
                  "data_id": list}) \
            .reset_index()

        # one representative ORCID per cluster ('NONE' if the cluster has none)
        grouped_df['orcid'] = grouped_df['orcid'].apply(get_orcid_id_from_set)
        grouped_df['author'] = grouped_df['author'].apply(lambda x: list(x))

        # dataframe to append all data to
        cluster_data = pd.DataFrame()

        # match new data with orcid if possible
        # NOTE(review): if two clusters of the block carry the same ORCID this left
        # merge would emit the new row once per cluster -- confirm that cannot happen.
        orcid_cluster_data = new_data \
            .merge(grouped_df[grouped_df['orcid']!='NONE']\
                   [['orcid','block_id','final_clust_num']].drop_duplicates(), 
                   how='left', on=['block_id','orcid'])

        cluster_data = pd.concat([cluster_data,
                                  orcid_cluster_data[~orcid_cluster_data['final_clust_num'].isnull()].copy()], 
                                 axis=0)

        # rows with no ORCID match move on to the next stage
        data_left_to_cluster = orcid_cluster_data[orcid_cluster_data['final_clust_num'].isnull()].copy()

        if data_left_to_cluster.shape[0] > 0:
            # do name coauthor and coauthor matching

            author_match_cluster_data = author_coauthor_match(grouped_df, data_left_to_cluster)

            cluster_data = pd.concat([cluster_data,
                                      author_match_cluster_data[~author_match_cluster_data['final_clust_num']
                                                                .isnull()].copy()], 
                                     axis=0)

            # re-attach the feature columns for the rows that are still unassigned
            data_left_to_cluster = new_data.merge(author_match_cluster_data[author_match_cluster_data['final_clust_num']
                                                             .isnull()][['block_id','data_id']].copy(), 
                                                  how='inner', on=['block_id','data_id'])
            if data_left_to_cluster.shape[0] > 0:
                # check if block has single large cluster
                lop_cluster_bool = check_for_lop_cluster(grouped_df)

                if lop_cluster_bool:
                    # if so and name matches with no ORCID clash, merge into cluster

                    lop_cluster_data = merge_data_into_lop_cluster(grouped_df, data_left_to_cluster)

                    cluster_data = pd.concat([cluster_data,
                                              lop_cluster_data[~lop_cluster_data['final_clust_num'].isnull()].copy()], 
                                         axis=0)

                    data_left_to_cluster = new_data.merge(lop_cluster_data[lop_cluster_data['final_clust_num']
                                                             .isnull()][['block_id','data_id']].copy(), 
                                                          how='inner', on=['block_id','data_id'])
                else:
                    pass

            else:
                pass

        else:
            pass
        
        # after all of above if there is still data, assign it to its own cluster
        if data_left_to_cluster.shape[0] > 0:
            single_cols = list(data_left_to_cluster.columns)
            # the row index (exposed as 'id') makes each new single-row cluster id distinct
            single_clusters = data_left_to_cluster.reset_index().copy()
            single_clusters.columns = ['id'] + single_cols
            single_clusters['final_clust_num'] = single_clusters.apply(lambda x: 
                                                                       f"{x.block_id}_{x.id}_{date_str}SC", axis=1)

            # output = existing rows of clusters that gained data + matched rows + new single clusters
            final_cluster_data = pd.concat([df[df['final_clust_num'].isin(cluster_data['final_clust_num'].tolist())]\
                                            [cluster_cols], 
                                            cluster_data[cluster_cols], 
                                            single_clusters[cluster_cols]], axis=0)
        else:
            final_cluster_data = pd.concat([df[df['final_clust_num'].isin(cluster_data['final_clust_num'].tolist())]\
                                            [cluster_cols], 
                                            cluster_data[cluster_cols]], axis=0)
            
        # final_clust is final_clust_num without its "<block_id>_" prefix
        final_cluster_data['final_clust'] = final_cluster_data.apply(lambda x: 
                                                                 x.final_clust_num[len(x.block_id)+1:], axis=1)
    else:
        # for when there are no clusters to match to
        single_cols = list(new_df.columns)
        final_cluster_data = new_df.reset_index().copy()
        final_cluster_data.columns = ['id'] + single_cols
        final_cluster_data['final_clust'] = final_cluster_data.apply(lambda x: f"{x.id}_{date_str}NCD", axis=1)
    
    return final_cluster_data[['block_id','data_id','final_clust']]

In [19]:
def check_for_lop_cluster(cluster_df):
    """Tell whether a block consists of exactly one cluster with more than 10 records.

    Args:
        cluster_df: One row per cluster, with ``final_clust_num`` and a ``data_id``
            column holding the collection of record ids in that cluster.

    Returns:
        bool: ``True`` when there is a single distinct cluster and the first row's
        ``data_id`` collection has more than 10 entries.
    """
    distinct_clusters = cluster_df['final_clust_num'].nunique()
    first_cluster_size = cluster_df['data_id'].apply(len).tolist()[0]
    return distinct_clusters == 1 and first_cluster_size > 10

In [20]:
def merge_data_into_lop_cluster(cluster_df, new_df):
    """Assign new records to a cluster when the author name matches and the ORCIDs do not clash.

    For each new record the clusters are scanned in order and the first compatible
    one wins. If both the cluster and the new record have any name features, they
    are compared with ``check_for_match_with_author_list``; otherwise the raw author
    name is compared with ``check_for_match_no_author_list``. The cluster must also
    pass ``check_orcid``. Unlike ``author_coauthor_match`` no shared coauthor is required.

    Args:
        cluster_df: One row per cluster with ``final_clust_num``, ``author``,
            ``author_full_text`` and ``orcid``.
        new_df: New records with ``block_id``, ``data_id``, ``author``,
            ``author_full_text`` and ``orcid``.

    Returns:
        DataFrame: ``block_id``, ``data_id`` and ``final_clust_num`` for every new
        record; ``final_clust_num`` is NaN where no cluster matched.
    """
    cluster_rows = list(zip(cluster_df['final_clust_num'].tolist(),
                            cluster_df['author'].tolist(),
                            cluster_df['author_full_text'].tolist(),
                            cluster_df['orcid'].tolist()))
    new_rows = zip(new_df['data_id'].tolist(),
                   new_df['author'].tolist(),
                   new_df['author_full_text'].tolist(),
                   new_df['orcid'].tolist())

    new_cluster_dict = {}
    for new_data_id, new_author_name, new_author_list, new_orcid in new_rows:
        new_has_name_features = any(new_author_list[:6])
        for cluster_id, author_names, author_list, orcid in cluster_rows:
            if any(author_list[:6]) and new_has_name_features:
                # both sides have parsed name features: compare them slot by slot
                match_check = check_for_match_with_author_list(new_author_list, author_list)
            else:
                # fall back to comparing the raw author strings
                match_check = check_for_match_no_author_list(new_author_name, author_names)

            if match_check and check_orcid(orcid, new_orcid):
                new_cluster_dict[new_data_id] = cluster_id
                break

    new_cluster_data = new_df[['block_id','data_id']].copy()
    # np.nan rather than np.NaN: the capitalised alias no longer exists in NumPy 2.
    new_cluster_data['final_clust_num'] = new_cluster_data['data_id'].apply(lambda x: new_cluster_dict.get(x, np.nan))

    return new_cluster_data

In [21]:
def author_coauthor_match(cluster_df, new_df):
    """Assign new records to clusters that match on name, share a coauthor and have no ORCID clash.

    For each new record the clusters are scanned in order and the first compatible
    one wins. If both the cluster and the new record have any name features, they
    are compared with ``check_for_match_with_author_list``; otherwise the raw author
    name is compared with ``check_for_match_no_author_list``. In addition at least
    one coauthor must be common to both, and ``check_orcid`` must pass.

    Args:
        cluster_df: One row per cluster with ``final_clust_num``, ``author``,
            ``author_full_text``, ``coauthors`` and ``orcid``.
        new_df: New records with ``block_id``, ``data_id``, ``author``,
            ``author_full_text``, ``coauthors`` and ``orcid``.

    Returns:
        DataFrame: ``block_id``, ``data_id`` and ``final_clust_num`` for every new
        record; ``final_clust_num`` is NaN where no cluster matched.
    """
    cluster_rows = list(zip(cluster_df['final_clust_num'].tolist(),
                            cluster_df['author'].tolist(),
                            cluster_df['author_full_text'].tolist(),
                            cluster_df['coauthors'].tolist(),
                            cluster_df['orcid'].tolist()))
    new_rows = zip(new_df['data_id'].tolist(),
                   new_df['author'].tolist(),
                   new_df['author_full_text'].tolist(),
                   new_df['coauthors'].tolist(),
                   new_df['orcid'].tolist())

    new_cluster_dict = {}
    for new_data_id, new_author_name, new_author_list, new_coauthors_list, new_orcid in new_rows:
        new_has_name_features = any(new_author_list[:6])
        # Built once per new record so each cluster check is a set lookup.
        new_coauthor_set = set(new_coauthors_list)

        for cluster_id, author_names, author_list, coauthor_list, orcid in cluster_rows:
            if any(author_list[:6]) and new_has_name_features:
                # both sides have parsed name features: compare them slot by slot
                match_check = check_for_match_with_author_list(new_author_list, author_list)
            else:
                # fall back to comparing the raw author strings
                match_check = check_for_match_no_author_list(new_author_name, author_names)

            shares_coauthor = not new_coauthor_set.isdisjoint(coauthor_list)
            if match_check and shares_coauthor and check_orcid(orcid, new_orcid):
                new_cluster_dict[new_data_id] = cluster_id
                break

    new_cluster_data = new_df[['block_id','data_id']].copy()
    # np.nan rather than np.NaN: the capitalised alias no longer exists in NumPy 2.
    new_cluster_data['final_clust_num'] = new_cluster_data['data_id'].apply(lambda x: new_cluster_dict.get(x, np.nan))

    return new_cluster_data

In [22]:
def check_orcid(cluster_orcid, new_orcid):
    """Tell whether two ORCID values are compatible.

    Args:
        cluster_orcid: ORCID of the cluster, or ``'NONE'``.
        new_orcid: ORCID of the new record, or ``'NONE'``.

    Returns:
        bool: ``True`` when either value is the ``'NONE'`` placeholder or both are equal.
    """
    either_missing = cluster_orcid == 'NONE' or new_orcid == 'NONE'
    return either_missing or new_orcid == cluster_orcid

In [23]:
def check_for_match_no_author_list(new_author_name, author_names):
    """Case-insensitively test whether an author name appears among a cluster's names.

    Args:
        new_author_name: Author name of the new record.
        author_names: Iterable of author names already in the cluster.

    Returns:
        bool: ``True`` when the lower-cased new name equals any lower-cased cluster name.
    """
    wanted = new_author_name.lower()
    known_lowered = [known.lower() for known in author_names]
    return wanted in known_lowered

In [24]:
def check_for_match_with_author_list(new_author_list, author_list):
    """Compare two name feature lists via ``check_block_vs_block``.

    Args:
        new_author_list: Name feature list of the new record.
        author_list: Name feature list of the cluster.

    Returns:
        bool: The truth value of ``check_block_vs_block(new_author_list, author_list)``.
    """
    return bool(check_block_vs_block(new_author_list, author_list))

In [25]:
def new_data_to_clusters(partition_id, block_ids, local_data_path, date_str):
    """Cluster the new data of one partition and write the assignments to a parquet file.

    If the partition already has cluster files, every block id is run through
    ``assign_new_data_to_clusters``. If it has none, every new row gets its own
    cluster id of the form ``"<row_index>NB"``. Summary counts are logged and the
    result is written to
    ``<local_data_path>/partition_<partition_id>/002_cluster_files/<date_str>_clusters.parquet``.

    Args:
        partition_id: Partition whose files are processed.
        block_ids: Block ids present in the partition's new data.
        local_data_path: Root folder containing the ``partition_<id>`` folders.
        date_str: Run identifier used in new cluster ids and in the output file name.

    Returns:
        None.
    """
    _logger.info(f"----------------- PARTITION {partition_id} -----------------")

    # get all files for partition ID (both data files and cluster files)
    data_files = get_all_filenames(local_data_path, partition_id, sub_directory='001_data_files')
    cluster_files = get_all_filenames(local_data_path, partition_id, sub_directory='002_cluster_files')

    if len(cluster_files) > 0:

        _logger.info("-------- found data and cluster files")

        final_df, new_df = get_merged_files_for_clustering(data_files, cluster_files, block_ids)

        _logger.info("-------- loaded data")

        # send block_ids through clustering; collect the pieces and concatenate once
        # instead of re-copying a growing frame on every iteration
        block_outputs = [assign_new_data_to_clusters(final_df, new_df, block_id, date_str)
                         for block_id in block_ids]
        if block_outputs:
            final_new_clust_df = pd.concat(block_outputs, axis=0)
        else:
            # no blocks to process: keep the expected columns so the steps below still work
            final_new_clust_df = pd.DataFrame(columns=['block_id', 'data_id', 'final_clust'])

    else:
        new_df = get_data_file_and_dedup(data_files, file_type='new_data')

        new_file_cols = list(new_df.columns)

        # NOTE(review): this branch writes every data column plus row_index, whereas the
        # branch above writes only block_id/data_id/final_clust -- confirm that later
        # readers of the cluster files tolerate the extra columns.
        final_new_clust_df = new_df.reset_index()
        final_new_clust_df.columns = ['row_index'] + new_file_cols
        final_new_clust_df['final_clust'] = final_new_clust_df['row_index'].apply(lambda x: f"{x}NB")

    new_cluster_count = final_new_clust_df[final_new_clust_df['final_clust'].str.contains(f'{date_str}SC')].shape[0]
    new_id_count = final_new_clust_df[final_new_clust_df['final_clust'].str.contains('NB')].shape[0]

    _logger.info(f"New works: {new_df.shape[0]}")
    _logger.info(f"Number of works to be updated: {final_new_clust_df.shape[0]}")
    _logger.info(f"Number of new clusters: {new_cluster_count}")
    _logger.info(f"Number of new ids: {new_id_count}")

    # write out clusters to the correct partition
    final_new_clust_df \
    .to_parquet(f"{local_data_path}/partition_{partition_id}/002_cluster_files/{date_str}_clusters.parquet")

In [26]:
def compile_all_new_clusters(partition_list, local_path, date_str):
    """Stack this run's cluster files from several partitions into one DataFrame.

    Args:
        partition_list: Partition ids whose cluster file should be read.
        local_path: Root folder containing the ``partition_<id>`` folders.
        date_str: Run identifier that prefixes the cluster file name.

    Returns:
        DataFrame: All rows of the per-partition cluster files; an empty DataFrame
        when ``partition_list`` is empty.
    """
    partition_frames = [
        pd.read_parquet(f"{local_path}/partition_{partition_id}/002_cluster_files/{date_str}_clusters.parquet")
        for partition_id in partition_list
    ]
    if not partition_frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(partition_frames, axis=0)

In [27]:
def write_file_to_s3(clusters, save_cluster_prefix, date_str, bucket_name="author-name-disambiguation"):
    """Write the cluster assignments to S3 as a gzip-compressed, header-less CSV.

    The output has three columns: ``cluster_id``, ``work`` and ``seq_no``.
    ``cluster_id`` is the 0-based position of the cluster after grouping by the full
    id ``"{block_id}_{final_clust}"``; the full id itself is kept only as the
    intermediate ``full_id`` column and is not written. ``work`` is the part of
    ``data_id`` before the first underscore with its first character removed, and
    ``seq_no`` is the part after the first underscore.

    Side effect: a ``cluster_id`` column holding the full id is added to ``clusters``
    in place.

    Args:
        clusters: DataFrame with ``block_id``, ``data_id`` and ``final_clust``.
        save_cluster_prefix: Key prefix (ending in ``/``) under which the file is stored.
        date_str: Run identifier used in the file name.
        bucket_name: Destination bucket; defaults to the bucket previously hard-coded here.

    Returns:
        bool: Always ``True`` once the file has been written.
    """
    clusters['cluster_id'] = clusters.apply(lambda x: f"{x.block_id}_{x.final_clust}", axis=1)

    # The second reset_index() exposes the group position, which becomes the written cluster_id.
    grouped_data = clusters.groupby('cluster_id')['data_id'].apply(list).reset_index()\
        .rename(columns={"data_id":"matched_papers"}).reset_index()

    grouped_data.columns = ['cluster_id','full_id','matched_papers']

    to_write = grouped_data[['cluster_id','matched_papers']].explode('matched_papers').drop_duplicates().copy()

    to_write['work'] = to_write['matched_papers'].apply(lambda x: x.split("_")[0][1:])
    to_write['seq_no'] = to_write['matched_papers'].apply(lambda x: x.split("_")[1])

    destination = f"s3://{bucket_name}/{save_cluster_prefix}clusters_{date_str}.csv.gz"
    to_write[['cluster_id','work','seq_no']] \
    .to_csv(destination, compression='gzip', header=None, index=None)
    return True

In [65]:
def main():
    """Run the pipeline end to end for one new data file.

    Steps: load the new data and attach partitions, write each partition's rows to
    its data folder, cluster every partition, combine the per-partition cluster
    files, upload the result to S3 and move the processed input file to the archive
    prefix. Nothing is written when the new data file has no rows.
    """
    # Local import: used only for the archive step below.
    import subprocess

    # variables and paths
    node = "1"
    local_data_path = "/home/ec2-user/data"
    # Read the clock once so the hour-level and day-level stamps cannot disagree.
    run_started = datetime.now()
    datetime_str = run_started.strftime("%Y_%m_%d_%H")
    date_str = run_started.strftime("%Y_%m_%d")
    bucket_name = "author-name-disambiguation"
    new_data_prefix = f"V1/data/002_IN_PROGRESS/NODE_{node}/"
    save_cluster_prefix = f"V1/data/003_COMPLETED_CLUSTERS/NODE_{node}/{date_str}/"
    supp_data_path = "/home/ec2-user/data/000_supp_data/"
    block_id_partition_file = f"{supp_data_path}block_id_partition_mapping.parquet"

    # transform new data
    _logger.info("Transforming data and generating block ID and partition list")
    transformed_data, block_part_list, new_data_file_path = new_data_and_transformed(bucket_name,
                                                                                     new_data_prefix,
                                                                                     block_id_partition_file,
                                                                                     node)
    print(transformed_data.shape)

    if transformed_data.shape[0] > 0:
        # write data to appropriate partition folder (using mapping)
        _logger.info("Writing data to each partition")
        for partition_id, _ in block_part_list:
            partition_rows = transformed_data[transformed_data['partition']==partition_id]
            _logger.info(f"____{partition_id} - {partition_rows.shape[0]} works")
            partition_rows.to_parquet(
                f"{local_data_path}/partition_{partition_id}/001_data_files/{datetime_str}_supp_data.parquet")

        # run all code to put all new data into existing cluster or new cluster
        _logger.info("Clustering the data")
        for partition_id, block_ids in block_part_list:
            _logger.info(f"Running clustering for partition {partition_id}")
            _ = new_data_to_clusters(partition_id, block_ids, local_data_path, datetime_str)

        # compile all new clusters from partitions into new file
        _logger.info("Compiling into single file")
        compiled_cluster_file = compile_all_new_clusters([i[0] for i in block_part_list],
                                                         local_data_path, datetime_str)

        # write data to S3
        _logger.info("Writing to S3")
        _ = write_file_to_s3(compiled_cluster_file, save_cluster_prefix, datetime_str)

        # move to archive
        _logger.info(f"Completed file: {new_data_file_path}")
        source_uri = f"s3://{bucket_name}/{new_data_file_path}"
        archive_uri = f"s3://{bucket_name}/V1/data/ZZZ_Archive/"
        # Argument list instead of a shell string: keys containing spaces or shell
        # metacharacters are passed through verbatim, and a failed move is logged
        # rather than silently ignored.
        try:
            archive_result = subprocess.run(["aws", "s3", "mv", source_uri, archive_uri])
        except OSError as archive_error:
            _logger.error(f"Could not run the aws CLI to archive {source_uri}: {archive_error}")
        else:
            if archive_result.returncode != 0:
                _logger.error(f"Archiving {source_uri} failed with exit code {archive_result.returncode}")
    else:
        _logger.info("Empty dataframe so exiting the program")

In [58]:
%%time
# Run the whole pipeline; the %%time cell magic above reports how long the run took.
main()

(651029, 7)
move: s3://author-name-disambiguation/V1/data/002_IN_PROGRESS/NODE_1/2023_02_20_20_10_1.parquet to s3://author-name-disambiguation/V1/data/ZZZ_Archive/2023_02_20_20_10_1.parquet
CPU times: user 5min 44s, sys: 21.3 s, total: 6min 5s
Wall time: 6min 11s
