In [1]:
import logging
import os
import re
import subprocess
import sys
import time
from datetime import datetime

import boto3
import pandas as pd
from nameparser import HumanName

In [2]:
# Send all log records to a local file. The record layout is CSV-like:
# timestamp, source location, level, module, and function followed by the message.
# NOTE: the trailing "Z" after the milliseconds is a literal character; the
# timestamps themselves use logging's default (local time) converter.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format=(
        '%(asctime)s.%(msecs)03dZ,'
        '%(pathname)s:%(lineno)d,'
        '%(levelname)s,'
        '%(module)s,'
        '%(funcName)s: %(message)s'
    ),
    filename='log_file.log',
)

# Module-level logger used by every helper in this notebook.
_logger = logging.getLogger(__name__)

In [3]:
def check_author_name(author):
    """Normalize an author name so that given names / initials come first.

    The name is split on spaces and handled according to the number of parts:

    * 1 or 3 parts: returned as-is (re-joined with single spaces).
    * 2 parts ("<first> <second>"):
        - if the second part is a single character, or two characters ending
          in punctuation (``. , ; :``), it is treated as an initial: it is
          upper-cased and moved in front of the first part;
        - if the second part is two letters, neither of them a vowel, and not
          one of the surnames ``ng`` / ``ty``, it is treated as a pair of
          initials and handled the same way;
        - if the second part has three or more characters, the parts are
          swapped only when the first part ends with a comma
          ("Smith, John" -> "John Smith").
      In every swap a trailing comma on the first part is removed.
    * 4 or more parts: if both "(" and ")" appear as stand-alone parts, the
      parts between them are used when there is more than one; otherwise
      everything before "(" is kept.

    Args:
        author: Raw author name string.

    Returns:
        The normalized name with parts joined by single spaces.
    """
    vowels = ("a", "e", "i", "o", "u")

    # Splitting on a single space yields "" for every run of repeated spaces.
    # Drop those empty strings so the part count reflects the real number of
    # name parts (e.g. "Smith,  J" is two parts, not three).
    author_names = [part for part in author.strip().split(" ") if part]

    if len(author_names) == 2:
        leading, trailing = author_names
        # The first part without its trailing comma, used whenever we swap.
        surname = leading[:-1] if leading[-1] == "," else leading

        if len(trailing) == 1:
            author_names = [trailing.upper(), surname]
        elif len(trailing) == 2:
            lowered = trailing.lower()
            if trailing[1] in (".", ",", ";", ":"):
                author_names = [trailing.upper(), surname]
            elif (lowered[0] not in vowels
                  and lowered[1] not in vowels
                  and re.match(r'[a-zA-Z]{2}', trailing)
                  and lowered not in ('ng', 'ty')):
                # Two consonants that are not a known short surname are read
                # as initials; the comma is stripped here too so this branch
                # agrees with the other swap branches.
                author_names = [trailing.upper(), surname]
        elif leading[-1] == ",":
            # "Surname, Given" -> "Given Surname"
            author_names = [trailing, surname]
    elif len(author_names) >= 4:
        if ("(" in author_names) and (")" in author_names):
            open_idx = author_names.index("(")
            close_idx = author_names.index(")")
            bracketed = author_names[open_idx + 1:close_idx]
            if len(bracketed) > 1:
                author_names = bracketed
            else:
                author_names = author_names[:open_idx]

    return " ".join(author_names)

In [4]:
def transform_coauthors(coauthors, author):
    """Normalize a coauthor list and drop entries equal to the author.

    Args:
        coauthors: Expected to be a list of raw coauthor name strings. Any
            falsy value or non-list value is treated as "no coauthors".
        author: The (already normalized) author name to exclude.

    Returns:
        A list of normalized coauthor names that differ from ``author``, or
        ``None`` when the input is empty, not a list, or nothing remains.
    """
    if not coauthors:
        return None
    if not isinstance(coauthors, list):
        return None

    normalized = [check_author_name(raw_name) for raw_name in coauthors]
    remaining = [name for name in normalized if name != author]
    return remaining if remaining else None

In [5]:
def get_block_id(name):
    """Build the lower-cased blocking key for an author name.

    The name is parsed with ``HumanName``. The key is:

    * ``"<first initial>_<last name>"`` when both a first and a last name are
      found;
    * the last name alone when no first name is found;
    * the whole input name when no last name is found.

    Every branch returns a lower-cased string so that keys compare equal
    regardless of the capitalization of the input. (The last-name-only branch
    previously returned the name with its original capitalization.)

    Args:
        name: Author name string.

    Returns:
        The lower-cased block ID string.
    """
    person = HumanName(name)
    first_name = person.first
    last_name = person.last

    if not last_name:
        # Covers both "nothing parsed" and "first name only".
        return name.lower()
    if not first_name:
        return last_name.lower()
    return f"{first_name[0]}_{last_name}".lower()

In [6]:
def get_new_data_paths_s3(bucket_name, data_prefix):
    """List the gzipped JSON-lines object keys under an S3 prefix.

    Args:
        bucket_name: Name of the S3 bucket to list.
        data_prefix: Key prefix to restrict the listing to.

    Returns:
        A list of object keys ending in ``.jsonl.gz``, in listing order. Each
        key found is also written to the log.
    """
    bucket = boto3.resource('s3').Bucket(bucket_name)

    matching_keys = []
    for s3_object in bucket.objects.filter(Prefix=data_prefix).all():
        if not s3_object.key.endswith('.jsonl.gz'):
            continue
        _logger.info(f"Found new data file: {s3_object.key}")
        matching_keys.append(s3_object.key)
    return matching_keys

In [7]:
def start_ec2_instance(instance_id, client, max_attempts=None, retry_delay=1):
    """Ask EC2 to start an instance, retrying until the call returns HTTP 200.

    The request is sent through the ``client`` argument (previously the
    argument was ignored in favour of a module-level ``ec2`` global). Client
    errors are caught via ``client.exceptions.ClientError`` so no extra import
    is needed, logged, and followed by a retry.

    Args:
        instance_id: ID of the EC2 instance to start.
        client: boto3 EC2 client used to send the request.
        max_attempts: Maximum number of calls to make. ``None`` (default)
            retries indefinitely, which matches the original behaviour.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The response of the last call. If ``max_attempts`` is exhausted before
        any call returned a response, this is a placeholder dict whose
        ``HTTPStatusCode`` is 0.
    """
    response = {'ResponseMetadata': {'HTTPStatusCode': 0}}
    attempts = 0
    while True:
        attempts += 1
        try:
            _logger.info(f"Starting EC2 instance {instance_id}")
            response = client.start_instances(InstanceIds=[instance_id], DryRun=False)
        except client.exceptions.ClientError:
            _logger.error('Error turning on the EC2', exc_info=True,
                          stack_info=True)

        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            return response
        if max_attempts is not None and attempts >= max_attempts:
            return response
        # Only wait when another attempt is going to be made.
        time.sleep(retry_delay)

In [8]:
def stop_ec2_instance(instance_id, client, max_attempts=None, retry_delay=1):
    """Ask EC2 to stop an instance, retrying until the call returns HTTP 200.

    The request is sent through the ``client`` argument (previously the
    argument was ignored in favour of a module-level ``ec2`` global). Client
    errors are caught via ``client.exceptions.ClientError`` so no extra import
    is needed, logged, and followed by a retry.

    Args:
        instance_id: ID of the EC2 instance to stop.
        client: boto3 EC2 client used to send the request.
        max_attempts: Maximum number of calls to make. ``None`` (default)
            retries indefinitely, which matches the original behaviour.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The response of the last call. If ``max_attempts`` is exhausted before
        any call returned a response, this is a placeholder dict whose
        ``HTTPStatusCode`` is 0.
    """
    response = {'ResponseMetadata': {'HTTPStatusCode': 0}}
    attempts = 0
    while True:
        attempts += 1
        try:
            _logger.info(f"Stopping EC2 instance {instance_id}")
            response = client.stop_instances(InstanceIds=[instance_id], DryRun=False)
        except client.exceptions.ClientError:
            _logger.error('Error turning off the EC2', exc_info=True,
                          stack_info=True)

        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            return response
        if max_attempts is not None and attempts >= max_attempts:
            return response
        # Only wait when another attempt is going to be made.
        time.sleep(retry_delay)

In [9]:
def save_and_delete_log_file(bucket, save_prefix):
    """Upload the local ``log_file.log`` to S3 under a date/time based key.

    The object key is ``<save_prefix>/<YYYY_MM_DD>/log_file_<HH_MM>.txt``,
    built from the current local time. If the first upload fails with an S3
    upload / client error, the error is printed and the upload is attempted
    once more; a failure of that second attempt propagates to the caller.

    NOTE: despite the function name, the local log file is not deleted.

    Args:
        bucket: Name of the destination S3 bucket.
        save_prefix: Key prefix under which the log file is stored.

    Returns:
        ``True`` once the upload has succeeded.
    """
    file_name = "log_file.log"
    curr_datetime = datetime.now()
    file_date = curr_datetime.strftime("%Y_%m_%d")
    file_time = curr_datetime.strftime("%H_%M")
    object_name = f"{save_prefix}/{file_date}/log_file_{file_time}.txt"

    s3_client = boto3.client('s3')

    # upload_file wraps transfer failures in S3UploadFailedError; ClientError
    # is kept as well, reached through the client so no extra import is needed.
    upload_errors = (boto3.exceptions.S3UploadFailedError,
                     s3_client.exceptions.ClientError)
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except upload_errors as e:
        print(f"Log file save did not work: {e}")
        # Single retry; let a second failure surface to the caller.
        s3_client.upload_file(file_name, bucket, object_name)

    return True

In [10]:
def remove_transferred_data_from_001_new_data(
        bucket_name, data_paths,
        archive_uri="s3://author-name-disambiguation/V1/data/ZZZ_Archive/"):
    """Move processed input files out of the new-data prefix into the archive.

    Each key is moved with ``aws s3 mv``. The command is run as an argument
    list (no shell), so characters in a key cannot be interpreted as shell
    syntax. A failed move is logged and the remaining keys are still
    processed.

    Args:
        bucket_name: Bucket that currently holds the files.
        data_paths: Iterable of object keys to move.
        archive_uri: Destination ``s3://`` URI. Defaults to the archive
            location that was previously hard-coded.

    Returns:
        None.
    """
    for data_path in data_paths:
        source_uri = f"s3://{bucket_name}/{data_path}"
        try:
            completed = subprocess.run(["aws", "s3", "mv", source_uri, archive_uri])
        except OSError:
            # e.g. the aws CLI is not installed / not on PATH
            _logger.error(f"Could not run aws CLI to archive {source_uri}",
                          exc_info=True)
            continue
        if completed.returncode != 0:
            _logger.error(
                f"Archiving {source_uri} failed with exit code {completed.returncode}")

In [11]:
# def lambda_handler(event, context):
_logger.setLevel(logging.DEBUG)
# _logger.info('Received event: ' + str(event))

# define variables and paths
s3_bucket = "author-name-disambiguation"
new_data_prefix = "V1/data/001_NEW_DATA"
curr_datetime = datetime.now()
year_month = curr_datetime.strftime("%Y_%m")
date_str = curr_datetime.strftime("%Y_%m_%d_%H_%M")
max_num_works = 3000000
no_author_data_prefix = "V1/data/XXX_NO_AUTHOR"
new_block_id_node = 1
node_1_data_path = "V1/data/002_IN_PROGRESS/NODE_1/"
node_2_data_path = "V1/data/002_IN_PROGRESS/NODE_2/"
node_3_data_path = "V1/data/002_IN_PROGRESS/NODE_3/"
log_file_save_prefix = "V1/log_files/main"
node_1_id = ""
node_2_id = ""
node_3_id = ""
ec2 = boto3.client('ec2')
data_cols = ['block_id','paper_author_id','orcid','author','coauthors']

# get any data in bucket/prefix
data_in_bucket = get_new_data_paths_s3(s3_bucket, new_data_prefix)

_logger.info(f"Found {len(data_in_bucket)} files in S3")

if data_in_bucket:

    # Read files in listing order and stop before the one that would push the
    # total above max_num_works rows. A file is always accepted while nothing
    # has been loaded yet, so a single oversized file is still processed.
    # Frames are collected in a list and concatenated once after the loop.
    loaded_frames = []
    rows_loaded = 0
    data_transferred = []
    for data in data_in_bucket:
        try:
            temp_new_data = pd.read_json(f"s3://{s3_bucket}/{data}",
                                         compression='gzip', lines=True, orient='records')
        except Exception:
            # Unreadable files are logged and skipped; they are not added to
            # data_transferred and therefore stay in the new-data prefix.
            _logger.error(f'Error trying to read new data: bucket[{s3_bucket}] prefix[{data}]',
                          exc_info=True, stack_info=True)
            continue

        if rows_loaded > 0 and (rows_loaded + temp_new_data.shape[0]) > max_num_works:
            break

        loaded_frames.append(temp_new_data)
        data_transferred.append(data)
        rows_loaded += temp_new_data.shape[0]
        _logger.info(f"Added file with {temp_new_data.shape[0]} rows")
        print(f"Added file with {temp_new_data.shape[0]} rows")

    if not loaded_frames:
        # Without this guard the 'author' lookup below fails with a KeyError
        # that hides the real cause.
        raise RuntimeError(
            f"None of the {len(data_in_bucket)} files under "
            f"s3://{s3_bucket}/{new_data_prefix} could be read; see log for details")

    new_data = pd.concat(loaded_frames, axis=0, ignore_index=True)

    new_data_no_author = new_data[new_data['author'].isnull()].copy()
    new_data = new_data[~new_data['author'].isnull()].copy()

    # save papers with no authors to bucket
    new_data_no_author[['paper_author_id','title']] \
        .to_parquet(f"s3://{s3_bucket}/{no_author_data_prefix}/{year_month}/{date_str}.parquet")

    # normalize author and coauthors
    print("")
    print("Normalizing authors and coauthors")
    new_data['author'] = new_data['author'].apply(check_author_name)
    new_data['coauthors'] = new_data.apply(lambda x: transform_coauthors(x.coauthors, x.author), axis=1)

    # create block_id
    print("Creating block ID")
    new_data['block_id'] = new_data['author'].astype('str').apply(get_block_id)

    # load block_id to node mapping
    print("Loading block ID to node mapping file")
    try:
        block_id_node_mapping = pd.read_parquet("block_id_node_mapping.parquet")
    except Exception:
        print("Error loading the mapping table")
        _logger.error('Error loading the mapping table', exc_info=True,
                          stack_info=True)
        # The following cells cannot run without the mapping, so surface the
        # failure here instead of failing later with a NameError.
        raise
else:
    _logger.info("No new data to disambiguate!")

In [12]:
#     # start up nodes
#     _ = start_ec2_instance(node_1_id, ec2)
#     _ = start_ec2_instance(node_2_id, ec2)
#     _ = start_ec2_instance(node_3_id, ec2)

In [13]:
# Attach node assignments to the new rows with a left join on block_id, so
# rows whose block_id is absent from the mapping are kept.
# (presumably the mapping supplies the 'node' column used in the next cell)
data_to_write = pd.merge(
    left=new_data,
    right=block_id_node_mapping,
    on='block_id',
    how='left',
)

In [14]:
# Rows without a node after the merge are sent to the default node for new
# block IDs; the column is then cast to integers.
node_assignments = data_to_write['node'].fillna(new_block_id_node)
data_to_write['node'] = node_assignments.astype('int')

In [15]:
# Sanity check: number of rows assigned to each node.
data_to_write['node'].value_counts()

2    473091
1    466945
3    425051
Name: node, dtype: int64

In [16]:
# Timestamp string that is embedded in the parquet file names written below.
date_str

'2023_02_21_15_26'

In [17]:
# Write each node's rows to that node's in-progress prefix. One loop replaces
# three copies of the same block; each node's rows are selected once.
node_data_paths = {
    1: node_1_data_path,
    2: node_2_data_path,
    3: node_3_data_path,
}

for node, node_data_path in node_data_paths.items():
    node_rows = data_to_write.loc[data_to_write['node'] == node, data_cols]
    if node_rows.shape[0] > 0:
        node_rows.to_parquet(f"s3://{s3_bucket}/{node_data_path}{date_str}_{node}.parquet")
        # add something for triggering the node here
    else:
        _logger.info(f"No data for node {node}!")
        # add node shutdown here

# Rows assigned to a node with no configured path are not written anywhere;
# make that visible in the log instead of dropping them silently.
unrouted_rows = int((~data_to_write['node'].isin(list(node_data_paths))).sum())
if unrouted_rows:
    _logger.warning(f"{unrouted_rows} rows have a node with no data path and were not written")

In [18]:
# Move the input files that were loaded above out of the new-data prefix.
# The helper moves them to the ZZZ_Archive prefix with `aws s3 mv`; it does
# not delete them.
remove_transferred_data_from_001_new_data(s3_bucket, data_transferred)

move: s3://author-name-disambiguation/V1/data/001_NEW_DATA/batch-398.jsonl.gz to s3://author-name-disambiguation/V1/data/ZZZ_Archive/batch-398.jsonl.gz
move: s3://author-name-disambiguation/V1/data/001_NEW_DATA/batch-399.jsonl.gz to s3://author-name-disambiguation/V1/data/ZZZ_Archive/batch-399.jsonl.gz
move: s3://author-name-disambiguation/V1/data/001_NEW_DATA/batch-400.jsonl.gz to s3://author-name-disambiguation/V1/data/ZZZ_Archive/batch-400.jsonl.gz
move: s3://author-name-disambiguation/V1/data/001_NEW_DATA/batch-401.jsonl.gz to s3://author-name-disambiguation/V1/data/ZZZ_Archive/batch-401.jsonl.gz
move: s3://author-name-disambiguation/V1/data/001_NEW_DATA/batch-402.jsonl.gz to s3://author-name-disambiguation/V1/data/ZZZ_Archive/batch-402.jsonl.gz
move: s3://author-name-disambiguation/V1/data/001_NEW_DATA/batch-403.jsonl.gz to s3://author-name-disambiguation/V1/data/ZZZ_Archive/batch-403.jsonl.gz
move: s3://author-name-disambiguation/V1/data/001_NEW_DATA/batch-404.jsonl.gz to s3://au

In [35]:
# Upload the local log file to S3 under log_file_save_prefix. The helper
# always returns True, so the result is discarded.
# NOTE: despite its name, the helper does not delete the local log file.
_ = save_and_delete_log_file(s3_bucket, log_file_save_prefix)

In [None]:






    


# This cell used to be a bare `else:` left over from the lambda-handler
# layout, which is a SyntaxError when run on its own. It is now an explicit
# check on the file listing produced earlier in the notebook.
if not data_in_bucket:
    print("no new data to disambiguate")
#     _logger.info("No new data to disambiguate!")

#     _ = save_and_delete_log_file()
