In [None]:
import boto3
import configparser
from io import StringIO
from botocore.exceptions import ClientError

In [None]:
# Parse the local configuration file. The context manager closes the file
# handle once parsing is done instead of leaving it open for the kernel's lifetime.
config = configparser.ConfigParser()
with open('covid19-analytics.config') as config_file:
    config.read_file(config_file)

In [None]:
# AWS credentials, read from the [AWS] section
KEY, SECRET = [config.get('AWS', option) for option in ('KEY', 'SECRET')]

# Source-side S3 settings, read from the [S3] section
SRC_S3, SRC_BUCKETS, SRC_REGION = [
    config.get('S3', option)
    for option in ('SRC_S3', 'SRC_BUCKETS', 'SRC_REGION')
]

# Target-side S3 settings, read from the [S3] section
TARGET_S3, TARGET_OUTPUT_S3, TARGET_REGION = [
    config.get('S3', option)
    for option in ('TARGET_S3', 'TARGET_OUTPUT_S3', 'TARGET_REGION')
]

In [None]:
# S3 client used only to fetch the config file; no explicit credentials are
# passed here (grant Glue access with an IAM role)
config_s3_client = boto3.client('s3')

# Location of the config file in S3 (configure in Glue Job parameters)
# NOTE(review): CONFIG_BUCKET_NAME and CONFIG_FILE_KEY are not defined in the
# cells visible here - confirm they are set before this cell runs.
config_bucket_name = CONFIG_BUCKET_NAME
config_file_key = CONFIG_FILE_KEY

# Fetch the object and decode its body as UTF-8 text
config_obj = config_s3_client.get_object(
    Bucket=config_bucket_name,
    Key=config_file_key,
)
config_data = str(config_obj['Body'].read(), encoding='utf-8')

# Build a parser from the downloaded text (rebinds `config`)
config = configparser.ConfigParser()
config.read_string(config_data)

In [None]:
# Access parameters stored in config file
aws_key = config['AWS']['KEY'] # Consider removing and granting Glue the necessary permissions
aws_secret = config['AWS']['SECRET'] # Consider removing and granting Glue the necessary permissions

# Every S3 setting is looked up through `config`; indexing the bare list
# literal ['S3'] with a string key raises TypeError.
src_bucket_name = config['S3']['SRC_BUCKET_NAME'] # Public data source bucket name
src_bucket_prefixes = config['S3']['SRC_BUCKET_PREFIXES'] # Source data in multiple prefixes (folders), comma separated
src_bucket_region = config['S3']['SRC_BUCKET_REGION']

job_bucket_name = config['S3']['JOB_BUCKET_NAME']
job_bucket_prefix = config['S3']['JOB_BUCKET_PREFIX']
job_bucket_region = config['S3']['JOB_BUCKET_REGION']

In [None]:
# S3 client for the source data bucket: region only, no explicit credentials
src_s3_client = boto3.client(service_name='s3', region_name=src_bucket_region)

# S3 client for creating and/or accessing the staging (job) bucket, using the
# key pair read from the config file
job_s3_client = boto3.client(
    service_name='s3',
    region_name=job_bucket_region,
    aws_access_key_id=aws_key,
    aws_secret_access_key=aws_secret,
)

In [None]:
# Method to create s3 bucket if it doesn't exist
def create_s3_if_not_exists(bucket_name, bucket_region, s3_client):
    """Create `bucket_name` through `s3_client` unless it already exists.

    Existence is probed with `head_bucket`. A ClientError with code '404' is
    treated as "bucket missing" and triggers `create_bucket`; any other error
    from the probe is printed and the function returns without creating.

    Args:
        bucket_name: Name of the bucket to check/create.
        bucket_region: Region for the new bucket. For 'us-east-1' (or an empty
            value) no CreateBucketConfiguration is sent, because S3 rejects
            'us-east-1' as a LocationConstraint.
        s3_client: Client object exposing `head_bucket` and `create_bucket`.

    Returns:
        None. Progress and probe errors are reported with `print`.

    Raises:
        Whatever `create_bucket` raises; creation errors are not swallowed.
    """
    try:
        # Check if bucket exists
        s3_client.head_bucket(Bucket=bucket_name)
    except ClientError as e:
        if e.response['Error']['Code'] != '404':
            # Anything but "not found" (e.g. access denied): report, don't create
            print(e)
            return
    except Exception as e:
        print(e)
        return
    else:
        print(f"S3 bucket '{bucket_name}' already exists.")
        return

    # Reaching this point means the probe answered 404: the bucket is missing
    create_args = {'Bucket': bucket_name}
    if bucket_region and bucket_region != 'us-east-1':
        create_args['CreateBucketConfiguration'] = {
            'LocationConstraint': bucket_region
        }
    s3_client.create_bucket(**create_args)
    print(f"S3 bucket '{bucket_name}' has been created.")

In [None]:
# Function copies objects under multiple source prefixes into another bucket, reusing the source key names
def copy_objects_from_s3_to_s3(src_bucket_name, src_bucket_prefixes, target_bucket_name, src_s3_client, target_s3_client):
    """Copy every object under the given prefixes into `target_bucket_name`.

    Objects are listed with `src_s3_client` and copied with
    `target_s3_client.copy_object` to the key ``raw_data/<source key>``.
    An object whose target key already exists (``head_object`` succeeds) is
    skipped. Listing, skip and copy events are reported with `print`.

    Args:
        src_bucket_name: Bucket to list and copy from.
        src_bucket_prefixes: Comma-separated string of prefixes, or an
            iterable of prefix strings. Surrounding whitespace is stripped.
            Blank entries (e.g. from a trailing comma) are dropped so they do
            not expand to "list the whole bucket"; if every entry is blank,
            a single empty prefix is kept, as before.
        target_bucket_name: Bucket to copy into.
        src_s3_client: Client exposing `get_paginator('list_objects_v2')`.
        target_s3_client: Client exposing `head_object` and `copy_object`.

    Returns:
        None.
    """
    # Accept either the comma-delimited string from the config file or a list
    if isinstance(src_bucket_prefixes, str):
        src_bucket_prefixes = src_bucket_prefixes.split(',')
    prefixes = [prefix.strip() for prefix in src_bucket_prefixes]
    # Drop blanks created by stray commas; keep one blank if that is all there is
    prefixes = [prefix for prefix in prefixes if prefix] or prefixes[:1]

    # The paginator does not depend on the prefix, so build it once
    paginator = src_s3_client.get_paginator('list_objects_v2')

    # Iterate through the source prefixes and copy the objects under each one
    for src_bucket_prefix in prefixes:
        print(f"\nAccessing src_bucket_prefix: '{src_bucket_prefix}' >>>")

        for page in paginator.paginate(Bucket=src_bucket_name, Prefix=src_bucket_prefix):
            if 'Contents' not in page:
                print(f"No content in '{src_bucket_prefix}'\n")
                continue

            for obj in page['Contents']:
                copy_source = {'Bucket': src_bucket_name, 'Key': obj['Key']}
                target_key = f"raw_data/{obj['Key']}" # places the copy under a raw_data/ prefix

                try:
                    target_s3_client.head_object(Bucket=target_bucket_name, Key=target_key)
                except ClientError as e:
                    if e.response['Error']['Code'] != '404':
                        # Not a plain "missing" answer (e.g. access denied):
                        # report it instead of dropping the object silently
                        print(f"ClientError: {e}")
                        continue
                    # 404: object is absent from the target, fall through to copy
                else:
                    # Move on to the next object; this one is already in the target
                    print(f"Skipping {target_key}, already exists")
                    continue

                print(f"Copying {obj['Key']} to {target_bucket_name}/{target_key}")
                try:
                    target_s3_client.copy_object(
                        CopySource=copy_source,
                        Bucket=target_bucket_name,
                        Key=target_key
                    )
                except ClientError as e:
                    print(f"ClientError: {e}")
                except Exception as e:
                    print(f"Exception: {e}")

In [None]:
# Make sure the job (staging) bucket is present before any objects are copied into it
create_s3_if_not_exists(
    bucket_name=job_bucket_name,
    bucket_region=job_bucket_region,
    s3_client=job_s3_client,
)

In [None]:
# Same check as the previous cell, run again; when the bucket is found the
# function prints its "already exists" message
create_s3_if_not_exists(
    bucket_name=job_bucket_name,
    bucket_region=job_bucket_region,
    s3_client=job_s3_client,
)

In [None]:
# Copy the objects under each configured source prefix into the job bucket
copy_objects_from_s3_to_s3(
    src_bucket_name=src_bucket_name,
    src_bucket_prefixes=src_bucket_prefixes,
    target_bucket_name=job_bucket_name,
    src_s3_client=src_s3_client,
    target_s3_client=job_s3_client,
)