In [1]:
import time
import configparser
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import boto3
from botocore.exceptions import ClientError

In [2]:
# Load pipeline settings from the local INI file. The context manager closes
# the file handle as soon as parsing finishes; read_file() is kept (rather
# than read()) so a missing file still raises instead of being ignored.
config = configparser.ConfigParser()
with open('covid19-analytics.config') as config_file:
    config.read_file(config_file)

# AWS credentials passed to the boto3 clients.
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# S3 settings: where query results are written and which region to use.
TARGET_OUTPUT_S3 = config.get('S3', 'TARGET_OUTPUT_S3')
TARGET_OUTPUT_BUCKET = config.get('S3', 'TARGET_OUTPUT_BUCKET')
TARGET_OUTPUT_DIR = config.get('S3', 'TARGET_OUTPUT_DIR')
TARGET_REGION = config.get('S3', 'TARGET_REGION')

# Name of the database whose tables are listed and queried.
SCHEMA_NAME = config.get('GLUE', 'SCHEMA_NAME')

# Local directory that receives the downloaded CSV files.
TMP_DIR = config.get('FILE_PATHS', 'TMP_DIR')

In [3]:
# One boto3 client per AWS service used in this notebook. All three share the
# same region and credentials, so they are built from a single template and
# unpacked in order: S3 first, then Glue, then Athena.
OUTPUT_S3_CLIENT, GLUE_CLIENT, ATHENA_CLIENT = (
    boto3.client(
        service_name,
        region_name=TARGET_REGION,
        aws_access_key_id=KEY,
        aws_secret_access_key=SECRET,
    )
    for service_name in ('s3', 'glue', 'athena')
)

In [4]:
def list_athena_tables(client, database_name) -> list:
    """Return the names of all tables found in ``database_name``.

    Args:
        client: Object exposing ``get_paginator('get_tables')``; the returned
            paginator is invoked with ``DatabaseName=database_name``.
        database_name: Database whose tables are listed.

    Returns:
        A list with the ``'Name'`` of every entry in each page's
        ``'TableList'``, in the order the pages deliver them.
    """
    # Pagination keeps this correct when the listing spans several pages.
    pages = client.get_paginator('get_tables').paginate(DatabaseName=database_name)
    return [entry['Name'] for page in pages for entry in page['TableList']]

In [5]:
# Function to execute athena query and retrieve data in all tables
def query_athena_and_fetch_results(
        athena_client,
        s3_client,
        database,
        query,
        table,
        output_s3,
        output_dir,
        output_location,
        tmp_dir,
        poll_interval=3,
        max_poll_errors=5):
    """Run ``query`` on Athena, wait for it, and download the CSV result.

    Args:
        athena_client: Client used for ``start_query_execution`` and
            ``get_query_execution``.
        s3_client: Client used for ``download_file`` and ``delete_objects``.
        database: Database name placed in the query execution context.
        query: SQL text to execute.
        table: Table name; only used to name the local file
            ``{tmp_dir}/{table}.csv``.
        output_s3: Bucket name the result object is downloaded from and
            deleted from.
        output_dir: Key prefix; the result is expected at
            ``{output_dir}/{query_execution_id}.csv``.
        output_location: Athena ``OutputLocation`` for the query results.
            NOTE(review): presumably this is ``s3://{output_s3}/{output_dir}``;
            nothing here verifies that the three values agree.
        tmp_dir: Local directory for the downloaded CSV.
        poll_interval: Seconds to sleep between status polls.
        max_poll_errors: Number of consecutive ``ClientError`` failures of the
            status poll tolerated before the error is re-raised.

    Returns:
        None. The result is the local CSV file.

    Raises:
        Exception: If the query ends in the FAILED or CANCELLED state.
        ClientError: If polling the query status fails ``max_poll_errors``
            times in a row.
    """
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={
            'OutputLocation': output_location,
            'EncryptionConfiguration': {'EncryptionOption': 'SSE_S3'},
        }
    )
    query_execution_id = response['QueryExecutionId']

    # Loop till query execution is complete
    consecutive_poll_errors = 0
    while True:
        try:
            response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        except ClientError as e:
            print(f"\nQuery Error: \n{e}")
            # Never fall through with a stale response: retry the poll, but
            # give up once the error is clearly persistent.
            consecutive_poll_errors += 1
            if consecutive_poll_errors >= max_poll_errors:
                raise
            time.sleep(poll_interval)
            continue
        consecutive_poll_errors = 0

        status = response['QueryExecution']['Status']
        state = status['State']

        if state in ('FAILED', 'CANCELLED'):
            # StateChangeReason may be absent; do not let a KeyError hide
            # the real failure.
            reason = status.get('StateChangeReason', 'no reason reported')
            raise Exception(f"Query {state.lower()} with reason: {reason}")

        if state == 'SUCCEEDED':
            break

        print(f"\n{query_execution_id} query is still running, waiting {poll_interval} seconds...")
        time.sleep(poll_interval)

    print(f"\n{query_execution_id} query has completed successfuly")

    results_path = f'{output_dir}/{query_execution_id}.csv'
    local_filename = f'{tmp_dir}/{table}.csv'

    if Path(local_filename).exists():
        print(f"{local_filename} already exists, skip download")
    else:
        try:
            s3_client.download_file(output_s3, results_path, local_filename)
            print(f"\n{local_filename} downloaded successfuly")
        except ClientError as e:
            # Best-effort: a failed download is reported but does not abort.
            print(f"Download Error: \n{e}")

    # Remove the result object and its .metadata companion from the bucket.
    # This runs whether or not the download above happened or succeeded.
    try:
        s3_client.delete_objects(
            Bucket=output_s3,
            Delete={
                'Objects': [{'Key': results_path}, {'Key': f'{results_path}.metadata'}],
                'Quiet': True,
            },
        )
    except ClientError as e:
        print(f"S3 cleanup Error: \n{e}")

In [6]:
# Names of all tables in SCHEMA_NAME, looked up through the Glue client.
tables = list_athena_tables(client=GLUE_CLIENT, database_name=SCHEMA_NAME)

In [7]:
def download_table_data(tables, max_workers):
    """Export every table in ``tables`` to a local CSV using a thread pool.

    One ``query_athena_and_fetch_results`` job is submitted per table, each
    running ``SELECT * FROM "<table>";`` with the module-level clients and
    settings. A job that raises is reported on stdout and does not stop the
    remaining jobs.

    Args:
        tables: Iterable of table names.
        max_workers: Size of the thread pool.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each submitted job back to its table so failures can be named.
        pending = {}
        for table_name in tables:
            job = pool.submit(
                query_athena_and_fetch_results,
                athena_client=ATHENA_CLIENT,
                s3_client=OUTPUT_S3_CLIENT,
                database=SCHEMA_NAME,
                query=f'SELECT * FROM "{table_name}";',
                table=table_name,
                output_s3=TARGET_OUTPUT_S3,
                output_dir=TARGET_OUTPUT_DIR,
                output_location=TARGET_OUTPUT_BUCKET,
                tmp_dir=TMP_DIR,
            )
            pending[job] = table_name

        # Handle jobs in completion order, not submission order.
        for finished in as_completed(pending):
            table = pending[finished]
            try:
                finished.result()
            except Exception as e:
                print(f"Failed to download csv for {table}: {str(e)}")


In [8]:
# Export all listed tables, running at most three queries at a time.
download_table_data(tables=tables, max_workers=3)

/n6b782899-7d2e-4b86-8323-f38fb5eb7ed7 query is still running, waiting 3 seconds...
/ndfd520ab-4b07-4223-8b56-d6c4d4a82770 query is still running, waiting 3 seconds...
/n3383a124-dc44-4656-ae47-ecf43462488d query is still running, waiting 3 seconds...

6b782899-7d2e-4b86-8323-f38fb5eb7ed7 query has completed successfuly
/n3383a124-dc44-4656-ae47-ecf43462488d query is still running, waiting 3 seconds...

dfd520ab-4b07-4223-8b56-d6c4d4a82770 query has completed successfuly

/workspaces/covid-19-data-pipeline-aws/covid-19-ETL/tmp/static-datacountrycode.csv downloaded successfuly
/n23acb010-2c11-4c45-9b65-4c3764569123 query is still running, waiting 3 seconds...

/workspaces/covid-19-data-pipeline-aws/covid-19-ETL/tmp/hospital-bedsjson.csv downloaded successfuly
/n17cbd773-f7c9-4470-a3e9-ddede66d148a query is still running, waiting 3 seconds...

3383a124-dc44-4656-ae47-ecf43462488d query has completed successfuly

23acb010-2c11-4c45-9b65-4c3764569123 query has completed successfuly

17cbd7