In [None]:
import boto3 
from botocore.exceptions import ClientError
import time
import pandas as pd
import configparser
from pathlib import Path
from io import StringIO

In [None]:
# Parse the notebook's settings from the local config file.
config = configparser.ConfigParser()
# Open the file in a context manager so the handle is closed once parsing is done
# (a bare open() passed to read_file is never closed explicitly).
with open('covid19-analytics.config') as config_file:
    config.read_file(config_file)

# [AWS] section: credentials passed to the boto3 clients.
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# [S3] section: region and output-location settings.
TARGET_S3 = config.get('S3', 'TARGET_S3')
TARGET_OUTPUT_S3 = config.get('S3', 'TARGET_OUTPUT_S3')
TARGET_OUTPUT_BUCKET = config.get('S3', 'TARGET_OUTPUT_BUCKET')
TARGET_OUTPUT_DIR = config.get('S3', 'TARGET_OUTPUT_DIR')
TARGET_REGION = config.get('S3', 'TARGET_REGION')

# [GLUE] section: the database name whose tables are listed and queried.
SCHEMA_NAME = config.get('GLUE', 'SCHEMA_NAME')

# [FILE_PATHS] section: local directory where downloaded CSVs are stored.
TMP_DIR = config.get('FILE_PATHS', 'TMP_DIR')

In [None]:
# Region and credentials shared by every AWS client created in this cell.
_AWS_CLIENT_KWARGS = {
    'region_name': TARGET_REGION,
    'aws_access_key_id': KEY,
    'aws_secret_access_key': SECRET,
}

# One client per service used by the notebook: S3, Glue and Athena.
OUTPUT_S3_CLIENT = boto3.client('s3', **_AWS_CLIENT_KWARGS)
GLUE_CLIENT = boto3.client('glue', **_AWS_CLIENT_KWARGS)
ATHENA_CLIENT = boto3.client('athena', **_AWS_CLIENT_KWARGS)

In [None]:
def list_athena_tables(client, database_name) -> list:
    """Return the names of all tables in ``database_name``.

    Args:
        client: Object exposing ``get_paginator('get_tables')``; the notebook
            passes its Glue client.
        database_name: Value sent as ``DatabaseName`` to the paginator.

    Returns:
        A list with the ``Name`` of every entry in each page's ``TableList``,
        in the order the pages are returned.
    """
    # Paginate so databases with many tables are listed completely.
    pages = client.get_paginator('get_tables').paginate(DatabaseName=database_name)
    return [entry['Name'] for page in pages for entry in page['TableList']]

In [None]:
# Function to execute an Athena query and download its CSV result for one table
def query_athena_and_fetch_results(
        athena_client,
        s3_client, 
        database, 
        query,
        table,
        output_s3,
        output_dir,
        output_location,
        tmp_dir,
        poll_interval=3):
    """Run an Athena query, wait for it to finish and download the result CSV.

    Args:
        athena_client: Client used for ``start_query_execution`` and
            ``get_query_execution``.
        s3_client: Client used for ``download_file`` and ``delete_objects``.
        database: Database name placed in the query execution context.
        query: SQL string to execute.
        table: Table name; the result is saved as ``{tmp_dir}/{table}.csv``.
        output_s3: Bucket name the result object is downloaded from and
            deleted from.
        output_dir: Key prefix; the result key is built as
            ``{output_dir}/{query_execution_id}.csv``.
        output_location: Passed to Athena as the result ``OutputLocation``.
            NOTE(review): presumably this points at ``output_s3``/``output_dir``
            so the key built here matches what Athena writes - confirm in config.
        tmp_dir: Local directory for downloaded CSV files; created if missing.
        poll_interval: Seconds to sleep between status polls (default 3).

    Returns:
        None. The CSV is written to disk as a side effect.

    Raises:
        ClientError: If polling the query status fails.
        Exception: If the query ends in the ``FAILED`` or ``CANCELLED`` state.
    """
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={
            'OutputLocation': output_location,
            'EncryptionConfiguration': {'EncryptionOption': 'SSE_S3'},
        }
    )
    query_execution_id = response['QueryExecutionId']

    # Loop till query execution is complete
    while True:
        try:
            response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        except ClientError as e:
            print(f"\nQuery Error: \n{e}")
            # Re-raise: without a fresh response there is no status to inspect,
            # and reusing the previous response would act on stale data.
            raise

        status = response['QueryExecution']['Status']
        state = status['State']

        if state == 'SUCCEEDED':
            print(f"\n{query_execution_id} query has completed successfuly")

            results_path = f'{output_dir}/{query_execution_id}.csv'
            local_filename = f'{tmp_dir}/{table}.csv'

            if Path(local_filename).exists():
                print(f"{local_filename} already exists, skip download")
            else:
                # Make sure the target directory exists before downloading into it.
                Path(tmp_dir).mkdir(parents=True, exist_ok=True)
                try:
                    s3_client.download_file(output_s3, results_path, local_filename)
                    print(f"\n{local_filename} downloaded successfuly")
                except ClientError as e:
                    print(f"Download Error: \n{e}")

            # Best-effort removal of the result object and its .metadata companion.
            try:
                s3_client.delete_objects(
                    Bucket=output_s3,
                    Delete={
                        'Objects': [{'Key': results_path}, {'Key': f'{results_path}.metadata'}],
                        'Quiet': True,
                    },
                )
            except ClientError as e:
                print(f"S3 cleanup Error: \n{e}")

            return

        if state in ('FAILED', 'CANCELLED'):
            # The reason key may be absent; fall back so the real failure is reported
            # instead of a KeyError.
            reason = status.get('StateChangeReason', 'no reason reported')
            raise Exception(f"Query {state.lower()} with reason: {reason}")

        print(f"\n{query_execution_id} query is still running, waiting {poll_interval} seconds...")
        time.sleep(poll_interval)

In [None]:
# Collect the names of every table in the configured schema via the Glue client.
tables = list_athena_tables(client=GLUE_CLIENT, database_name=SCHEMA_NAME)

In [None]:
# Export each table: run a full-table SELECT and fetch the CSV result locally.
# A failure on one table is reported and the loop moves on to the next.
for TABLE in tables:
    QUERY = 'SELECT * FROM "{}";'.format(TABLE)
    try:
        query_athena_and_fetch_results(
            athena_client=ATHENA_CLIENT,
            s3_client=OUTPUT_S3_CLIENT,
            database=SCHEMA_NAME,
            query=QUERY,
            table=TABLE,
            output_s3=TARGET_OUTPUT_S3,
            output_dir=TARGET_OUTPUT_DIR,
            output_location=TARGET_OUTPUT_BUCKET,
            tmp_dir=TMP_DIR,
        )
    except Exception as err:
        print(f"Failed to download csv for table {TABLE}: {err}")

In [None]:
# Load each table's downloaded CSV from the tmp folder into a dataframe.
# Maps '<table>_df' -> DataFrame read from '<TMP_DIR>/<table>.csv'.
dataframes = {}

for table in tables:
    # Build the exact path the download step writes, instead of substring-matching
    # file names (which would let table 'foo' pick up 'foo_bar.csv').
    csv_path = Path(TMP_DIR) / f'{table}.csv'
    if csv_path.is_file():
        dataframes[f'{table}_df'] = pd.read_csv(csv_path)
        print(f"\nLoaded {csv_path} into {table}_df dataframe")
    else:
        print(f"\nNo CSV found for table {table} at {csv_path}, skipping")
