In [None]:
import configparser
import shutil
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import StringIO
from pathlib import Path

import boto3
import pandas as pd
from botocore.exceptions import ClientError
from pyspark.sql import SparkSession

In [None]:
# Load pipeline settings from the local config file. The file handle is opened
# in a `with` block so it is closed as soon as parsing finishes; `read_file`
# (unlike `ConfigParser.read`) still raises if the file is missing.
config = configparser.ConfigParser()
with open('covid19-analytics.config') as config_file:
    config.read_file(config_file)

# [AWS] credentials handed to every boto3 client below.
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# [S3] settings for the Athena output location and the final uploads.
TARGET_S3 = config.get('S3', 'TARGET_S3')
TARGET_OUTPUT_S3 = config.get('S3', 'TARGET_OUTPUT_S3')
TARGET_OUTPUT_BUCKET = config.get('S3', 'TARGET_OUTPUT_BUCKET')
TARGET_OUTPUT_DIR = config.get('S3', 'TARGET_OUTPUT_DIR')
TARGET_REGION = config.get('S3', 'TARGET_REGION')

# [GLUE] database whose tables are exported, and the local scratch directory.
SCHEMA_NAME = config.get('GLUE', 'SCHEMA_NAME')
TMP_DIR = config.get('FILE_PATHS', 'TMP_DIR')

In [None]:
# Build the S3, Glue and Athena clients (in that order) with the same region
# and credentials read from the config file.
OUTPUT_S3_CLIENT, GLUE_CLIENT, ATHENA_CLIENT = (
    boto3.client(
        service_name,
        region_name=TARGET_REGION,
        aws_access_key_id=KEY,
        aws_secret_access_key=SECRET,
    )
    for service_name in ('s3', 'glue', 'athena')
)

In [None]:
def list_athena_tables(client, database_name) -> list:
    """Return the names of every table in ``database_name``.

    ``client`` must provide a ``get_tables`` paginator; all pages are walked so
    databases with many tables are listed completely.
    """
    pages = client.get_paginator('get_tables').paginate(DatabaseName=database_name)
    return [entry['Name'] for page in pages for entry in page['TableList']]

In [None]:
# Function to execute an Athena query and download its CSV result for one table
def query_athena_and_fetch_results(
        athena_client,
        s3_client, 
        database, 
        query,
        table,
        output_s3,
        output_dir,
        output_location,
        tmp_dir,
        poll_interval=3,
        max_poll_errors=5):
    """Run ``query`` on Athena, wait for it to finish and download the result.

    Args:
        athena_client: client used for ``start_query_execution`` and
            ``get_query_execution``.
        s3_client: client used for ``download_file`` and ``delete_objects``.
        database: database name placed in the query execution context.
        query: SQL text to execute.
        table: table name; the result is saved as ``{tmp_dir}/{table}.csv``.
        output_s3: bucket the result object is downloaded from and deleted in.
        output_dir: key prefix; the result is read from
            ``{output_dir}/{query_execution_id}.csv``.
        output_location: value passed as Athena's ``OutputLocation``.
        tmp_dir: local directory that receives the downloaded CSV.
        poll_interval: seconds to wait between status polls (default 3).
        max_poll_errors: number of consecutive ``ClientError`` failures of the
            status poll tolerated before the error is re-raised (default 5).

    Raises:
        ClientError: if polling keeps failing or the download fails.
        Exception: if the query ends in the FAILED or CANCELLED state.
    """
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={
            'OutputLocation': output_location,
            'EncryptionConfiguration': {'EncryptionOption': 'SSE_S3'},
        }
    )
    query_execution_id = response['QueryExecutionId']

    # Loop till query execution is complete
    consecutive_poll_errors = 0
    while True:
        try:
            response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        except ClientError as e:
            print(f"\nQuery Error: \n{e}")
            # Never fall through with a stale `response`: retry a bounded
            # number of times, then surface the error to the caller.
            consecutive_poll_errors += 1
            if consecutive_poll_errors >= max_poll_errors:
                raise
            time.sleep(poll_interval)
            continue
        consecutive_poll_errors = 0

        status = response['QueryExecution']['Status']
        state = status['State']

        if state == 'SUCCEEDED':
            print(f"\n{query_execution_id} query has completed successfuly")
            _download_query_result(
                s3_client, output_s3, output_dir, query_execution_id, table, tmp_dir
            )
            return

        if state in ('FAILED', 'CANCELLED'):
            # The reason is looked up defensively so a missing key cannot mask
            # the real failure with a KeyError.
            reason = status.get('StateChangeReason', 'not provided')
            raise Exception(f"Query {state.lower()} with reason: {reason}")

        print(f"\n{query_execution_id} query is still running, waiting {poll_interval} seconds...")
        time.sleep(poll_interval)


def _download_query_result(s3_client, output_s3, output_dir, query_execution_id, table, tmp_dir):
    """Download one query's CSV to ``tmp_dir`` and remove the S3 result objects."""
    results_path = f'{output_dir}/{query_execution_id}.csv'
    local_filename = f'{tmp_dir}/{table}.csv'

    try:
        if Path(local_filename).exists():
            print(f"{local_filename} already exists, skip download")
        else:
            try:
                s3_client.download_file(output_s3, results_path, local_filename)
            except ClientError as e:
                print(f"Download Error: \n{e}")
                # Re-raise so the caller does not treat a missing file as success.
                raise
            print(f"\n{local_filename} downloaded successfuly")
    finally:
        # Cleanup is best-effort and runs whether or not the download worked.
        try:
            s3_client.delete_objects(
                Bucket=output_s3,
                Delete={
                    'Objects': [{'Key': results_path}, {'Key': f'{results_path}.metadata'}],
                    'Quiet': True,
                },
            )
        except ClientError as e:
            print(f"S3 cleanup Error: \n{e}")

In [None]:
# Collect the name of every table in the configured schema via the Glue client.
tables = list_athena_tables(client=GLUE_CLIENT, database_name=SCHEMA_NAME)

In [None]:
def download_table_data(tables, max_workers):
    """Export every table in ``tables`` to a local CSV using a thread pool.

    One ``SELECT *`` query per table is submitted to
    ``query_athena_and_fetch_results`` with at most ``max_workers`` running at
    a time. A failure for one table is printed and does not stop the others.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for table_name in tables:
            job = pool.submit(
                query_athena_and_fetch_results,
                athena_client=ATHENA_CLIENT,
                s3_client=OUTPUT_S3_CLIENT,
                database=SCHEMA_NAME,
                query=f'SELECT * FROM "{table_name}";',
                table=table_name,
                output_s3=TARGET_OUTPUT_S3,
                output_dir=TARGET_OUTPUT_DIR,
                output_location=TARGET_OUTPUT_BUCKET,
                tmp_dir=TMP_DIR,
            )
            pending[job] = table_name

        # Report results in completion order, not submission order.
        for finished in as_completed(pending):
            failed_table = pending[finished]
            try:
                finished.result()
            except Exception as err:
                print(f"Failed to download csv for {failed_table}: {str(err)}")


In [None]:
# Export all listed tables to TMP_DIR, running up to three queries at once.
download_table_data(tables=tables, max_workers=3)

In [None]:
# Load the two downloaded exports that feed the fact table into pandas.
enigma_jhu = pd.read_csv(filepath_or_buffer=f'{TMP_DIR}/enigma_jhu.csv')
testing_data_states_daily = pd.read_csv(
    filepath_or_buffer=f'{TMP_DIR}/testing-datastates_daily.csv'
)


In [None]:
# Fact table: case counts from enigma_jhu joined to the daily testing figures
# on the shared `fips` code (inner join keeps only codes present in both).
factCovid_1 = enigma_jhu.loc[
    :,
    ['fips', 'province_state', 'country_region', 'confirmed', 'deaths', 'recovered', 'active'],
]
factCovid_2 = testing_data_states_daily.loc[
    :,
    ['fips', 'date', 'positive', 'negative', 'hospitalizedcurrently', 'hospitalized', 'hospitalizeddischarged'],
]
factCovid = factCovid_1.merge(factCovid_2, how='inner', on='fips')

In [None]:
# Hospital dimension: read the hospital-beds export and keep only the
# location and identity columns.
dimHospital = pd.read_csv(f'{TMP_DIR}/hospital-bedsjson.csv').loc[
    :,
    [
        'fips',
        'state_name',
        'latitude',
        'longtitude',
        'hq_address',
        'hospital_name',
        'hospital_type',
        'hq_city',
        'hq_state',
    ],
]

In [None]:
# Date dimension: start from the `fips` and `date` columns of the daily
# testing export.
dimDate = pd.read_csv(f'{TMP_DIR}/testing-datastates_daily.csv').loc[:, ['fips', 'date']]

In [None]:
# Parse the YYYYMMDD `date` values, then derive calendar parts from the
# parsed column.
dimDate = dimDate.assign(date=pd.to_datetime(dimDate['date'], format='%Y%m%d'))
dimDate = dimDate.assign(
    year=dimDate['date'].dt.year,
    month=dimDate['date'].dt.month,
    day_of_week=dimDate['date'].dt.dayofweek,
)

In [None]:
# Local Spark session (all available cores) used for the region dimension join.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Join")
    .getOrCreate()
)

In [None]:
# Re-read the exports as Spark DataFrames with a header row and inferred
# column types. Note that `enigma_jhu` is rebound here from the pandas frame
# loaded earlier to a Spark DataFrame.
enigma_jhu = spark.read.options(header=True, inferSchema=True).csv(f'{TMP_DIR}/enigma_jhu.csv')
ny_times_us_county = spark.read.options(header=True, inferSchema=True).csv(f'{TMP_DIR}/us_county.csv')

In [None]:
# Both sides of the region join are repartitioned on `fips` into 4 partitions.
dimRegion_1 = (
    enigma_jhu
    .select('fips', 'province_state', 'country_region', 'latitude', 'longitude')
    .repartition(4, 'fips')
)

# The right-hand key is renamed so the joined frame has no duplicate `fips`.
dimRegion_2 = (
    ny_times_us_county
    .select('fips', 'county', 'state')
    .repartition(4, 'fips')
    .withColumnRenamed('fips', 'fips2')
)

In [None]:
# Inner join of the two region sources on their fips codes.
dimRegion = dimRegion_1.join(
    dimRegion_2,
    on=dimRegion_1["fips"] == dimRegion_2["fips2"],
    how="inner",
)

In [None]:
# Remove the renamed right-hand join key; `fips` from the left side remains.
dimRegion = dimRegion.drop('fips2')

In [None]:
# Write the region dimension as one CSV part file inside a `dimRegion.csv`
# directory. `mode='overwrite'` keeps the cell re-runnable: Spark's default
# save mode raises if the directory is left over from an earlier run.
dimRegion.coalesce(1).write.csv(f'{TMP_DIR}/dimRegion.csv', header=True, mode='overwrite')

In [None]:
# Move the single Spark part file to `dimRegions.csv`. Done in Python rather
# than via `%mv` so paths with spaces or shell metacharacters are handled and
# a missing or ambiguous part file fails with a clear error.
part_files = sorted(Path(f'{TMP_DIR}/dimRegion.csv').glob('part-00000*'))
if len(part_files) != 1:
    raise FileNotFoundError(
        f"Expected exactly one part-00000* file in {TMP_DIR}/dimRegion.csv, found {len(part_files)}"
    )
shutil.move(str(part_files[0]), f'{TMP_DIR}/dimRegions.csv')

In [None]:
# Remove the now-unneeded Spark output directory. `ignore_errors=True` mirrors
# `rm -f`: a directory that is already gone is not an error.
shutil.rmtree(f'{TMP_DIR}/dimRegion.csv', ignore_errors=True)

In [None]:
# In-memory text buffer for CSV serialisation. Writes to a StringIO append at
# the current position, so content accumulates unless the buffer is replaced.
csv_buffer = StringIO()

In [None]:
# Upload the fact table as CSV. `to_csv()` with no target returns the CSV text
# directly, so re-running this cell cannot append to a previously filled
# shared buffer and upload duplicated rows.
OUTPUT_S3_CLIENT.put_object(
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/factCovid.csv',
    Body=factCovid.to_csv(),
    ContentType='text/csv'
)

In [None]:
# Upload the hospital dimension as CSV. The text is produced by `to_csv()`
# itself rather than written into the shared `csv_buffer`, which by this point
# already holds the factCovid CSV and would be uploaded along with it.
OUTPUT_S3_CLIENT.put_object(
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimHospital.csv',
    Body=dimHospital.to_csv(),
    ContentType='text/csv'
)

In [None]:
# Upload the date dimension as CSV. The text is produced by `to_csv()` itself
# rather than written into the shared `csv_buffer`, which by this point
# already holds the earlier tables' CSV and would be uploaded along with it.
OUTPUT_S3_CLIENT.put_object(
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimDate.csv',
    Body=dimDate.to_csv(),
    ContentType='text/csv'
)

In [None]:
# Upload the region dimension straight from the local file written by Spark.
OUTPUT_S3_CLIENT.upload_file(
    Filename=f'{TMP_DIR}/dimRegions.csv',
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimRegions.csv',
)

In [None]:
# Clear the scratch directory. Done in Python with an explicit guard because
# the shell form `rm -r -f {TMP_DIR}/*` expands to `rm -r -f /*` when TMP_DIR
# is empty, and breaks on paths containing spaces.
tmp_root = Path(TMP_DIR)
if not TMP_DIR.strip() or tmp_root.resolve() == Path(tmp_root.resolve().anchor):
    raise ValueError(f"Refusing to clear unsafe TMP_DIR: {TMP_DIR!r}")

if tmp_root.is_dir():
    for entry in tmp_root.iterdir():
        if entry.name.startswith('.'):
            # The shell glob `*` never matched hidden entries; keep that behaviour.
            continue
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry, ignore_errors=True)
        else:
            entry.unlink()