In [1]:
import time
import configparser
from io import StringIO

import boto3
import psycopg2
import pandas as pd
import numpy as np
from botocore.exceptions import ClientError

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [2]:
spark = SparkSession.builder.master("local[*]").appName("Join").getOrCreate()

24/08/09 19:01:55 WARN Utils: Your hostname, codespaces-595706 resolves to a loopback address: 127.0.0.1; using 10.0.1.158 instead (on interface eth0)
24/08/09 19:01:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/09 19:01:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
config = configparser.ConfigParser()
config.read_file(open('covid19-analytics.config'))

KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

TARGET_OUTPUT_BUCKET=config.get('S3', 'TARGET_OUTPUT_BUCKET')
TARGET_OUTPUT_S3 = config.get('S3', 'TARGET_OUTPUT_S3')
TARGET_OUTPUT_DIR=config.get('S3', 'TARGET_OUTPUT_DIR')
TARGET_REGION = config.get('S3', 'TARGET_REGION')
TMP_DIR = config.get('FILE_PATHS', 'TMP_DIR')

DWH_CLUSTER_TYPE = config.get('DWH', 'DWH_CLUSTER_TYPE')
DWH_NUM_NODES = config.get('DWH', 'DWH_NUM_NODES')
DWH_NODE_TYPE = config.get('DWH', 'DWH_NODE_TYPE')
DWH_CLUSTER_IDENTIFIER = config.get('DWH', 'DWH_CLUSTER_IDENTIFIER')
DWH_DB = config.get('DWH', 'DWH_DB')
DWH_DB_USER = config.get('DWH', 'DWH_DB_USER')
DWH_DB_PASSWORD = config.get('DWH', 'DWH_DB_PASSWORD')
DWH_PORT = config.get('DWH', 'DWH_PORT')
DWH_IAM_ROLE_NAME = config.get('DWH', 'DWH_IAM_ROLE_NAME')

In [4]:
OUTPUT_S3_CLIENT = boto3.client(
    's3', 
    region_name=TARGET_REGION,
    aws_access_key_id=KEY, 
    aws_secret_access_key=SECRET
)

redshift_client = boto3.client(
    'redshift',
    region_name=TARGET_REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

ec2_client = boto3.resource(
    'ec2',
    region_name=TARGET_REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

iam_client = boto3.client(
    'iam',
    region_name=TARGET_REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

In [5]:
enigma_jhu = pd.read_csv(f'{TMP_DIR}/enigma_jhu.csv')
testing_data_states_daily = pd.read_csv(f'{TMP_DIR}/testing-datastates_daily.csv')

factCovid_1 = enigma_jhu[['fips', 'province_state', 'country_region', 'confirmed', 'deaths', 'recovered', 'active' ]]
factCovid_2 = testing_data_states_daily[['fips', 'date', 'positive', 'negative', 'hospitalizedcurrently', 'hospitalized', 'hospitalizeddischarged' ]]
factCovid = pd.merge(factCovid_1, factCovid_2, on='fips', how='inner')
print(len(factCovid))

factCovid = factCovid.drop_duplicates(keep='first')

26418


24/08/09 19:02:08 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
dimHospital = pd.read_csv(f'{TMP_DIR}/hospital-bedsjson.csv')
dimHospital =  dimHospital[['fips', 'state_name', 'latitude', 'longtitude', 'hq_address', 'hospital_name', 'hospital_type', 'hq_city', 'hq_state']]
dimHospital = dimHospital.rename(columns={'longtitude': 'longitude'})

dimHospital = dimHospital.drop_duplicates(keep='first')

dimHospital['latitude'] = pd.to_numeric(dimHospital['latitude'], errors= 'coerce')
dimHospital['longitude'] = pd.to_numeric(dimHospital['longitude'], errors= 'coerce')

In [7]:
dimDate = pd.read_csv(f'{TMP_DIR}/testing-datastates_daily.csv')
dimDate = dimDate[['fips', 'date']]

dimDate['date'] = pd.to_datetime(dimDate['date'], format='%Y%m%d')
dimDate['year'] = dimDate['date'].dt.year
dimDate['month'] = dimDate['date'].dt.month
dimDate["day_of_week"] = dimDate['date'].dt.dayofweek

dimDate = dimDate.drop_duplicates(keep='first')

dimDate['fips'] = dimDate['fips'].astype(float)
dimDate['date'] = pd.to_datetime(dimDate['date'], errors= 'coerce')
dimDate['date'] = dimDate['date'].astype('datetime64[ns]')


In [8]:
enigma_jhu = spark.read.csv(
    f'{TMP_DIR}/enigma_jhu.csv', 
    header=True, 
    inferSchema=True
)

ny_times_us_county = spark.read.csv(
    f'{TMP_DIR}/us_county.csv', 
    header=True, 
    inferSchema=True
)

                                                                                

In [9]:
dimRegion_1 = enigma_jhu.select('fips', 'province_state', 'country_region', 'latitude', 'longitude')
dimRegion_2 = ny_times_us_county.select('fips', 'county', 'state')

dimRegion_1 = dimRegion_1.repartition(4, 'fips')
dimRegion_2 = dimRegion_2.repartition(4, 'fips')
dimRegion_2 = dimRegion_2.withColumnRenamed('fips', 'fips2')

In [10]:
dimRegion = dimRegion_1.join(
    dimRegion_2, 
    dimRegion_1["fips"] == dimRegion_2["fips2"], 
    "inner"
)

In [11]:
dimRegion = dimRegion.drop('fips2')
print(dimRegion.count())

dimRegion = dimRegion.distinct()
print(dimRegion.count())

                                                                                

8660980




2882


                                                                                

In [12]:
dimRegion = dimRegion.toPandas()
dimRegion['fips'] = dimRegion['fips'].astype(float)

dimRegion['latitude'] = pd.to_numeric(dimRegion['latitude'], errors= 'coerce')
dimRegion['longitude'] = pd.to_numeric(dimRegion['longitude'], errors= 'coerce')

                                                                                

In [13]:
factCovid.to_csv(f"{TMP_DIR}/factCovid.csv")

OUTPUT_S3_CLIENT.upload_file(
    f"{TMP_DIR}/factCovid.csv",
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/factCovid.csv',
)

In [14]:
dimHospital.to_csv(f"{TMP_DIR}/dimHospital.csv")

OUTPUT_S3_CLIENT.upload_file(
    f"{TMP_DIR}/dimHospital.csv",
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimHospital.csv',
)

In [15]:
dimDate.to_csv(f"{TMP_DIR}/dimDate.csv")

OUTPUT_S3_CLIENT.upload_file(
    f"{TMP_DIR}/dimDate.csv",
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimDate.csv',
)

In [16]:
dimRegion.to_csv(f"{TMP_DIR}/dimRegion.csv")

OUTPUT_S3_CLIENT.upload_file(
    f"{TMP_DIR}/dimRegion.csv",
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimRegion.csv',
)

In [None]:
# %rm -r -f {TMP_DIR}/* # Cleanup tmp directory

In [17]:
# Construct CREATE TABLE SQL dynamically from pandas dataframe
factCovid_sql = f"{pd.io.sql.get_schema(factCovid.reset_index(), 'factCovid')};"
staging_factCovid_sql =  f"{pd.io.sql.get_schema(factCovid.reset_index(), 'staging_factCovid')};"
print(factCovid_sql)

dimHospital_sql = f"{pd.io.sql.get_schema(dimHospital.reset_index(), 'dimHospital')};"
staging_dimHospital_sql = f"{pd.io.sql.get_schema(dimHospital.reset_index(), 'staging_dimHospital')};"
print(dimHospital_sql)

dimDate_sql = f"{pd.io.sql.get_schema(dimDate.reset_index(), 'dimDate')};"
staging_dimDate_sql = f"{pd.io.sql.get_schema(dimDate.reset_index(), 'staging_dimDate')};"
print(dimDate_sql)

dimRegion_sql = f"{pd.io.sql.get_schema(dimRegion.reset_index(), 'dimRegion')};"
staging_dimRegion_sql = f"{pd.io.sql.get_schema(dimRegion.reset_index(), 'staging_dimRegion')};"
print(dimRegion_sql)

CREATE TABLE "factCovid" (
"index" INTEGER,
  "fips" REAL,
  "province_state" TEXT,
  "country_region" TEXT,
  "confirmed" REAL,
  "deaths" REAL,
  "recovered" REAL,
  "active" TEXT,
  "date" INTEGER,
  "positive" REAL,
  "negative" REAL,
  "hospitalizedcurrently" REAL,
  "hospitalized" REAL,
  "hospitalizeddischarged" REAL
);
CREATE TABLE "dimHospital" (
"index" INTEGER,
  "fips" REAL,
  "state_name" TEXT,
  "latitude" REAL,
  "longitude" REAL,
  "hq_address" TEXT,
  "hospital_name" TEXT,
  "hospital_type" TEXT,
  "hq_city" TEXT,
  "hq_state" TEXT
);
CREATE TABLE "dimDate" (
"index" INTEGER,
  "fips" REAL,
  "date" TIMESTAMP,
  "year" INTEGER,
  "month" INTEGER,
  "day_of_week" INTEGER
);
CREATE TABLE "dimRegion" (
"index" INTEGER,
  "fips" REAL,
  "province_state" TEXT,
  "country_region" TEXT,
  "latitude" REAL,
  "longitude" REAL,
  "county" TEXT,
  "state" TEXT
);


In [18]:
# Method implements retries while obtaining redshift properties in case creating cluster is yet complete
def get_redshift_props(redshift_client, cluster_identifier):
    retries = 30
    retry_delay = 30 # Delay between retries in seconds
    for attempt in range(retries):
        try:
            clusterProps = redshift_client.describe_clusters(ClusterIdentifier=cluster_identifier)['Clusters'][0]
            if clusterProps['ClusterAvailabilityStatus'] == 'Available':
                return clusterProps
            elif clusterProps['ClusterAvailabilityStatus'] != 'Available':
                if attempt < retries -1:
                    print(f"Cluster '{cluster_identifier}' not ready. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
        except redshift_client.exceptions.ClusterNotFoundFault as e:
            if attempt < retries -1:
                print(f"Cluster '{cluster_identifier}' not found. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay/3)
            else:
                raise e # Raise the last exception if the retries are exhausted

In [19]:
def pretty_redshift_props(props):
    pd.set_option('display.max.colwidth', 0)
    keysToShow = ['ClusterIdentifier', 'ClusterStatus', 'NodeType', 'NumberOfNodes', 'DBName', 'MasterUsername', 'Endpoint', 'VpcId']
    x = [(k, v) for k, v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=['Parameter', 'value'])

In [20]:
clusterProps = get_redshift_props(redshift_client, DWH_CLUSTER_IDENTIFIER)
if clusterProps:
    prettyClusterProps = pretty_redshift_props(clusterProps)
    DWH_ENDPOINT = clusterProps['Endpoint']['Address']
    DWH_ROLE_ARN = clusterProps['IamRoles'][0]['IamRoleArn']

In [21]:
prettyClusterProps

Unnamed: 0,Parameter,value
0,ClusterIdentifier,covid19-redshift-cluster-1
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,oseloka
4,DBName,covid19-redshift-db-1
5,Endpoint,"{'Address': 'covid19-redshift-cluster-1.covkciolfldm.us-east-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0af307b31fe59e41a
7,NumberOfNodes,1


In [22]:
try:
    vpc = ec2_client.Vpc(id=clusterProps['VpcId'])
    default_SG = list(vpc.security_groups.all())[0]
    print(default_SG)

    default_SG.authorize_ingress(
        GroupName=default_SG.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT),
    )
except ClientError as e:
    # Check for duplicate rule errors
    error_code = e.response['Error']['Code']
    if error_code == 'InvalidPermission.Duplicate':
        print('Security group rule exists, no further actions required')
    else:
        raise e
except Exception as e:
    raise e

ec2.SecurityGroup(id='sg-019036ec555f0d73b')
Security group rule exists, no further actions required


In [23]:
try:
    conn = psycopg2.connect(
        host=DWH_ENDPOINT,
        dbname=DWH_DB,
        user=DWH_DB_USER,
        password=DWH_DB_PASSWORD,
        port=int(DWH_PORT)
    )
except Exception as e:
    print(e)

conn.set_session(autocommit=True)

In [24]:
try:
    cur = conn.cursor()
except Exception as e:
    print("Error: Could not obtain database cursor")
    print(e)

In [54]:
# Create tables
try:
    cur.execute(staging_factCovid_sql)
    cur.execute(factCovid_sql)
except Exception as e:
    print(e)

try:
    cur.execute(staging_dimHospital_sql)
    cur.execute(dimHospital_sql)
except Exception as e:
    print(e)

try:
    cur.execute(staging_dimDate_sql)
    cur.execute(dimDate_sql)
except Exception as e:
    print(e)
    
try:
    cur.execute(staging_dimRegion_sql)
    cur.execute(dimRegion_sql)
except Exception as e:
    print(e)

Relation "staging_factcovid" already exists

Relation "staging_dimhospital" already exists

Relation "staging_dimdate" already exists

Relation "staging_dimregion" already exists



In [55]:
factCovid.head(2)

Unnamed: 0,fips,province_state,country_region,confirmed,deaths,recovered,active,date,positive,negative,hospitalizedcurrently,hospitalized,hospitalizeddischarged
0,72.0,Puerto Rico,US,3.0,0.0,0.0,,20210307,101327.0,305972.0,147.0,,
1,72.0,Puerto Rico,US,3.0,0.0,0.0,,20210306,101327.0,305972.0,147.0,,


In [None]:
dimDate.head(2)

Unnamed: 0,fips,date,year,month,day_of_week
0,2.0,2021-03-07,2021,3,6
1,1.0,2021-03-07,2021,3,6


In [None]:
dimRegion.head(2)

Unnamed: 0,fips,province_state,country_region,latitude,longitude,county,state
0,53061.0,Washington,US,48.033,-121.834,Snohomish,Washington
1,44007.0,Rhode Island,US,41.824,-71.413,Providence,Rhode Island


In [None]:
dimHospital.head(2)

Unnamed: 0,fips,state_name,latitude,longitude,hq_address,hospital_name,hospital_type,hq_city,hq_state
0,4013.0,Arizona,33.495498,-112.066157,650 E Indian School Rd,Phoenix VA Health Care System (AKA Carl T Hayden VA Medical Center),VA Hospital,Phoenix,AZ
1,4019.0,Arizona,32.181263,-110.965885,3601 S 6th Ave,Southern Arizona VA Health Care System,VA Hospital,Tucson,AZ


In [25]:
try:
    cur.execute(
    f"""
    copy staging_dimhospital
    from '{TARGET_OUTPUT_BUCKET}dimHospital.csv'
    credentials 'aws_iam_role={DWH_ROLE_ARN}'
    delimiter ','
    region '{TARGET_REGION}'
    IGNOREHEADER 1
    EMPTYASNULL
    BLANKSASNULL
    MAXERROR 100
    """
    )
except ClientError as error:
    print(error)
except Exception as e:
    print(e)

In [38]:
try:
    cur.execute(
    f"""
    copy staging_factCovid
    from '{TARGET_OUTPUT_BUCKET}factCovid.csv'
    credentials 'aws_iam_role={DWH_ROLE_ARN}'
    delimiter ','
    region '{TARGET_REGION}'
    IGNOREHEADER 1
    EMPTYASNULL
    BLANKSASNULL
    MAXERROR 100
    """
    )
except ClientError as error:
    print(error)
except Exception as e:
    print(e)

In [26]:
try:
    cur.execute(
    f"""
    copy staging_dimdate
    from '{TARGET_OUTPUT_BUCKET}dimDate.csv'
    credentials 'aws_iam_role={DWH_ROLE_ARN}'
    delimiter ','
    region '{TARGET_REGION}'
    IGNOREHEADER 1
    EMPTYASNULL
    BLANKSASNULL
    MAXERROR 100
    """
    )
except ClientError as error:
    print(error)
except Exception as e:
    print(e)

In [27]:
try:
    cur.execute(
    f"""
    copy staging_dimRegion
    from '{TARGET_OUTPUT_BUCKET}dimRegion.csv'
    credentials 'aws_iam_role={DWH_ROLE_ARN}'
    delimiter ','
    region '{TARGET_REGION}'
    IGNOREHEADER 1
    EMPTYASNULL
    BLANKSASNULL
    MAXERROR 100
    """
    )
except ClientError as error:
    print(error)
except Exception as e:
    print(e)

In [37]:
columns = [col for col in factCovid.columns if col != 'index']


select_cols = ({','.join(columns)})
select_sub = {','.join([f'sub.{col}' for col in columns])}

print(select_cols)
print(select_sub)

{'fips,province_state,country_region,confirmed,deaths,recovered,active,date,positive,negative,hospitalizedcurrently,hospitalized,hospitalizeddischarged'}
{'sub.fips,sub.province_state,sub.country_region,sub.confirmed,sub.deaths,sub.recovered,sub.active,sub.date,sub.positive,sub.negative,sub.hospitalizedcurrently,sub.hospitalized,sub.hospitalizeddischarged'}


In [39]:
# Inserting unique 'dimHospital' records using all columes exclusing 'index' to verify uniqueness
columns = [col for col in dimHospital.columns if col != 'index']

insert_dimHospital = f"""
insert into dimHospital ({','.join(columns)})
select {','.join([f'sub.{col}' for col in columns])}
from (
    select {','.join(columns)},
        row_number() over (partition by {','.join(columns)} order by index) as row_num
    from staging_dimHospital
) sub
where row_num = 1;
"""


# Inserting unique 'dimDate' records using all columes exclusing 'index' to verify uniqueness
columns = [col for col in dimDate.columns if col != 'index']

insert_dimDate = f"""
insert into dimDate ({','.join(columns)})
select {','.join([f'sub.{col}' for col in columns])}
from (
    select {','.join(columns)},
        row_number() over (partition by {','.join(columns)} order by index) as row_num
    from staging_dimDate
) sub
where row_num = 1;
"""


# Inserting unique 'dimRegion' records using all columes exclusing 'index' to verify uniqueness
columns = [col for col in dimRegion.columns if col != 'index']

insert_dimRegion = f"""
insert into dimRegion ({','.join(columns)})
select {','.join([f'sub.{col}' for col in columns])}
from (
    select {','.join(columns)},
        row_number() over (partition by {','.join(columns)} order by index) as row_num
    from staging_dimRegion
) sub
where row_num = 1;
"""


# Inserting unique 'factCovid' records using all columes exclusing 'index' to verify uniqueness
columns = [col for col in factCovid.columns if col != 'index']

insert_factCovid = f"""
insert into factCovid ({','.join(columns)})
select {','.join([f'sub.{col}' for col in columns])}
from (
    select {','.join(columns)},
    from staging_factCovid
) sub
where row_num = 1;
"""

In [40]:
try:
    cur.execute(insert_factCovid)
except Exception as e:
    print(e)

syntax error at or near "from" in context ",hospitalizeddischarged,
    from", at line 6, column 5
LINE 6:     from staging_factCovid
            ^



In [41]:
column_names = [desc[0] for desc in cur.description]

TypeError: 'NoneType' object is not iterable

In [None]:
rows =cur.fetchmany(10)
print(column_names)
for row in rows:
    print(row)

In [None]:
try:
    cur.execute("select * from users;")
except Exception as e:
    print("Unable to select from 'users' table")
    print(e)

In [None]:
rows =cur.fetchmany(10)
for row in rows:
    print(row)

In [None]:
"""
try:
    conn.close()
except psycopg2.Error as e:
    print(e)
"""

In [None]:
# redshift_client.delete_cluster(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER, SkipFinalClusterSnapshot=True)