In [2]:
import time
import configparser
from io import StringIO
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import boto3
import psycopg2
import pandas as pd
from botocore.exceptions import ClientError

In [3]:
config = configparser.ConfigParser()
config.read_file(open('covid19-analytics.config'))

KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

TARGET_S3 = config.get('S3', 'TARGET_S3')
TARGET_OUTPUT_S3 = config.get('S3', 'TARGET_OUTPUT_S3')
TARGET_OUTPUT_BUCKET=config.get('S3', 'TARGET_OUTPUT_BUCKET')
TARGET_OUTPUT_DIR=config.get('S3', 'TARGET_OUTPUT_DIR')
TARGET_REGION = config.get('S3', 'TARGET_REGION')
SCHEMA_NAME = config.get('GLUE', 'SCHEMA_NAME')
TMP_DIR = config.get('FILE_PATHS', 'TMP_DIR')

DWH_CLUSTER_TYPE = config.get('DWH', 'DWH_CLUSTER_TYPE')
DWH_NUM_NODES = config.get('DWH', 'DWH_NUM_NODES')
DWH_NODE_TYPE = config.get('DWH', 'DWH_NODE_TYPE')
DWH_CLUSTER_IDENTIFIER = config.get('DWH', 'DWH_CLUSTER_IDENTIFIER')
DWH_DB = config.get('DWH', 'DWH_DB')
DWH_DB_USER = config.get('DWH', 'DWH_DB_USER')
DWH_DB_PASSWORD = config.get('DWH', 'DWH_DB_PASSWORD')
DWH_PORT = config.get('DWH', 'DWH_PORT')
DWH_IAM_ROLE_NAME = config.get('DWH', 'DWH_IAM_ROLE_NAME')

In [4]:
pd.DataFrame(
    {
        'Param':
            [
                'DWH_CLUSTER_TYPE',
                'DWH_NUM_NODES',
                'DWH_NODE_TYPE',
                'DWH_CLUSTER_IDENTIFIER',
                'DWH_DB',
                'DWH_DB_USER',
                'DWH_DB_PASSWORD',
                'DWH_PORT',
                'DWH_IAM_ROLE_NAME'
            ],
        'Value':
            [
                DWH_CLUSTER_TYPE,
                DWH_NUM_NODES,
                DWH_NODE_TYPE,
                DWH_CLUSTER_IDENTIFIER,
                DWH_DB,
                DWH_DB_USER,
                DWH_DB_PASSWORD,
                DWH_PORT,
                DWH_IAM_ROLE_NAME

            ]
    }
)

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,single-node
1,DWH_NUM_NODES,1
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,covid19-redshift-cluster-1
4,DWH_DB,covid19-redshift-db-1
5,DWH_DB_USER,oseloka
6,DWH_DB_PASSWORD,Oseloka1
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,redshift-tutorial-s3-access-role


In [5]:
OUTPUT_S3_CLIENT = boto3.client(
    's3', 
    region_name=TARGET_REGION,
    aws_access_key_id=KEY, 
    aws_secret_access_key=SECRET
)

GLUE_CLIENT = boto3.client(
    'glue', 
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name=TARGET_REGION
)

ATHENA_CLIENT = boto3.client(
    'athena',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name=TARGET_REGION,
)

redshift_client = boto3.client(
    'redshift',
    region_name=TARGET_REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

ec2_client = boto3.resource(
    'ec2',
    region_name=TARGET_REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

iam_client = boto3.client(
    'iam',
    region_name=TARGET_REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)



In [6]:
redshiftToS3_roleArn = iam_client.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']


In [7]:
def list_athena_tables(client, database_name) -> list:
    tables = []
    paginator = client.get_paginator('get_tables')

    # Use paginator to handle potentially large number of tables
    for page in paginator.paginate(DatabaseName=database_name):
        for table in page['TableList']:
            tables.append(table['Name'])
            
    return tables

In [8]:
# Function to execute athena query and retrieve data in all tables
def query_athena_and_fetch_results(
        athena_client,
        s3_client, 
        database, 
        query,
        table,
        output_s3,
        output_dir,
        output_location,
        tmp_dir):
    
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={
            'OutputLocation': output_location,
            'EncryptionConfiguration': {'EncryptionOption': 'SSE_S3'},
        }
    )
    query_execution_id = response['QueryExecutionId']

    # Loop till query execution is complete
    while True:
        try:
            response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        except ClientError as e:
            print(f"\nQuery Error: \n{e}")
            
        state = response['QueryExecution']['Status']['State']
        if state == 'SUCCEEDED':
            print(f"\n{query_execution_id} query has completed successfuly")

            results_path = f'{output_dir}/{query_execution_id}.csv'
            local_filename = f'{tmp_dir}/{table}.csv'

            if Path(local_filename).exists():
                print(f"{local_filename} already exists, skip download")
            else:
                try:
                    s3_client.download_file(output_s3, results_path, local_filename)
                    print(f"\n{local_filename} downloaded successfuly")
                except ClientError as e:
                    print(f"Download Error: \n{e}")
                    
            try:
                s3_client.delete_objects(Bucket=output_s3, Delete={'Objects': [{'Key': results_path}, {'Key': f'{results_path}.metadata'}], 'Quiet': True})
            except ClientError as e:
                print(f"S3 cleanup Error: \n{e}")

            return
        
        elif state in ['FAILED', 'CANCELLED']:
            raise Exception(f"Query {state.lower()} with reason: {response['QueryExecution']['Status']['StateChangeReason']}")
        else:
            print(f"/n{query_execution_id} query is still running, waiting 3 seconds...")
            time.sleep(3)

In [9]:
tables = list_athena_tables(GLUE_CLIENT, SCHEMA_NAME)

In [10]:
def download_table_data(tables, max_workers):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_table = {
            executor.submit(
                query_athena_and_fetch_results,
                athena_client=ATHENA_CLIENT,
                s3_client=OUTPUT_S3_CLIENT, 
                database=SCHEMA_NAME,  
                query=f'SELECT * FROM "{TABLE}";',
                table=TABLE,
                output_s3=TARGET_OUTPUT_S3,
                output_dir=TARGET_OUTPUT_DIR,
                output_location=TARGET_OUTPUT_BUCKET,
                tmp_dir=TMP_DIR
            ): TABLE for TABLE in tables
        }
        for future in as_completed(future_to_table):
            table = future_to_table[future]
            try:
                future.result()
            except Exception as e:
                print(f"Failed to download csv for {table}: {str(e)}")


In [11]:
download_table_data(tables, max_workers=3)

/ne81c4c6e-07cd-4d34-a6ae-c1351bcaff46 query is still running, waiting 3 seconds...
/nf20f41af-6694-4598-9fb3-b91aafef7005 query is still running, waiting 3 seconds...
/nec818b20-ace3-4fb9-92cf-9415b9b56e25 query is still running, waiting 3 seconds...

e81c4c6e-07cd-4d34-a6ae-c1351bcaff46 query has completed successfuly
/workspaces/covid-19-data-pipeline-aws/covid-19-ETL/tmp/hospital-bedsjson.csv already exists, skip download
/nf20f41af-6694-4598-9fb3-b91aafef7005 query is still running, waiting 3 seconds...

ec818b20-ace3-4fb9-92cf-9415b9b56e25 query has completed successfuly
/workspaces/covid-19-data-pipeline-aws/covid-19-ETL/tmp/static-datacountrycode.csv already exists, skip download
/n91db549b-93e2-4208-9efd-3f1d99881fd2 query is still running, waiting 3 seconds...
/ne0c4e8b5-bdf7-440e-b5fe-66465c6a516e query is still running, waiting 3 seconds...

f20f41af-6694-4598-9fb3-b91aafef7005 query has completed successfuly
/workspaces/covid-19-data-pipeline-aws/covid-19-ETL/tmp/enigma_jh

In [12]:
enigma_jhu = pd.read_csv(f'{TMP_DIR}/enigma_jhu.csv')

testing_data_states_daily = pd.read_csv(f'{TMP_DIR}/testing-datastates_daily.csv')


In [13]:
factCovid_1 = enigma_jhu[['fips', 'province_state', 'country_region', 'confirmed', 'deaths', 'recovered', 'active' ]]
factCovid_2 = testing_data_states_daily[['fips', 'date', 'positive', 'negative', 'hospitalizedcurrently', 'hospitalized', 'hospitalizeddischarged' ]]
factCovid = pd.merge(factCovid_1, factCovid_2, on='fips', how='inner')

In [14]:
dimHospital = pd.read_csv(f'{TMP_DIR}/hospital-bedsjson.csv')
dimHospital =  dimHospital[['fips', 'state_name', 'latitude', 'longtitude', 'hq_address', 'hospital_name', 'hospital_type', 'hq_city', 'hq_state']]

In [15]:
dimDate = pd.read_csv(f'{TMP_DIR}/testing-datastates_daily.csv')
dimDate = dimDate[['fips', 'date']]

In [16]:
dimDate['date'] = pd.to_datetime(dimDate['date'], format='%Y%m%d')
dimDate['year'] = dimDate['date'].dt.year
dimDate['month'] = dimDate['date'].dt.month
dimDate["day_of_week"] = dimDate['date'].dt.dayofweek

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
spark = SparkSession.builder.master("local[*]").appName("Join").getOrCreate()

24/08/08 17:59:35 WARN Utils: Your hostname, codespaces-595706 resolves to a loopback address: 127.0.0.1; using 10.0.0.100 instead (on interface eth0)
24/08/08 17:59:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/08 17:59:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [18]:
enigma_jhu = spark.read.csv(
    f'{TMP_DIR}/enigma_jhu.csv', 
    header=True, 
    inferSchema=True
)

ny_times_us_county = spark.read.csv(
    f'{TMP_DIR}/us_county.csv', 
    header=True, 
    inferSchema=True
)

                                                                                

In [19]:
dimRegion_1 = enigma_jhu.select('fips', 'province_state', 'country_region', 'latitude', 'longitude')
dimRegion_2 = ny_times_us_county.select('fips', 'county', 'state')

dimRegion_1 = dimRegion_1.repartition(4, 'fips')
dimRegion_2 = dimRegion_2.repartition(4, 'fips')
dimRegion_2 = dimRegion_2.withColumnRenamed('fips', 'fips2')

24/08/08 17:59:52 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [20]:
dimRegion = dimRegion_1.join(
    dimRegion_2, 
    dimRegion_1["fips"] == dimRegion_2["fips2"], 
    "inner"
)

In [21]:
dimRegion = dimRegion.drop('fips2')

In [22]:
dimRegion.coalesce(1).write.csv(f'{TMP_DIR}/dimRegion.csv', header=True)

                                                                                

In [23]:
%mv {TMP_DIR}/dimRegion.csv/part-00000* {TMP_DIR}/dimRegions.csv

In [24]:
%rm -r -f {TMP_DIR}/dimRegion.csv

In [25]:
csv_buffer = StringIO()

In [26]:
factCovid.to_csv(csv_buffer)

OUTPUT_S3_CLIENT.put_object(
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/factCovid.csv',
    Body=csv_buffer.getvalue(),
    ContentType='text/csv'
)

{'ResponseMetadata': {'RequestId': 'CBJX6WWKBMR5XQW4',
  'HostId': 'ZGwm+CzQH6aihz50vsCY9SOs6Hv7EI2w6M+4hzcBAzqS7RbfTITYl1zstuUpiwjXtXsUKad0sX9+jqBpVNh7kA==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'ZGwm+CzQH6aihz50vsCY9SOs6Hv7EI2w6M+4hzcBAzqS7RbfTITYl1zstuUpiwjXtXsUKad0sX9+jqBpVNh7kA==',
   'x-amz-request-id': 'CBJX6WWKBMR5XQW4',
   'date': 'Thu, 08 Aug 2024 18:00:29 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"9e84c6b6a22cee482ae52a6000e415f0"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"9e84c6b6a22cee482ae52a6000e415f0"',
 'ServerSideEncryption': 'AES256'}

In [27]:
dimHospital.to_csv(csv_buffer)

OUTPUT_S3_CLIENT.put_object(
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimHospital.csv',
    Body=csv_buffer.getvalue(),
    ContentType='text/csv'
)

{'ResponseMetadata': {'RequestId': 'GDC26NWKVBCGJ29S',
  'HostId': 'nVU0FJhN3THhVaKj4yTpFzwp1n21p+7/pMmyWnc+K+Xt25m/t3otIH3Cpt9er4PHhF8wqUwgPSM1KPCFQ2c3PQ==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'nVU0FJhN3THhVaKj4yTpFzwp1n21p+7/pMmyWnc+K+Xt25m/t3otIH3Cpt9er4PHhF8wqUwgPSM1KPCFQ2c3PQ==',
   'x-amz-request-id': 'GDC26NWKVBCGJ29S',
   'date': 'Thu, 08 Aug 2024 18:00:33 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"b598ab67bb0fa5268bf9a99a62622af9"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"b598ab67bb0fa5268bf9a99a62622af9"',
 'ServerSideEncryption': 'AES256'}

In [28]:
dimDate.to_csv(csv_buffer)

OUTPUT_S3_CLIENT.put_object(
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimDate.csv',
    Body=csv_buffer.getvalue(),
    ContentType='text/csv'
)

{'ResponseMetadata': {'RequestId': 'DV82NNBSGENM0429',
  'HostId': 'vOwxx0YSIuqqqtPV2TLMEYco43k/WS8syogV5OEhhvx2orq78FXd+v3FnKaToGTUoCUbecPeEi9YrKQsrYnaeQ==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'vOwxx0YSIuqqqtPV2TLMEYco43k/WS8syogV5OEhhvx2orq78FXd+v3FnKaToGTUoCUbecPeEi9YrKQsrYnaeQ==',
   'x-amz-request-id': 'DV82NNBSGENM0429',
   'date': 'Thu, 08 Aug 2024 18:00:37 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"82acca58660c7b1b726c6874ad1aced0"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"82acca58660c7b1b726c6874ad1aced0"',
 'ServerSideEncryption': 'AES256'}

In [29]:
OUTPUT_S3_CLIENT.upload_file(
    f'{TMP_DIR}/dimRegions.csv',
    Bucket=TARGET_OUTPUT_S3,
    Key=f'{TARGET_OUTPUT_DIR}/dimRegions.csv',
)

In [30]:
%rm -r -f {TMP_DIR}/* # Cleanup tmp directory

In [31]:
# Construct CREATE TABLE SQL dynamically from pandas dataframe
factCovid_sql = f"{pd.io.sql.get_schema(factCovid.reset_index(), 'factCovid')};"
dimHospital_sql = f"{pd.io.sql.get_schema(dimHospital.reset_index(), 'dimHospital')};"
dimDate_sql = f"{pd.io.sql.get_schema(dimDate.reset_index(), 'dimDate')};"
print(dimDate_sql)

# Prepare spark datafram dimRegion to match redshift database table structure
# Define a spark window specificaton
window_spec = Window.orderBy("fips")

# Add a sequential index column
dimRegion = dimRegion.withColumn("index", row_number().over(window_spec))

# Reorder columns to have index as first column
dimRegion = dimRegion.select(["index"] + [col for col in dimRegion.columns if col != "index"])

# Mapping Spark data types to SQL data types
data_type_mapping = {
    "STRING": "TEXT",
    "DOUBLE": "REAL",
    "LONG": "INTEGER",
    "INT": "INTEGER"
}

# Construct CREATE TABLE SQL dynamically from spark dataframe
columns = ",\n".join(
    [f'"{field.name}" {data_type_mapping.get(field.dataType.simpleString().upper(), "TEXT")}' for field in dimRegion.schema.fields]
)
dimRegion_sql = f'CREATE TABLE "dimRegion" (\n{columns}\n);'
print(dimRegion_sql)

CREATE TABLE "dimDate" (
"index" INTEGER,
  "fips" INTEGER,
  "date" TIMESTAMP,
  "year" INTEGER,
  "month" INTEGER,
  "day_of_week" INTEGER
);
CREATE TABLE "dimRegion" (
"index" INTEGER,
"fips" INTEGER,
"province_state" TEXT,
"country_region" TEXT,
"latitude" REAL,
"longitude" REAL,
"county" TEXT,
"state" TEXT
);


In [32]:
def pretty_redshift_props(props):
    pd.set_option('display.max.colwidth', 0)
    keysToShow = ['ClusterIdentifier', 'ClusterStatus', 'NodeType', 'NumberOfNodes', 'DBName', 'MasterUsername', 'Endpoint', 'VpcId']
    x = [(k, v) for k, v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=['Parameter', 'value'])

In [33]:
# Method implements retries while obtaining redshift properties in case creating cluster is yet complete
def get_redshift_props(redshift_client, cluster_identifier):
    retries = 30
    retry_delay = 30 # Delay between retries in seconds
    for attempt in range(retries):
        try:
            clusterProps = redshift_client.describe_clusters(ClusterIdentifier=cluster_identifier)['Clusters'][0]
            return clusterProps
        except redshift_client.exceptions.ClusterNotFoundFault as e:
            if attempt < retries -1:
                print(f"Cluster '{cluster_identifier}' not found. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                raise e # Raise the last exception if the retries are exhausted

In [34]:
redshift_client.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]

{'ClusterIdentifier': 'covid19-redshift-cluster-1',
 'NodeType': 'dc2.large',
 'ClusterStatus': 'available',
 'ClusterAvailabilityStatus': 'Available',
 'MasterUsername': 'oseloka',
 'DBName': 'covid19-redshift-db-1',
 'Endpoint': {'Address': 'covid19-redshift-cluster-1.covkciolfldm.us-east-2.redshift.amazonaws.com',
  'Port': 5439},
 'ClusterCreateTime': datetime.datetime(2024, 8, 8, 16, 45, 51, 892000, tzinfo=tzlocal()),
 'AutomatedSnapshotRetentionPeriod': 1,
 'ManualSnapshotRetentionPeriod': -1,
 'ClusterSecurityGroups': [],
 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-019036ec555f0d73b',
   'Status': 'active'}],
 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
   'ParameterApplyStatus': 'in-sync'}],
 'ClusterSubnetGroupName': 'default',
 'VpcId': 'vpc-0af307b31fe59e41a',
 'AvailabilityZone': 'us-east-2b',
 'PreferredMaintenanceWindow': 'thu:06:00-thu:06:30',
 'PendingModifiedValues': {},
 'ClusterVersion': '1.0',
 'AllowVersionUpgrade': True,
 'Numbe

In [35]:
clusterProps = get_redshift_props(redshift_client, DWH_CLUSTER_IDENTIFIER)
if clusterProps:
    prettyClusterProps = pretty_redshift_props(clusterProps)

In [36]:
prettyClusterProps

Unnamed: 0,Parameter,value
0,ClusterIdentifier,covid19-redshift-cluster-1
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,oseloka
4,DBName,covid19-redshift-db-1
5,Endpoint,"{'Address': 'covid19-redshift-cluster-1.covkciolfldm.us-east-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0af307b31fe59e41a
7,NumberOfNodes,1


In [37]:
DWH_ENDPOINT = clusterProps['Endpoint']['Address']
DWH_ROLE_ARN = clusterProps['IamRoles'][0]['IamRoleArn']
DB_NAME = clusterProps['DBName']
DB_USER = clusterProps['MasterUsername']

In [38]:
try:
    vpc = ec2_client.Vpc(id=clusterProps['VpcId'])
    default_SG = list(vpc.security_groups.all())[0]
    print(default_SG)

    default_SG.authorize_ingress(
        GroupName=default_SG.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT),
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-019036ec555f0d73b')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


In [39]:
try:
    conn = psycopg2.connect(
        host=DWH_ENDPOINT,
        dbname=DB_NAME,
        user=DB_USER,
        password=DWH_DB_PASSWORD,
        port=int(DWH_PORT)
    )
except Exception as e:
    print(e)

conn.set_session(autocommit=True)

In [40]:
try:
    cur = conn.cursor()
except Exception as e:
    print("Error: Could not obtain database cursor")
    print(e)

In [42]:
try:
    cur.execute(factCovid_sql)
    cur.execute(dimHospital_sql)
    cur.execute(dimDate_sql)
    cur.execute(dimRegion_sql)
except Exception as e:
    print(e)

In [43]:
try:
    cur.execute(
    f"""
    copy factCovid 
    from '{TARGET_OUTPUT_BUCKET}/dimHospital.csv'
    credentials 'aws_iam_role={redshiftToS3_roleArn}'
    delimiter ','
    region '{TARGET_REGION}'
    IGNOREHEADER 1
    """
    )
except Exception as e:
    print("Unable to copy 'factCovid' data from s3 into redshift")
    print(e)

Unable to copy 'factCovid' data from s3 into redshift
The specified S3 prefix 'output//dimHospital.csv' does not exist
DETAIL:  
  -----------------------------------------------
  error:  The specified S3 prefix 'output//dimHospital.csv' does not exist
  code:      8001
  context:   
  query:     2822
  location:  s3_utility.cpp:717
  process:   padbmaster [pid=1073799386]
  -----------------------------------------------




In [None]:
try:
    cur.execute("select * from users;")
except Exception as e:
    print("Unable to select from 'users' table")
    print(e)

In [None]:
rows =cur.fetchmany(10)
for row in rows:
    print(row)

In [None]:
try:
    conn.close()
except psycopg2.Error as e:
    print(e)

In [None]:
# redshift_client.delete_cluster(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER, SkipFinalClusterSnapshot=True)