# Provision the AWS resources required for the ELT pipeline

0. Prerequisites and clients for `IAM`, `EC2`, `S3` and `Redshift`
1. Create **IAM role** for authorizing Redshift to read from S3
2. Create **Redshift cluster** with attached IAM role
3. Create **inbound traffic rule** via VPC and security group
4. Verify Redshift **connection** and **security settings**
5. [Optional] **Decommission** resources

In [8]:
import configparser
import json

import boto3

## 0.a Prerequisites

- Create an IAM user with credentials for programmatic access to AWS

In [9]:
# Parse the cluster settings from the config file one directory up.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
config = configparser.ConfigParser()
with open("../redshift.cfg") as config_file:
    config.read_file(config_file)

## 0.b Load configuration

In [10]:
# credentials for boto3 clients ([AWS] section)
KEY, SECRET, REGION = (
    config.get("AWS", option) for option in ("KEY", "SECRET", "REGION")
)

# cluster config ([CLUSTER] section); values are kept as strings here
CLUSTER_IDENTIFIER, CLUSTER_TYPE, NODE_TYPE, NUM_NODES = (
    config.get("CLUSTER", option)
    for option in ("CLUSTER_IDENTIFIER", "CLUSTER_TYPE", "NODE_TYPE", "NUM_NODES")
)

# database config
DB_NAME, DB_USER, DB_PASSWORD, DB_PORT = (
    config.get("CLUSTER", option)
    for option in ("DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT")
)

# security and authorization
IAM_ROLE_NAME, VPC_INBOUND_TPC_CIDR = (
    config.get("CLUSTER", option)
    for option in ("IAM_ROLE_NAME", "VPC_INBOUND_TPC_CIDR")
)

## 0.c Create clients

In [11]:
# All four handles use the same credentials and the configured REGION.
# The IAM client previously hard-coded 'us-west-2', which silently ignored
# the REGION setting from the config file.

# EC2 resource: used to look up the cluster's VPC and security group
ec2 = boto3.resource(
    'ec2',
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

# S3 resource
s3 = boto3.resource(
    's3',
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

# IAM client: used to create the role and attach the policy
iam = boto3.client(
    'iam',
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

# Redshift client: used to create, describe and delete the cluster
redshift = boto3.client(
    'redshift',
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

## 1. Create **IAM role** for authorizing Redshift to read from S3

In [40]:
# Trust policy: allows the Redshift service to assume the role.
redshift_trust_policy = {
    'Statement': [{
        'Action': 'sts:AssumeRole',
        'Effect': 'Allow',
        'Principal': {'Service': 'redshift.amazonaws.com'},
    }],
    'Version': '2012-10-17',
}

# Create the IAM role under the configured name with that trust policy.
dwhRole = iam.create_role(
    Path='/',
    RoleName=IAM_ROLE_NAME,
    Description="Allows Redshift clusters to call AWS services on your behalf.",
    AssumeRolePolicyDocument=json.dumps(redshift_trust_policy),
)

In [None]:
# Give the role read-only access to S3 via the AWS-managed policy.
s3_read_only_policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
iam.attach_role_policy(
    RoleName=IAM_ROLE_NAME,
    PolicyArn=s3_read_only_policy_arn,
)

# Look the role up again and keep its ARN for attaching it to the cluster.
role_description = iam.get_role(RoleName=IAM_ROLE_NAME)
IAM_ROLE_ARN = role_description['Role']['Arn']

## 2. Create **Redshift cluster** with attached IAM role

In [52]:
# Create the Redshift cluster from the loaded configuration.
response = redshift.create_cluster(

    # hardware
    ClusterType=CLUSTER_TYPE,
    NodeType=NODE_TYPE,
    NumberOfNodes=int(NUM_NODES),

    # identifiers and credentials
    DBName=DB_NAME,
    ClusterIdentifier=CLUSTER_IDENTIFIER,
    MasterUsername=DB_USER,
    MasterUserPassword=DB_PASSWORD,

    # Listen on the configured port. The inbound rule and the connection
    # strings further down both use DB_PORT, so the cluster must use it too
    # instead of silently falling back to the service default.
    Port=int(DB_PORT),

    # role for reading from s3
    IamRoles=[IAM_ROLE_ARN]
)

In [5]:
# Fetch the description of the configured cluster (first entry of 'Clusters').
described_clusters = redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)
cluster_props = described_clusters['Clusters'][0]

# Host name for SQL connections and the ARN of the first attached IAM role.
DWH_ENDPOINT = cluster_props['Endpoint']['Address']
DWH_ROLE_ARN = cluster_props['IamRoles'][0]['IamRoleArn']

for label, value in (("DWH_ENDPOINT :: ", DWH_ENDPOINT),
                     ("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)):
    print(label, value)

DWH_ENDPOINT ::  redshifts3dev.cmjyoa1m3fts.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::898125645366:role/RedshiftS3Role


In [20]:
# Optional manual helpers, intentionally left commented out:
# uncomment one line to pause or to resume the configured cluster.
# redshift.pause_cluster(ClusterIdentifier=CLUSTER_IDENTIFIER)
# redshift.resume_cluster(ClusterIdentifier=CLUSTER_IDENTIFIER)

In [None]:
# display the raw describe_clusters response for the configured cluster
redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)

## 3. Create **inbound traffic rule** via VPC and security group

In [8]:
# boto3 resource for the VPC the cluster was placed in
cluster_vpc_id = cluster_props["VpcId"]
vpc = ec2.Vpc(id=cluster_vpc_id)

# boto3 resource for the first VPC security group attached to the cluster
first_security_group = cluster_props["VpcSecurityGroups"][0]
default_security_group = ec2.SecurityGroup(
    id=first_security_group["VpcSecurityGroupId"]
)

In [None]:
# Add an inbound rule that allows TCP traffic from the configured CIDR range
# on the database port.
# The SecurityGroup resource already identifies the group by its id, so no
# GroupName is passed: names are only accepted for default-VPC groups and
# sending one would make the call fail for a group in any other VPC.
default_security_group.authorize_ingress(
    CidrIp=VPC_INBOUND_TPC_CIDR,
    IpProtocol="TCP",
    FromPort=int(DB_PORT),
    ToPort=int(DB_PORT)
)

In [None]:
# Resource attributes are cached after their first load, so refresh them
# before displaying the group's current inbound rules.
default_security_group.reload()
default_security_group.ip_permissions

## 4. Verify Redshift **connection** and **security settings**

In [10]:
%reload_ext sql

In [143]:
# wrong TCP settings
wrong_conn_string_port = "postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, DWH_ENDPOINT, 111, DB_NAME)
%sql $wrong_conn_string_port

(psycopg2.OperationalError) connection to server at "redshifts3dev.cmjyoa1m3fts.us-west-2.redshift.amazonaws.com" (35.82.48.87), port 100 failed: Operation timed out
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/14/e3q8)
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys(['postgresql://redshift_db_dev_user:***@redshifts3dev.cmjyoa1m3fts.us-west-2.redshift.amazonaws.com:5439/redshift_db_dev'])


In [11]:
# wrong password
wrong_conn_string_pw = "postgresql://{}:{}@{}:{}/{}".format(DB_USER, "foo", DWH_ENDPOINT, DB_PORT, DB_NAME)
%sql $wrong_conn_string_pw

(psycopg2.OperationalError) connection to server at "redshifts3dev.cmjyoa1m3fts.us-west-2.redshift.amazonaws.com" (35.82.48.87), port 5439 failed: FATAL:  password authentication failed for user "redshift_db_dev_user"
connection to server at "redshifts3dev.cmjyoa1m3fts.us-west-2.redshift.amazonaws.com" (35.82.48.87), port 5439 failed: FATAL:  password authentication failed for user "redshift_db_dev_user"

(Background on this error at: https://sqlalche.me/e/14/e3q8)
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])


In [12]:
# correct credentials and TCP settings
correct_conn_string = "postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, DWH_ENDPOINT, DB_PORT, DB_NAME)
%sql $correct_conn_string

In [21]:
%sql SELECT * FROM pg_database

 * postgresql://redshift_db_dev_user:***@redshifts3dev.cmjyoa1m3fts.us-west-2.redshift.amazonaws.com:5439/redshift_db_dev
5 rows affected.


datname,datdba,encoding,datistemplate,datallowconn,datlastsysoid,datvacuumxid,datfrozenxid,dattablespace,datconfig,datacl
dev,1,6,False,True,101706,0,0,1663,,
redshift_db_dev,100,6,False,True,101706,1054,1054,1663,,
padb_harvest,1,6,False,True,101706,0,0,1663,,
template1,1,6,True,True,101706,1054,1054,1663,,{rdsdb=CT/rdsdb}
template0,1,6,True,False,101706,1054,1054,1663,,{rdsdb=CT/rdsdb}


## 5. [Optional] **Decommission** resources

<b><font color='red'>DO NOT RUN THIS UNLESS YOU ARE SURE <br/> 
    We will be using these resources in the next exercises</font></b>

In [None]:
# Delete the configured cluster; no final snapshot is requested.
redshift.delete_cluster(
    ClusterIdentifier=CLUSTER_IDENTIFIER,
    SkipFinalClusterSnapshot=True,
)

In [None]:
# display the describe_clusters response for the cluster after the delete request
redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)

In [None]:
# Detach the S3 read-only policy first, then delete the role itself.
s3_read_only_policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
iam.detach_role_policy(
    RoleName=IAM_ROLE_NAME,
    PolicyArn=s3_read_only_policy_arn,
)
iam.delete_role(RoleName=IAM_ROLE_NAME)