# Project 3 - Data Warehouse on AWS (Redshift)
## Infrastructure as Code - Create Redshift Cluster

In [1]:
import pandas as pd
import boto3
import json
import configparser


#### Load Params from file 'dwh.cfg'
###### (mixed configuration file for Cluster creation and cluster connection)

In [16]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

## AWS credentials ##
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

# Cluster specific configuration parameters
CLUSTER_TYPE           = config.get("CLUSTER-DETAILS","CLUSTER_TYPE")
NUM_NODES              = config.get("CLUSTER-DETAILS","NUM_NODES")
NODE_TYPE              = config.get("CLUSTER-DETAILS","NODE_TYPE")
REGION_NAME            = config.get("CLUSTER-DETAILS","REGION_NAME")
CLUSTER_IDENTIFIER     = config.get("CLUSTER-DETAILS","CLUSTER_IDENTIFIER")

# Details für Redshift connection
DB_NAME                = config.get("CLUSTER","DB_NAME")
DB_USER                = config.get("CLUSTER","DB_USER")
DB_PASSWORD            = config.get("CLUSTER","DB_PASSWORD")
DB_PORT                = config.get("CLUSTER","DB_PORT")
DB_HOST                = config.get("CLUSTER","DB_HOST") #this value exists AFTER Cluster creation !!!

# IAM_Role
IAM_ROLE_NAME         = config.get("IAM_ROLE", "IAM_ROLE_NAME")
ARN                   = config.get("IAM_ROLE", "ARN")  #this value exists AFTER IAM Role creation !!!

pd.DataFrame({"Param":
                  ["CLUSTER_TYPE", "NUM_NODES", "NODE_TYPE", "CLUSTER_IDENTIFIER", "DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT", "IAM_ROLE_NAME", "REGION_NAME"],
              "Value":
                  [CLUSTER_TYPE, NUM_NODES, NODE_TYPE, CLUSTER_IDENTIFIER, DB_NAME, DB_USER, DB_PASSWORD, DB_PORT, IAM_ROLE_NAME, REGION_NAME]
             })

Unnamed: 0,Param,Value
0,CLUSTER_TYPE,multi-node
1,NUM_NODES,4
2,NODE_TYPE,dc2.large
3,CLUSTER_IDENTIFIER,dwhCluster
4,DB_NAME,dwh
5,DB_USER,dwhuser
6,DB_PASSWORD,Passw0rd
7,DB_PORT,5439
8,IAM_ROLE_NAME,dwhRole
9,REGION_NAME,eu-central-1


#### Create clients for EC2, S3, IAM and Redshift

In [3]:
ec2 = boto3.resource('ec2',
                       region_name=REGION_NAME,
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

s3 = boto3.resource('s3',
                       region_name=REGION_NAME,
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

iam = boto3.client('iam',
                       region_name=REGION_NAME,
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                  )

redshift =  boto3.client('redshift',
                        region_name=REGION_NAME,
                        aws_access_key_id=KEY,
                        aws_secret_access_key=SECRET
                  )


### 1. IAM ROLE
#### 1.1 Creation of an IAM Role makes Redshift able to access S3 bucket (ReadOnly)

In [5]:
try:
    print("1.1 Creating a new IAM Role")
    dwhRole = iam.create_role(
        Path='/',
        RoleName=IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )
except Exception as e:
    print(e)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhRole already exists.


#### 1.2 Attach Policy to Role

In [6]:
print("1.2 Attaching Policy")
try:
    iam.attach_role_policy(RoleName=IAM_ROLE_NAME,
                           PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                          )['ResponseMetadata']['HTTPStatusCode']
except Exception as e:
    print(e)


1.2 Attaching Policy


#### 1.3 Get and print the IAM role ARN

In [2]:
print('1.3 Get the IAM role ARN')
try:
    roleArn = iam.get_role(RoleName=IAM_ROLE_NAME)['Role']['Arn']
    print(roleArn)

except Exception as e:
    print(e)

1.3 Get the IAM role ARN
name 'iam' is not defined


### 2:  Creation of the Redshift Cluster

In [10]:
try:
    response = redshift.create_cluster(
        # parameters for hardware
        ClusterType=CLUSTER_TYPE,
        NodeType=NODE_TYPE,
        NumberOfNodes=int(NUM_NODES),

        # parameters for identifiers & credentials
        DBName=DB_NAME,
        ClusterIdentifier=CLUSTER_IDENTIFIER,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,

        # parameter for role (to allow s3 access)
        IamRoles=[roleArn]
    )
except Exception as e:
    print(e)

#### 2.1 *Describe* the cluster to see its status
<font color='red'>
run this block several times until the cluster status becomes `Available`
<font/>

In [3]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

NameError: name 'redshift' is not defined

#### 2.2 Take note of the cluster <font color='red'> endpoint and role ARN </font> </h2>

Status, Host and Role ARN of created cluster

In [4]:
CLUSTERSTATUS = myClusterProps['ClusterStatus']
print("STATUS OF CLUSTER  :: ", CLUSTERSTATUS )


HOST = myClusterProps['Endpoint']['Address']
ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("HOST               :: ", HOST)
print("ROLE_ARN           :: ", ROLE_ARN)

NameError: name 'myClusterProps' is not defined

### 3: Open an incoming TCP port to access the cluster ednpoint (host)

In [5]:
# create inbound port settings for redshift
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)

    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name ,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DB_PORT),
        ToPort=int(DB_PORT)
    )
except Exception as e:
    print(e)


name 'ec2' is not defined


### 4: Testing the connection to the cluster

In [6]:
%load_ext sql

conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME)
print(conn_string)

%sql $conn_string

NameError: name 'DB_USER' is not defined

#### 4.1 Execute some SQL statements for testing Redshift connectivity

In [7]:
# test table creation --> insert --> select --> drop
%sql drop table if exists test_table;
%sql create table test_table (name text);
%sql insert into test_table (name) values ('FirstName');
%sql insert into test_table (name) values ('LastName');
%sql select * from test_table;

Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])
Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])
Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])
Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])
Environment variable

#### 4.2 Clean up test_table

In [8]:
%sql drop table test_table;


Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])


## STEP 5: Clean up resources

<b><font color='red'>DO NOT RUN THIS UNLESS YOU ARE SURE <br/>
    Clean up these resources ONLY if not needed anymore </span></b>

In [None]:
#### CAREFUL!!
# delete inbound port settings for redshift
try:
     defaultSg.revoke_ingress(
        GroupName=defaultSg.group_name ,  # TODO: fill out
        CidrIp='0.0.0.0/0',  # TODO: fill out
        IpProtocol='TCP',  # TODO: fill out
        FromPort=int(DB_PORT),
        ToPort=int(DB_PORT)
     )
except Exception as e:
    print(e)
#### CAREFUL!!

#### delete redshift cluster

In [None]:
#### CAREFUL!!
try:
    redshift.delete_cluster( ClusterIdentifier=CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
except Exception as e:
    print(e)
#### CAREFUL!!


#### check status of deletion

In [9]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

NameError: name 'redshift' is not defined

#### detach and delete IAM Role

In [None]:
try:
    iam.detach_role_policy(RoleName=IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
    iam.delete_role(RoleName=IAM_ROLE_NAME)
except Exception as e:
    print(e)