## Sparkify Control Point

In [9]:
# First, we create a Redshift cluster for the project on AWS.
# We use infrastructure as code (IaC) to provision everything.

# importing boto3, AWS python SDK
import boto3
from botocore.exceptions import ClientError

import configparser
import json
import pandas as pd

In [22]:
# Extracting config variables
# dwh.cfg is opened in a context manager so the file handle is closed as soon
# as parsing finishes instead of lingering until garbage collection.
config = configparser.ConfigParser()
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# AWS credentials come from the [USER] section; they are passed to every boto3
# client/resource created below.
KEY = config.get('USER', 'KEY')
SECRET = config.get('USER', 'SECRET')

In [23]:
# Cluster settings, all read from the [CLUSTER] section of the parsed config.
DWH_ROLE_NAME = config.get('CLUSTER', 'DWH_ROLE_NAME')
DWH_DB_NAME = config.get('CLUSTER', 'DWH_DB_NAME')
DWH_CLUSTER_ID = config.get('CLUSTER', 'DWH_CLUSTER_ID')
DWH_NODE_TYPE = config.get('CLUSTER', 'DWH_NODE_TYPE')
DWH_USER_NAME = config.get('CLUSTER', 'DWH_USER_NAME')
DWH_USER_PASSWORD = config.get('CLUSTER', 'DWH_USER_PASSWORD')
# NOTE: this name is spelled with the digit zero ("0F"), both here and in the
# config key; keep the two in sync.
DWH_NUMBER_0F_NODES = config.getint('CLUSTER', 'DWH_NUMBER_0F_NODES')
DWH_PORT = config.getint('CLUSTER', 'DWH_PORT')

# Summary table of the settings; the user name and password are not included.
displayed_settings = {
    'DWH_ROLE_NAME': DWH_ROLE_NAME,
    'DWH_DB_NAME': DWH_DB_NAME,
    'DWH_CLUSTER_ID': DWH_CLUSTER_ID,
    'DWH_NODE_TYPE': DWH_NODE_TYPE,
    'DWH_NUMBER_0F_NODES': DWH_NUMBER_0F_NODES,
    'DWH_PORT': DWH_PORT,
}
variables = pd.DataFrame({
    'keys': list(displayed_settings.keys()),
    'values': list(displayed_settings.values()),
})

variables

Unnamed: 0,keys,values
0,DWH_ROLE_NAME,redshift_s3_readonly
1,DWH_DB_NAME,sparkifydb
2,DWH_CLUSTER_ID,sparkify-cluster
3,DWH_NODE_TYPE,dc2.large
4,DWH_NUMBER_0F_NODES,4
5,DWH_PORT,5439


### Create IAM role for Redshift cluster
This role grants Redshift the `AmazonS3ReadOnlyAccess` policy.

In [16]:
# Instantiating IAM client
iam = boto3.client('iam', region_name='us-east-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

# Create the role, or fetch it if a previous run already created it, so that
# `iam_role` is always bound when this cell finishes without raising.
try:
    print('Creating IAM role for Redshift cluster...')
    iam_role = iam.create_role(
        RoleName=DWH_ROLE_NAME,
        # Trust policy: allows the Redshift service to assume this role.
        AssumeRolePolicyDocument=json.dumps({
            'Statement': [{
                'Action': 'sts:AssumeRole',
                'Effect': 'Allow',
                'Principal': {'Service': 'redshift.amazonaws.com'}
            }],
            'Version': '2012-10-17'
        }),
        Description='Allows Redshift cluster to call AWS services on you behalf',
    )
    print('Role creation successful!')
except ClientError as e:
    # Only ClientError carries `.response`; catching it specifically keeps
    # unrelated exceptions from being masked by an AttributeError here.
    if e.response['Error']['Code'] != 'EntityAlreadyExists':
        # Any other AWS error leaves `iam_role` undefined, so fail here rather
        # than with a confusing NameError in a later cell.
        raise
    iam_role = iam.get_role(RoleName=DWH_ROLE_NAME)
    print('Role gotten')

# Attach the S3 read-only policy whether the role was just created or already
# existed, so a role left over from a partially failed run still ends up with it.
iam.attach_role_policy(
    RoleName=DWH_ROLE_NAME,
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess'
)
print('Role policy attached successfully!')

Creating IAM role for Redshift cluster...
Role gotten


In [2]:
# Pull the role's ARN out of the create_role/get_role response; it is handed to
# the cluster through the IamRoles argument of create_cluster.
role_arn = iam_role['Role']['Arn']
role_arn

### Build the Redshift cluster

In [18]:
# Instantiating redshift client
redshift = boto3.client('redshift', region_name='us-east-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

# Request the cluster using the settings read from the config file.
try:
    print('Creating Redshift cluster...')
    redshift_cluster = redshift.create_cluster(
        DBName=DWH_DB_NAME,
        ClusterIdentifier=DWH_CLUSTER_ID,
        NodeType=DWH_NODE_TYPE,
        MasterUsername=DWH_USER_NAME,
        MasterUserPassword=DWH_USER_PASSWORD,
        NumberOfNodes=DWH_NUMBER_0F_NODES,
        # Pass the configured port explicitly; without it the DWH_PORT setting
        # is ignored and the service default is used.
        Port=DWH_PORT,
        # Role the cluster uses to read from S3.
        IamRoles=[
            role_arn,
        ]
    )
    print('Redshift cluster creation successful!')
except ClientError as e:
    # Best effort for AWS-side errors (e.g. the cluster already exists): report
    # and carry on. Anything that is not an AWS API error now propagates
    # instead of being silently printed.
    print(e)

Creating Redshift cluster...
Redshift cluster creation successful!


In [21]:
# Checking cluster availability status
# Block until the cluster reports itself available. The cells below read
# cluster_props['Endpoint'], so describing the cluster any earlier would force
# this cell to be re-run by hand.
redshift.get_waiter('cluster_available').wait(ClusterIdentifier=DWH_CLUSTER_ID)

cluster_props = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_ID)['Clusters'][0]
cluster_props['ClusterAvailabilityStatus']

'Available'

In [3]:
# Read the connection details from the cluster description.
# DWH_PORT is rebound here to the port reported by the cluster, replacing the
# value that was read from the config file.
endpoint_info = cluster_props['Endpoint']
DWH_ENDPOINT = endpoint_info['Address']
DWH_PORT = int(endpoint_info['Port'])
print('Endpoint: {}\nPort: {}'.format(DWH_ENDPOINT, DWH_PORT))

In [4]:
# cluster_vars = pd.DataFrame({
#     'keys':['ClusterIdentifier', 'NodeType', 'ClusterStatus', 'Endpoint:Address', 'Endpoint:Port', 'IamRole', 'Vpc', 'NumberOfNodes'], 
#     'values':[cluster_props['ClusterIdentifier'], cluster_props['NodeType'], cluster_props['ClusterStatus'], 
#               cluster_props['Endpoint']['Address'], cluster_props['Endpoint']['Port'], cluster_props['IamRoles'][0]['IamRoleArn'],
#               cluster_props['VpcId'], cluster_props['NumberOfNodes']]
# })

# cluster_vars

### Open an incoming TCP port to access the cluster endpoint

In [25]:
# Get an EC2 resource, created with the same region and credentials as the
# other AWS clients in this notebook.
ec2 = boto3.resource(
    'ec2',
    region_name='us-east-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [5]:
try:
    # Open the port on the security group that is actually attached to the
    # cluster. Taking the first group listed for the VPC is not guaranteed to
    # be that group, since a VPC can hold several in no particular order.
    cluster_sg_id = cluster_props['VpcSecurityGroups'][0]['VpcSecurityGroupId']
    default_sg = ec2.SecurityGroup(cluster_sg_id)
    print(default_sg)

    # WARNING: 0.0.0.0/0 exposes the cluster port to every IPv4 address.
    # Restrict this to a known CIDR range for anything beyond a short-lived demo.
    # The resource already identifies the group by id, so GroupName is not
    # passed (names are only accepted for groups in a default VPC).
    default_sg.authorize_ingress(
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=DWH_PORT,
        ToPort=DWH_PORT
    )
except ClientError as e:
    # Best effort for AWS-side errors such as the rule already existing from a
    # previous run; non-AWS exceptions propagate instead of being printed.
    print(e)

### Cleaning up

In [34]:
# response = redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_ID, SkipFinalClusterSnapshot=True)
# response

In [None]:
# iam.detach_role_policy(RoleName=DWH_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
# iam.delete_role(RoleName=DWH_ROLE_NAME)