# IaC: Create Redshift Cluster and Run Airflow DAG

In [None]:
import pandas as pd
import boto3
import configparser
import json
import re
import os
import time

## AWS Configuration Variables

In [None]:
# Load AWS credentials and Redshift (DWH) settings from the project config file.
config = configparser.ConfigParser()
# Use a context manager so the config file handle is closed once it is parsed.
with open('airflow/config/redshift.cfg') as config_file:
    config.read_file(config_file)

# [AWS] section: credentials, target bucket and region.
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')
BUCKET                 = config.get('AWS','BUCKET')
REGION                 = config.get('AWS', 'REGION')

# [DWH] section: cluster sizing.
DWH_CLUSTER_TYPE       = config.get("DWH", "DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

# [DWH] section: cluster identity, database and credentials.
DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

# Cell output: summary of the DWH settings. The password is masked so the
# secret is not stored in the saved notebook output.
pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, "********", DWH_PORT, DWH_IAM_ROLE_NAME]
             })

# Instantiate AWS Resources

In [None]:
# Create the boto3 handles used by the rest of the notebook.
# All of them use REGION from the config file (instead of a hard-coded region)
# so they agree with the LocationConstraint used when the S3 bucket is created.
ec2 = boto3.resource('ec2',
                       region_name=REGION,
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

s3 = boto3.client('s3',
                       region_name=REGION,
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

iam = boto3.client('iam',
                       region_name=REGION,
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                  )

redshift = boto3.client('redshift',
                       region_name=REGION,
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

## Create AWS S3 Sample Bucket

In [None]:
# local path to sample_data
local_path = 'data/'
# create s3 bucket
s3.create_bucket(Bucket=BUCKET, CreateBucketConfiguration={'LocationConstraint': REGION})

# Upload every file found under local_path to the bucket.
# The object key is the bare file name, so files with the same name in
# different sub-directories end up under the same key.
upload_count = 0
for root, _dirs, files in os.walk(local_path):
    for filename in files:
        filepath = os.path.join(root, filename)
        s3.upload_file(filepath, BUCKET, filename)
        upload_count += 1

print("Uploaded {} file(s) to bucket {}".format(upload_count, BUCKET))
        

In [None]:
# list files within s3 bucket
# The response has no 'Contents' entry when the bucket holds no objects,
# so fall back to an empty list instead of raising KeyError.
for s3_object in s3.list_objects(Bucket=BUCKET).get('Contents', []):
    print(s3_object['Key'])

In [None]:
# Create the IAM role for Redshift, attach read-only S3 access to it, and
# look up its ARN (kept in `roleArn`).
try:
    print("1.1 Creating a new IAM Role")
    # Trust policy: allows the Redshift service to assume this role.
    assume_role_policy = {
        'Statement': [
            {
                'Action': 'sts:AssumeRole',
                'Effect': 'Allow',
                'Principal': {'Service': 'redshift.amazonaws.com'},
            }
        ],
        'Version': '2012-10-17',
    }
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description="Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(assume_role_policy),
    )
except Exception as create_role_error:
    # Best effort: report the failure (e.g. when the call is rejected) and continue.
    print(create_role_error)

print("1.2 Attaching Policy")

attach_response = iam.attach_role_policy(
    RoleName=DWH_IAM_ROLE_NAME,
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
)
attach_response['ResponseMetadata']['HTTPStatusCode']

print("1.3 Get the IAM role ARN")
role_description = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)
roleArn = role_description['Role']['Arn']

print(roleArn)

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "iam:AttachRolePolicy",
                "iam:CreateRole",
                "iam:PutRolePolicy",
                "iam:GetRole",
                "iam:DetachRolePolicy"
            ],
            "Resource": "arn:aws:iam::501460770806:role/udacity_dwh_role"
        }
    ]
}

In the AWS console, change the role's trust relationship policy to:

{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "redshift.amazonaws.com",
        "AWS": "arn:aws:iam::501460770806:user/learning-projects"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}

## Create or Resume Cluster

In [None]:
# Create the Redshift cluster described by the DWH_* settings.
# Failures are printed rather than raised so the notebook can continue to the
# "resume" cell.
try:
    response = redshift.create_cluster(
        #Redshift cluster config
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        # Listen on the configured port: the ingress rule and the connection
        # string built later in the notebook both use DWH_PORT.
        Port=int(DWH_PORT),

        #Roles (for s3 access)
        IamRoles=[roleArn]
    )
except Exception as e:
    print(e)

In [None]:
# resume cluster
# Best effort: any error from the resume request is printed, not raised.
try:
    redshift.resume_cluster(
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
    )
except Exception as resume_error:
    print(resume_error)

In [None]:
def prettyRedshiftProps(props):
    """Return selected Redshift cluster properties as a two-column DataFrame.

    Keyword Argument:
    props -- Cluster property dictionary (one entry of the 'Clusters' list
             returned by redshift.describe_clusters)

    Returns:
    A DataFrame with columns "Key" and "Value" holding one row for each entry
    of `props` whose key is in the display list, in `props` iteration order.

    Side effect: sets the pandas option 'display.max_colwidth' to None.
    """
    # None disables column truncation; the legacy value -1 is deprecated and
    # rejected by current pandas versions.
    pd.set_option('display.max_colwidth', None)
    keysToShow = ("ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername",
                  "DBName", "Endpoint", "NumberOfNodes", 'VpcId')
    rows = [(key, value) for key, value in props.items() if key in keysToShow]
    return pd.DataFrame(data=rows, columns=["Key", "Value"])

# Fetch the description of the cluster and display its key properties.
describe_response = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)
myClusterProps = describe_response['Clusters'][0]
prettyRedshiftProps(myClusterProps)

In [None]:
# Cell will print out when cluster is available
# Poll the cluster status once a minute until it reports 'available'.
while True:
    cluster_status = redshift.describe_clusters(
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER
    )['Clusters'][0]['ClusterStatus']
    if cluster_status == 'available':
        break
    time.sleep(60)

print('Cluster is Available')

## Allow Inbound TCP port to Access Redshift Endpoint

In [None]:
# RUN CELL ONLY WHEN CLUSTER IS AVAILABLE
# Re-read the cluster description now that the cluster is up, so the endpoint
# is populated, then record the endpoint address (used by the Redshift
# connection string later in the notebook) and the attached IAM role ARN.
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']

In [None]:
# Open the Redshift port on a security group of the cluster's VPC.
# Errors are printed rather than raised.
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    # Uses the first security group returned for the VPC.
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    # NOTE(review): CidrIp '0.0.0.0/0' allows inbound traffic from any address.
    ingress_rule = {
        'GroupName': defaultSg.group_name,
        'CidrIp': '0.0.0.0/0',
        'IpProtocol': 'TCP',
        'FromPort': int(DWH_PORT),
        'ToPort': int(DWH_PORT),
    }
    defaultSg.authorize_ingress(**ingress_rule)
except Exception as ingress_error:
    print(ingress_error)

## Create Postgres Database for Airflow Backend

Task parallelization isn't available with the default SQLite backend and sequential executor. A Postgres database for the Airflow backend allows the local executor to be used for task parallelization.

Instructions:
- [Download Postgres UI App](https://www.postgresql.org/download/)
- Within Postgres query editor or psql terminal, run: CREATE DATABASE airflow;



In [None]:
# Uncomment the line below to install the sql magic (ipython-sql)
# ! pip install ipython-sql
# Load the `sql` extension so the %sql line magic is available in later cells.
%load_ext sql

In [None]:
postgres_conn_string = "postgresql://{}:{}@{}:{}/{}".format(POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_HOST, POSTGRES_PORT, POSTGRES_DB)
%sql $postgres_conn_string

In [None]:
redshift_conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
%sql $redshift_conn_string

In [None]:
# Preview up to 5 rows of data_science_mentor_activity_errors.
%sql SELECT * FROM  data_science_mentor_activity_errors LIMIT 5

In [None]:
# Show up to 10 rows of stl_load_errors; use for debugging AWS COPY commands.
%sql SELECT * FROM stl_load_errors LIMIT 10

In [None]:
# Preview up to 10 rows of projects_dim.
%sql SELECT * FROM projects_dim LIMIT 10

## Connect to Redshift Relational Database

In [None]:
# Preview up to 10 rows of users_dim.
%sql SELECT * FROM users_dim LIMIT 10

In [None]:
# Preview up to 10 rows of videos_dim.
%sql SELECT * FROM videos_dim LIMIT 10

## Dimension Tables

In [None]:
# Preview up to 10 rows of data_science_project_feedback.
%sql SELECT * FROM data_science_project_feedback LIMIT 10

In [None]:
# Preview up to 10 rows of data_analytics_section_feedback.
%sql SELECT * FROM data_analytics_section_feedback LIMIT 10

In [None]:
# Preview up to 10 rows of data_science_video_log.
%sql SELECT * FROM data_science_video_log LIMIT 10

In [None]:
# Preview up to 10 rows of data_science_mentor_activity.
%sql SELECT * FROM data_science_mentor_activity LIMIT 10

## Staging Tables

In [None]:
# Preview up to 10 rows of data_science_highest_prompt_score.
%sql SELECT * FROM data_science_highest_prompt_score LIMIT 10

In [None]:
# Preview up to 10 rows of data_engineering_highest_answer_score.
%sql SELECT * FROM data_engineering_highest_answer_score LIMIT 10

## Fact Tables

In [None]:
# Preview up to 10 rows of data_analytics_avg_video_views_per_user.
%sql SELECT * FROM data_analytics_avg_video_views_per_user LIMIT 10

In [None]:
% sql select * FROM data_science_avg_video_view_range LIMIT 10

In [None]:
# pause cluster
# The API response is kept in `pause_cluster` so it can be inspected afterwards.
pause_cluster = redshift.pause_cluster(
    ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
)

In [None]:
# delete cluster
# redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER, SkipFinalClusterSnapshot=True)

# Delete or Pause Resources

For the PostgreSQL connection, shut down the server through the PostgreSQL app or psql terminal.

In [None]:
# Re-read the cluster description and display its key properties
# (e.g. to check the status after pausing or deleting).
cluster_listing = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)
myClusterProps = cluster_listing['Clusters'][0]
prettyRedshiftProps(myClusterProps)

In [None]:
# delete IAM role
# Detach the managed S3 read-only policy first, then delete the role.
s3_read_only_policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
iam.detach_role_policy(
    RoleName=DWH_IAM_ROLE_NAME,
    PolicyArn=s3_read_only_policy_arn,
)
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)

## Table Samples

In [None]:
	
from pyspark.sql import SparkSession

# Get or create a Hive-enabled Spark session configured with the
# spark-sas7bdat package.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

# Load the SAS data file into a Spark DataFrame.
sas_reader = spark.read.format('com.github.saurfang.sas.spark')
df_spark = sas_reader.load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')


In [None]:
# Write the DataFrame to parquet, then read it back from the same directory.
parquet_dir = "sas_data"
df_spark.write.parquet(parquet_dir)
df_spark = spark.read.parquet(parquet_dir)