In [None]:
import boto3
import pandas as pd
import psycopg2
import json

In [None]:
import configparser
# Parse cluster settings and AWS credentials from a local INI-style file.
config = configparser.ConfigParser()
# Open the file in a context manager so the handle is closed deterministically
# once parsing finishes instead of being left to the garbage collector.
with open('redshift-cluster.config') as config_file:
    config.read_file(config_file)

In [None]:
# AWS credentials passed to every boto3 client/resource created below.
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')


# Redshift cluster settings. configparser returns every value as a string;
# the numeric ones (port, node count) are converted with int() where used.
DWH_CLUSTER_TYPE = config.get('DWH', 'DWH_CLUSTER_TYPE')
DWH_NUM_NODES = config.get('DWH', 'DWH_NUM_NODES')
DWH_NODE_TYPE = config.get('DWH', 'DWH_NODE_TYPE')
DWH_CLUSTER_IDENTIFIER = config.get('DWH', 'DWH_CLUSTER_IDENTIFIER')
DWH_DB = config.get('DWH', 'DWH_DB')
DWH_DB_USER = config.get('DWH', 'DWH_DB_USER')
DWH_DB_PASSWORD = config.get('DWH', 'DWH_DB_PASSWORD')
DWH_PORT = config.get('DWH', 'DWH_PORT')
DWH_IAM_ROLE_NAME = config.get('DWH', 'DWH_IAM_ROLE_NAME')

# Quick sanity check of what was loaded. The password is deliberately left
# out so the secret is not written into the notebook's saved cell output.
(DWH_DB_USER, DWH_DB)

In [None]:
# Tabulate the cluster settings for review. The password row shows a fixed
# mask rather than the real value so the secret is not persisted in the
# notebook's rendered output.
dwh_settings = [
    ('DWH_CLUSTER_TYPE', DWH_CLUSTER_TYPE),
    ('DWH_NUM_NODES', DWH_NUM_NODES),
    ('DWH_NODE_TYPE', DWH_NODE_TYPE),
    ('DWH_CLUSTER_IDENTIFIER', DWH_CLUSTER_IDENTIFIER),
    ('DWH_DB', DWH_DB),
    ('DWH_DB_USER', DWH_DB_USER),
    ('DWH_DB_PASSWORD', '********'),
    ('DWH_PORT', DWH_PORT),
    ('DWH_IAM_ROLE_NAME', DWH_IAM_ROLE_NAME),
]
pd.DataFrame(dwh_settings, columns=['Param', 'Value'])

In [None]:
# EC2 resource handle; used further down to look up the cluster's VPC and
# its security groups.
ec2 = boto3.resource(
    service_name='ec2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name='us-east-1',
)

In [None]:
# S3 resource handle; used further down to list the objects in the source
# data bucket.
s3 = boto3.resource(
    service_name='s3',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name='us-east-1',
)

In [None]:
# IAM client; used further down to resolve the configured role name to an ARN.
iam = boto3.client(
    service_name='iam',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name='us-east-1',
)

In [None]:
# Redshift client; used further down to create, describe and delete the
# cluster.
redshift = boto3.client(
    service_name='redshift',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name='us-east-1',
)

In [None]:
# Collect the key of every object in the source bucket; the empty prefix
# places no restriction on the keys returned.
bucket = s3.Bucket('redshift-tutorial-s3')
data_files = []
for s3_object in bucket.objects.filter(Prefix=''):
    data_files.append(s3_object.key)
data_files

In [None]:
# Resolve the configured IAM role name to its ARN; the ARN is attached to the
# cluster when it is created.
role_description = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)
roleArn = role_description['Role']['Arn']
roleArn

In [None]:
# Request creation of the Redshift cluster. Any failure (including a bad
# numeric setting) is printed rather than raised, so the notebook keeps going.
# see doc: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift/client/create_cluster.html
try:
    cluster_request = {
        # Hardware
        'ClusterType': DWH_CLUSTER_TYPE,
        'NodeType': DWH_NODE_TYPE,

        # Identifiers & Credentials
        'DBName': DWH_DB,
        'ClusterIdentifier': DWH_CLUSTER_IDENTIFIER,
        'MasterUsername': DWH_DB_USER,
        'MasterUserPassword': DWH_DB_PASSWORD,
        'Port': int(DWH_PORT),
        'NumberOfNodes': int(DWH_NUM_NODES),
        'Tags': [
            {'Key': 'ENVIRONMENT', 'Value': 'TUTORIAL'},
        ],

        # Roles (s3 access role)
        'IamRoles': [roleArn],

        # Other settings
        'MultiAZ': False,
    }
    response = redshift.create_cluster(**cluster_request)
except Exception as e:
    print(e)

In [None]:
# Show the raw description of the cluster (first entry of the 'Clusters' list).
cluster_listing = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)
cluster_listing['Clusters'][0]

In [None]:
def prettyRedshiftProps(props):
    """Return a two-column table with the headline properties of a cluster.

    Args:
        props: Mapping of cluster property names to values, e.g. one entry of
            the ``'Clusters'`` list returned by ``describe_clusters``.

    Returns:
        pandas.DataFrame with columns ``'Parameter'`` and ``'value'``: one row
        for each key of ``props`` that is in the list of keys to show, in the
        order the keys appear in ``props``.

    Side effects:
        Sets the global pandas option ``display.max_colwidth`` to 0
        (presumably so long values are not truncated when rendered; confirm
        for the pandas version in use).
    """
    # Use the canonical option name. The dotted spelling 'display.max.colwidth'
    # only resolved because pandas treats option names as regex patterns.
    pd.set_option('display.max_colwidth', 0)
    keysToShow = (
        'ClusterIdentifier',
        'ClusterStatus',
        'NodeType',
        'NumberOfNodes',
        'DBName',
        'MasterUsername',
        'Endpoint',
        'VpcId',
    )
    x = [(k, v) for k, v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=['Parameter', 'value'])

# Block until the cluster reports as available before taking the snapshot of
# its properties: the cells below read clusterProps['Endpoint'], which is not
# expected to be present while the cluster is still being created. Without
# this wait a fresh "Run All" fails right after create_cluster.
redshift.get_waiter('cluster_available').wait(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)

clusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]

prettyRedshiftProps(clusterProps)

In [None]:
# Pull the connection details out of the cluster description.
endpoint_info = clusterProps['Endpoint']
DWH_ENDPOINT = endpoint_info['Address']

# First IAM role attached to the cluster.
attached_roles = clusterProps['IamRoles']
DWH_ROLE_ARN = attached_roles[0]['IamRoleArn']

DB_NAME, DB_USER = clusterProps['DBName'], clusterProps['MasterUsername']

In [None]:
# Display the database name taken from the cluster description.
DB_NAME

In [None]:
# Open the Redshift port on a security group of the cluster's VPC so the
# database can be reached from outside. Failures are printed, not raised.
# NOTE(review): CidrIp '0.0.0.0/0' admits any IPv4 address; consider narrowing
# it outside of a throwaway tutorial environment.
try:
    vpc = ec2.Vpc(id=clusterProps['VpcId'])
    # Takes whichever security group the VPC lists first.
    vpc_security_groups = list(vpc.security_groups.all())
    default_SG = vpc_security_groups[0]
    print(default_SG)

    redshift_port = int(DWH_PORT)
    ingress_rule = {
        'GroupName': default_SG.group_name,
        'CidrIp': '0.0.0.0/0',
        'IpProtocol': 'TCP',
        'FromPort': redshift_port,
        'ToPort': redshift_port,
    }
    default_SG.authorize_ingress(**ingress_rule)
except Exception as e:
    print(e)

In [None]:
# Connect to the cluster's database with the master user credentials.
try:
    conn = psycopg2.connect(
        host=DWH_ENDPOINT,
        dbname=DB_NAME,
        user=DB_USER,
        password=DWH_DB_PASSWORD,
        port=int(DWH_PORT)
    )
except Exception as e:
    # Report the cause and stop here. Carrying on would either hit a
    # NameError on `conn` below (fresh kernel) or silently reuse a stale
    # connection left over from an earlier run.
    print(e)
    raise

# Commit each statement as it is executed; no explicit commit() calls follow.
conn.set_session(autocommit=True)

In [None]:
# Obtain the cursor that every SQL statement below is executed through.
try:
    cur = conn.cursor()
except Exception as e:
    print("Error: Could not obtain database cursor", e, sep="\n")

In [None]:
# DDL for the users table; userid is both the distribution key and sort key.
create_users_sql = """
        create table if not exists users(
            userid integer not null distkey sortkey,
            username char(8),
            firstname varchar(30),
            lastname varchar(30),
            city varchar(30),
            state char(2),
            email varchar(100),
            phone char(14),
            likesports boolean,
            liketheatre boolean,
            likeconcerts boolean,
            likejazz boolean,
            likeclassical boolean,
            likeopera boolean,
            likerock boolean,
            likevegas boolean,
            likebroadway boolean,
            likemusicals boolean
        );
        """

# Create the table if it is missing; failures are printed, not raised.
try:
    cur.execute(create_users_sql)
except Exception as e:
    print("Error: Could not create 'users' table", e, sep="\n")

In [None]:
# DDL for the venue table; venueid is both the distribution key and sort key.
create_venue_sql = """
        create table if not exists venue(
            venueid smallint not null distkey sortkey,
            venuename varchar(100),
            venuecity varchar(30),
            venuestate char(2),
            venueseats integer
        );
        """

# Create the table if it is missing; failures are printed, not raised.
try:
    cur.execute(create_venue_sql)
except Exception as e:
    print("Error: Could not create 'venue' table", e, sep="\n")

In [None]:
# Create the category, date, event and listing tables in a single batch.
# Failures are printed, not raised.
try:
    cur.execute(
        """
        create table if not exists category(
            catid smallint not null distkey sortkey,
            catgroup varchar(10),
            catname varchar(10),
            catdesc varchar(50)
        );

        create table if not exists date(
            dateid smallint not null distkey sortkey,
            caldate date not null,
            day character(3) not null,
            week smallint not null,
            month character(5) not null,
            qr character(5) not null,
            year smallint not null,
            holiday boolean default('N')
        );

        create table if not exists event(
            eventid integer not null distkey,
            venueid smallint not null,
            catid smallint not null,
            dateid smallint not null sortkey,
            eventname varchar(200),
            starttime timestamp
        );

        create table if not exists listing(
            listid integer not null distkey,
            sellerid integer not null,
            eventid integer not null,
            dateid smallint not null sortkey,
            numtickets smallint not null,
            priceperticket decimal(8,2),
            totalprice decimal(8,2),
            listtime timestamp
        );
        """
    )
except Exception as e:
    # The message names the tables this cell actually creates; it previously
    # blamed the 'venue' table, which is created in a different cell.
    print("Error: Could not create 'category', 'date', 'event' and 'listing' tables")
    print(e)

In [None]:
# Bulk-load the pipe-delimited users file from S3 into the users table.
# Failures are printed, not raised.
try:
    cur.execute(
        """
        copy users
        from 's3://redshift-tutorial-s3/allusers_pipe.txt'
        credentials %s
        delimiter '|'
        region 'us-east-1'
        """,
        # Use the role ARN read back from the cluster description instead of a
        # hard-coded account id and role name, so the COPY authorises with the
        # role that is actually attached to this cluster. psycopg2 quotes the
        # value as a SQL string literal.
        ('aws_iam_role=' + DWH_ROLE_ARN,),
    )
except Exception as e:
    print("Unable to copy 'users' data from s3 into redshift")
    print(e)

In [None]:
# Query the users table; the rows are fetched from the cursor in a later cell.
select_users_sql = """
    select * from users;

    """

try:
    cur.execute(select_users_sql)
except Exception as e:
    print("Unable to select from 'users' table", e, sep="\n")


In [None]:
# Fetch up to ten rows from the last executed query and print them one per line.
rows = cur.fetchmany(size=10)
for row in rows:
    print(row)


In [None]:
# Close the database connection; a psycopg2 error while closing is printed.
try:
    conn.close()
except psycopg2.Error as close_error:
    print(close_error)

# Delete the cluster without taking a final snapshot. The call is left as a
# bare expression so its response is displayed as the cell output.
redshift.delete_cluster(
    ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
    SkipFinalClusterSnapshot=True,
)
