In [1]:
import pandas as pd
import boto3
import json
import configparser

### Create Redshift 

In [2]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY    = config.get('IAM','KEY')
SECRET = config.get('IAM','SECRET')

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")


DWH_DB          = config.get("DWH","DWH_DB")
DWH_DB_USER     = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT        = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME = config.get("DWH","DWH_IAM_ROLE_NAME")


BUCKET = config.get("S3", "BUCKET")

# DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB,
 KEY, SECRET,BUCKET,
DWH_IAM_ROLE_NAME)

('awsuser',
 'AdminPass123',
 'dev',
 'AKIA433DURC4ATQP2POF',
 '5EsL4E36WqTpSTEqtTnCg8i99WrUwJjwXJUtSXOM',
 'udacity-dend',
 'myRedshiftRole')

# Create the IAM role (if not exists)

In [3]:
iam = boto3.client('iam',
                    region_name="us-east-1",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET)

In [4]:
try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description="Allow Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
                'Effect': 'Allow',
                'Principal': {'Service': 'redshift.amazonaws.com'}}],
            'Version': '2012-10-17'})
    )
    

except Exception as e:
    print(e)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name myRedshiftRole already exists.


In [5]:
# Attach Policy

print('1.2 Attaching Policy')
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [6]:
# Get and print the IAM role ARN
print('1.3 Get the IAM role ARN')
iam_role = iam.get_role(
                        RoleName=DWH_IAM_ROLE_NAME
                        )
roleArn = iam_role['Role']['Arn']
# NOTE: Un-comment this to print the result.
print(roleArn)

1.3 Get the IAM role ARN
arn:aws:iam::884435355832:role/myRedshiftRole


In [3]:
redshift = boto3.client('redshift',
                       region_name="us-east-1",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

In [9]:
try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)

An error occurred (ClusterAlreadyExists) when calling the CreateCluster operation: Cluster already exists


In [4]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,redshift-cluster-1
1,NodeType,dc2.large
2,ClusterStatus,resuming
3,MasterUsername,awsuser
4,DBName,dev
5,Endpoint,"{'Address': 'redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-99c9b1e4
7,NumberOfNodes,1


In [5]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::884435355832:role/myRedshiftRole


In [13]:
ec2 = boto3.resource(   'ec2', 
                      region_name="us-east-1",
                      aws_access_key_id=KEY,
                      aws_secret_access_key=SECRET)

In [14]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    
    defaultSg.authorize_ingress(
        GroupName= defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP', 
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-0da029bc2efb77d05')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


## STEP 4: Connect to cluster

In [6]:
%load_ext sql

In [7]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://awsuser:AdminPass123@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev


'Connected: awsuser@dev'

In [60]:
%%sql

DROP TABLE IF EXISTS staging_events;

CREATE TABLE IF NOT EXISTS staging_events (
        event_id BIGINT IDENTITY(0,1) NOT NULL,
        artist text NULL,
        auth text NULL,
        firstName text NULL,
        gender text NULL,
        itemInSession text NULL,
        lastName text NULL,
        length text NULL,
        level text NULL,
        location text NULL,
        method text NULL,
        page text NULL,
        registration text NULL,
        sessionId INTEGER NOT NULL SORTKEY DISTKEY,
        song text NULL,
        status INTEGER NULL,
        ts BIGINT NOT NULL,
        userAgent text NULL,
        userId text NULL
    );

 * postgresql://awsuser:***@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev
Done.
Done.


[]

In [12]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

LOG_DATA          = config.get("S3","LOG_DATA")
ARN               = DWH_ROLE_ARN
LOG_JSONPATH      = config.get("S3","LOG_JSONPATH")


(LOG_DATA, ARN, LOG_JSONPATH)

("'s3://udacity-dend/log_data'",
 'arn:aws:iam::884435355832:role/myRedshiftRole',
 "'s3://udacity-dend/log_json_path.json'")

In [65]:
qry =("""
    COPY staging_events FROM {}
    credentials 'aws_iam_role={}'
    format as json {}
    STATUPDATE ON
    region 'us-west-2';
""").format(LOG_DATA, ARN, LOG_JSONPATH)

print(qry)

%sql $qry


    COPY staging_events FROM 's3://udacity-dend/log_data'
    credentials 'aws_iam_role=arn:aws:iam::884435355832:role/myRedshiftRole'
    format as json 's3://udacity-dend/log_json_path.json'
    STATUPDATE ON
    region 'us-west-2';

 * postgresql://awsuser:***@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev
Done.


[]

In [8]:
%%sql
SELECT count(*) 
FROM staging_events

 * postgresql://awsuser:***@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev
1 rows affected.


count
8056


In [64]:
%%sql
SELECT * from stl_load_errors

 * postgresql://awsuser:***@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev
1 rows affected.


userid,slice,tbl,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset
100,0,161732,2021-11-24 02:14:33.446016,18706,571,s3://udacity-dend/log_data/2018/11/2018-11-01-events.json,1,sessionid,int4,0,0,"{""artist"":null,""auth"":""Logged In"",""firstName"":""Walter"",""gender"":""M"",""itemInSession"":0,""lastName"":""Frye"",""length"":null,""level"":""free"",""location"":""San Francisco-Oakland-Hayward, CA"",""method"":""GET"",""page"":""Home"",""registration"":1540919166796.0,""sessionId"":38,""song"":null,""status"":200,""ts"":1541105830796,""userAgent"":""\\""Mozilla\\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\\/537.36 (KHTML, like Gecko) Chrome\\/36.0.1985.143 Safari\\/537.36\\"""",""userId"":""39""} {""artist"":null,""auth"":""Logged In"",""firstName"":""Kaylee"",""gender"":""F"",""itemInSession"":0,""lastName"":""Summers"",""length"":null,""level"":""free"",""location"":""Phoenix-Mesa-Scottsdale, AZ"",""method"":""GET"",""page"":""Home"",""registration"":1540344794796.0,""sessionId"":139,""song"":null,""status"":200,""ts"":1541106106796,""userAgent"":""\\""Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/537.36 (KHTML, like Gecko) Chrome\\/35.0.1916.153 Safari\\/537.36\\"""",""userId"":""8""} {""artist"":""Des'ree"",""auth"":""Logged In"",""firstName"":""Kaylee"",""gender"":""F"",""itemInSession"":1,""lastName"":""Summers"",",,1213,Missing data for not-null field,0,0


## Log data to song staging table

In [9]:
%%sql

DROP TABLE IF EXISTS staging_songs;

CREATE TABLE IF NOT EXISTS staging_songs (
    artist_id text NOT NULL SORTKEY DISTKEY,
    artist_latitude text NULL,
    artist_location text NULL,
    artist_longitude text NULL,
    artist_name text NULL,
    duration DECIMAL(9) NULL,
    num_songs INTEGER NULL,
    song_id text NOT NULL,
    title text NULL,
    year INTEGER NULL
);

 * postgresql://awsuser:***@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev
Done.
Done.


[]

In [15]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

SONG_DATA = config.get("S3","SONG_DATA")
ARN       = config.get("IAM_ROLE","ARN")


(SONG_DATA, ARN)

("'s3://udacity-dend/song_data'",
 'arn:aws:iam::884435355832:role/myRedshiftRole')

In [16]:
qry = ("""
    COPY staging_songs FROM {}
    credentials 'aws_iam_role={}'
    format as json 'auto'
    ACCEPTINVCHARS AS '^'
    STATUPDATE ON
    region 'us-west-2';
""").format(SONG_DATA, ARN)

print(qry)

%sql $qry


    COPY staging_songs FROM 's3://udacity-dend/song_data'
    credentials 'aws_iam_role=arn:aws:iam::884435355832:role/myRedshiftRole'
    format as json 'auto'
    ACCEPTINVCHARS AS '^'
    STATUPDATE ON
    region 'us-west-2';

 * postgresql://awsuser:***@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev
Done.


[]

In [17]:
%%sql
SELECT * from stl_load_errors

 * postgresql://awsuser:***@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev
0 rows affected.


userid,slice,tbl,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset


In [18]:
%%sql
SELECT count(*) 
FROM staging_songs

 * postgresql://awsuser:***@redshift-cluster-1.ccmn84cnjbnf.us-east-1.redshift.amazonaws.com:5439/dev
1 rows affected.


count
14896


### Check list log data

In [26]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
BUCKET = config.get("S3", "BUCKET")

udacity =  s3.Bucket(BUCKET)


# Print all files
# for object in udacity.objects.filter(Prefix='log_data'):
#     count += 1
#     print(object)


# count: 31
count = sum(1 for _ in udacity.objects.filter(Prefix='log_data'))
print(count)

31


### Check list song data

In [22]:
BUCKET = config.get("S3", "BUCKET")
udacity =  s3.Bucket(BUCKET)


# Print all files
# for object in udacity.objects.filter(Prefix='song_data'):
#     count += 1
#     print(object)


# count: 14897
# count = sum(1 for _ in udacity.objects.filter(Prefix='song_data'))
# print(count)

14897


In [None]:
# Create the IAM role (if not exists)

try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description="Allow Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
                'Effect': 'Allow',
                'Principal': {'Service': 'redshift.amazonaws.com'}}],
            'Version': '2012-10-17'})
    )
    

except Exception as e:
    print(e)


In [None]:
# Attach Policy

print('1.2 Attaching Policy')
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']


In [None]:
# Get and print the IAM role ARN
print('1.3 Get the IAM role ARN')
iam_role = iam.get_role(
                        RoleName=DWH_IAM_ROLE_NAME
                        )
roleArn = iam_role['Role']['Arn']
# NOTE: Un-comment this to print the result.
#print(roleArn)

In [None]:
try:
    response = redshift.create_cluster( 
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        IamRoles=[roleArn]
    )
except Exception as e:
    print(e)

In [None]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
# NOTE: Un-comment this to print the result.
#prettyRedshiftProps(myClusterProps)

In [None]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']

print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

In [None]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    
    defaultSg.authorize_ingress(
        GroupName= defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP', 
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

In [None]:
%load_ext sql

In [None]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
# NOTE: Un-comment this to print the result.
#print(conn_string)
%sql $conn_string

In [None]:
# Number of items in staging_events table
%%time
%%sql
SELECT COUNT(*)
FROM staging_events;

In [None]:
# Number of items in staging_songs table
%%time
%%sql
SELECT COUNT(*)
FROM staging_songs;