# Sparkify Control Point

In [1]:
# First things first, we have to create a redshift cluster for our project on AWS
# Here, we'd be using IaC to proceed with the processes

# importing boto3, AWS python SDK
import boto3
from botocore.exceptions import ClientError

import configparser
import json
import pandas as pd

## Configuration

In [2]:
# Extracting config variables 
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY = config.get('USER', 'KEY')
SECRET = config.get('USER', 'SECRET')

In [3]:
DWH_ROLE_NAME = config.get('DWH', 'DWH_ROLE_NAME')
DWH_DB_NAME = config.get('DWH', 'DWH_DB_NAME')
DWH_CLUSTER_ID = config.get('DWH', 'DWH_CLUSTER_ID')
DWH_NODE_TYPE = config.get('DWH', 'DWH_NODE_TYPE')
DWH_USER_NAME = config.get('DWH', 'DWH_USER_NAME')
DWH_USER_PASSWORD = config.get('DWH', 'DWH_USER_PASSWORD')
DWH_NUMBER_0F_NODES = int(config.get('DWH', 'DWH_NUMBER_0F_NODES'))
DWH_PORT = int(config.get('DWH', 'DWH_PORT'))

variables = pd.DataFrame({
    'keys':['DWH_ROLE_NAME', 'DWH_DB_NAME', 'DWH_CLUSTER_ID', 'DWH_NODE_TYPE', 'DWH_NUMBER_0F_NODES', 'DWH_PORT'], 
    'values':[DWH_ROLE_NAME, DWH_DB_NAME, DWH_CLUSTER_ID, DWH_NODE_TYPE, DWH_NUMBER_0F_NODES, DWH_PORT]
})

variables

Unnamed: 0,keys,values
0,DWH_ROLE_NAME,redshift_s3_readonly
1,DWH_DB_NAME,sparkifydb
2,DWH_CLUSTER_ID,sparkify-cluster
3,DWH_NODE_TYPE,dc2.large
4,DWH_NUMBER_0F_NODES,4
5,DWH_PORT,5439


### Create IAM role for Redshift cluster
This role will grant redshift AmazonS3ReadOnlyAccess

In [4]:
# Instantiating IAM client
iam = boto3.client('iam', region_name='us-east-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)
role_arn = ''

try:
    print('Creating IAM role for Redshift cluster...')
    iam_role = iam.create_role(
        RoleName=DWH_ROLE_NAME,
        AssumeRolePolicyDocument=json.dumps({
            'Statement': [{
                'Action': 'sts:AssumeRole',
                'Effect': 'Allow',
                'Principal': {'Service': 'redshift.amazonaws.com'}
            }],
            'Version': '2012-10-17'
        }),
        Description='Allows Redshift cluster to call AWS services on you behalf',
    )
    print('Role creation successful!')
    
    
    iam.attach_role_policy(
        RoleName=DWH_ROLE_NAME,
        PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess'
    )['ResponseMetadata']['HTTPStatusCode']
    
    print('Role policy attached successfully!')
except Exception as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        iam_role = iam.get_role(RoleName=DWH_ROLE_NAME)
    else:
        print(e)
finally:
    role_arn = iam_role['Role']['Arn']
    print('Role gotten')

Creating IAM role for Redshift cluster...
Role gotten


### Build the Redshift cluster

In [5]:
# Instantiating redshift client
redshift = boto3.client('redshift', region_name='us-east-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

try:
    print('Creating Redshift cluster...')
    redshift_cluster = redshift.create_cluster(
        DBName=DWH_DB_NAME,
        ClusterIdentifier=DWH_CLUSTER_ID,
        NodeType=DWH_NODE_TYPE,
        MasterUsername=DWH_USER_NAME,
        MasterUserPassword=DWH_USER_PASSWORD,
        NumberOfNodes=DWH_NUMBER_0F_NODES,
        IamRoles=[
            role_arn,
        ]
    )
    print('Redshift cluster creation successful!')
except Exception as e:
    print(e)

Creating Redshift cluster...
Redshift cluster creation successful!


In [32]:
# Checking cluster availability status
cluster_props = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_ID)['Clusters'][0]
cluster_props['ClusterAvailabilityStatus'], cluster_props['ClusterStatus']

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster sparkify-cluster not found.

In [9]:
# Obtaining cluster endpoint
DWH_ENDPOINT = cluster_props['Endpoint']['Address']
DWH_PORT = int(cluster_props['Endpoint']['Port'])
# print('Endpoint: {}\nPort: {}'.format(DWH_ENDPOINT, DWH_PORT))

In [21]:
# cluster_vars = pd.DataFrame({
#     'keys':['ClusterIdentifier', 'NodeType', 'ClusterStatus', 'Endpoint:Address', 'Endpoint:Port', 'IamRole', 'Vpc', 'NumberOfNodes'], 
#     'values':[cluster_props['ClusterIdentifier'], cluster_props['NodeType'], cluster_props['ClusterStatus'], 
#               cluster_props['Endpoint']['Address'], cluster_props['Endpoint']['Port'], cluster_props['IamRoles'][0]['IamRoleArn'],
#               cluster_props['VpcId'], cluster_props['NumberOfNodes']]
# })

# cluster_vars

### Open Incomming TCP port to access the cluster endpoint

In [12]:
# get an ec2 resourse
ec2 = boto3.resource('ec2', region_name='us-east-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

In [12]:
try: 
    vpc = ec2.Vpc(id=cluster_props['VpcId'])
    default_sg = list(vpc.security_groups.all())[0]
    print(default_sg)
    
    default_sg.authorize_ingress(
        GroupName=default_sg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=DWH_PORT,
        ToPort=DWH_PORT
    )
except Exception as e:
    print(e)

## ETL

In [10]:
%load_ext sql

In [11]:
conn_string = "postgresql://{}:{}@{}:{}/{}".format(DWH_USER_NAME, DWH_USER_PASSWORD, DWH_ENDPOINT, DWH_PORT, DWH_DB_NAME)
%sql $conn_string
# postgresql://{username}:{userpassword}@{host}:{port}/{database}

'Connected: mike@sparkifydb'

In [27]:
%%sql
COPY "staging_events" (artist, auth, firstname, gender, iteminsession, lastname, 
                       length, level, location, method, page, registration, sessionid, song, status, ts,
                       useragent, userid) 
FROM 's3://sparkify-dwh-bucket/random/log_data.csv'
CREDENTIALS 'aws_iam_role=arn:aws:iam::451737047229:role/redshift_s3_readonly'
COMPUPDATE OFF REGION 'us-west-2'
CSV IGNOREHEADER 1

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb


InternalError: (psycopg2.errors.InternalError_) Load into table 'staging_events' failed.  Check 'stl_load_errors' system table for details.

[SQL: COPY "staging_events" (artist, auth, firstname, gender, iteminsession, lastname, 
                       length, level, location, method, page, registration, sessionid, song, status, ts,
                       useragent, userid) 
FROM 's3://sparkify-dwh-bucket/random/log_data.csv'
CREDENTIALS 'aws_iam_role=arn:aws:iam::451737047229:role/redshift_s3_readonly'
COMPUPDATE OFF REGION 'us-west-2'
CSV IGNOREHEADER 1]
(Background on this error at: https://sqlalche.me/e/14/2j85)

In [28]:
%%sql
SELECT *
FROM stl_load_errors
ORDER BY starttime DESC;

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
6 rows affected.


userid,slice,tbl,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset
100,7,101736,2021-12-30 01:46:32.340123,12729,555,s3://sparkify-dwh-bucket/random/log_data.csv,2,useragent,varchar,256,139,"Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,""San Jose-Sunnyvale-Santa Clara, CA"",PUT,NextSong,1.541016707796E12,583,Sehr kosmisch,200,1542241826796,""\\""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36\\"""",26",\\,1214,Invalid quote formatting for CSV,0,0
100,3,101734,2021-12-30 01:24:36.479190,12729,265,s3://sparkify-dwh-bucket/random/log_data.csv,2,registration,float8,0,90,"Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,""San Jose-Sunnyvale-Santa Clara, CA"",PUT,NextSong,1.541016707796E12,583,Sehr kosmisch,200,1542241826796,""\\""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36\\"""",26",NextSong,1207,"Invalid digit, Value 'N', Pos 0, Type: Double",0,0
100,3,101732,2021-12-30 01:22:31.075486,12729,229,s3://sparkify-dwh-bucket/random/log_data.csv,2,registration,float8,0,90,"Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,""San Jose-Sunnyvale-Santa Clara, CA"",PUT,NextSong,1.541016707796E12,583,Sehr kosmisch,200,1542241826796,""\\""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36\\"""",26",NextSong,1207,"Invalid digit, Value 'N', Pos 0, Type: Double",0,0
100,7,101730,2021-12-30 01:20:17.051733,12729,203,s3://sparkify-dwh-bucket/random/log_data.csv,2,registration,float8,0,90,"Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,""San Jose-Sunnyvale-Santa Clara, CA"",PUT,NextSong,1.541016707796E12,583,Sehr kosmisch,200,1542241826796,""\\""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36\\"""",26",NextSong,1207,"Invalid digit, Value 'N', Pos 0, Type: Double",0,0
100,7,101728,2021-12-30 01:14:00.140621,12729,119,s3://sparkify-dwh-bucket/random/log_data.csv,2,useragent,varchar,256,139,"Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,""San Jose-Sunnyvale-Santa Clara, CA"",PUT,NextSong,1.541016707796E12,583,Sehr kosmisch,200,1542241826796,""\\""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36\\"""",26",\\,1214,Invalid quote formatting for CSV,0,0
100,3,101726,2021-12-30 01:12:13.702264,12729,97,s3://sparkify-dwh-bucket/random/log_data.csv,1,iteminsession,int2,0,29,"artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId",itemInSession,1207,"Invalid digit, Value 'i', Pos 0, Type: Short",0,0


In [23]:
%%sql
SELECT * FROM staging_events

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
0 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid


In [13]:
%%sql
SELECT * FROM staging_songs

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
0 rows affected.


artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


In [45]:
%%sql
COPY "staging_events" FROM 's3://sparkify-dwh-bucket/log-data.csv/part-00000-36d6ee93-2106-46ca-b604-a4aec6992719-c000.csv'
CREDENTIALS 'aws_iam_role=arn:aws:iam::451737047229:role/redshift_s3_readonly'
COMPUPDATE OFF REGION 'us-west-2'
CSV

In [None]:
# s3://sparkify-dwh-bucket/song_data.csv

In [44]:
%%sql
SELECT *
FROM stl_load_errors
ORDER BY starttime DESC;

## Cleaning up

In [30]:
# response = redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_ID, SkipFinalClusterSnapshot=True)
# response

In [None]:
# iam.detach_role_policy(RoleName=DWH_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
# iam.delete_role(RoleName=DWH_ROLE_NAME)

In [19]:
import pandas as pd

In [21]:
df = pd.read_csv('log_data.csv')
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,\Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...,like Gecko) Ubuntu Chromium/36.0.1985.125 Chr...,26.0
The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,\Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...,like Gecko) Ubuntu Chromium/36.0.1985.125 Chr...,26.0
Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,\Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...,like Gecko) Ubuntu Chromium/36.0.1985.125 Chr...,26.0
,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9,
,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12,
