# Sparkify Control Point

In [1]:
# First, create a Redshift cluster for the project on AWS.
# The infrastructure is provisioned as code (IaC) through boto3.

# Standard library
import configparser
import json
from urllib.parse import quote_plus

# Third-party: boto3 is the AWS SDK for Python
import boto3
import pandas as pd
from botocore.exceptions import ClientError

## Configuration

In [2]:
# Extracting config variables
config = configparser.ConfigParser()
# Read through a context manager so the file handle is closed as soon as
# parsing finishes instead of being left open for the life of the kernel.
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# AWS credentials, taken from the [USER] section of dwh.cfg.
KEY = config.get('USER', 'KEY')
SECRET = config.get('USER', 'SECRET')

In [3]:
# Cluster settings, all taken from the [DWH] section of the config file.
DWH_ROLE_NAME = config.get('DWH', 'DWH_ROLE_NAME')
DWH_DB_NAME = config.get('DWH', 'DWH_DB_NAME')
DWH_CLUSTER_ID = config.get('DWH', 'DWH_CLUSTER_ID')
DWH_NODE_TYPE = config.get('DWH', 'DWH_NODE_TYPE')
DWH_USER_NAME = config.get('DWH', 'DWH_USER_NAME')
DWH_USER_PASSWORD = config.get('DWH', 'DWH_USER_PASSWORD')
# Numeric settings are parsed straight to int.
DWH_NUMBER_0F_NODES = config.getint('DWH', 'DWH_NUMBER_0F_NODES')
DWH_PORT = config.getint('DWH', 'DWH_PORT')

# Tabulate the settings for a quick visual check; the database user name and
# password are not included in the table.
variables = pd.DataFrame({
    'keys': [
        'DWH_ROLE_NAME',
        'DWH_DB_NAME',
        'DWH_CLUSTER_ID',
        'DWH_NODE_TYPE',
        'DWH_NUMBER_0F_NODES',
        'DWH_PORT',
    ],
    'values': [
        DWH_ROLE_NAME,
        DWH_DB_NAME,
        DWH_CLUSTER_ID,
        DWH_NODE_TYPE,
        DWH_NUMBER_0F_NODES,
        DWH_PORT,
    ],
})

variables

Unnamed: 0,keys,values
0,DWH_ROLE_NAME,redshift_s3_readonly
1,DWH_DB_NAME,sparkifydb
2,DWH_CLUSTER_ID,sparkify-cluster
3,DWH_NODE_TYPE,dc2.large
4,DWH_NUMBER_0F_NODES,4
5,DWH_PORT,5439


### Create IAM role for Redshift cluster
This role grants Redshift the `AmazonS3ReadOnlyAccess` managed policy, so the cluster can read data from S3.

In [4]:
# Instantiating IAM client
iam = boto3.client('iam', region_name='us-east-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

# Create the role Redshift will assume, or reuse it when it already exists.
try:
    print('Creating IAM role for Redshift cluster...')
    iam_role = iam.create_role(
        RoleName=DWH_ROLE_NAME,
        # Trust policy: lets the Redshift service assume this role.
        AssumeRolePolicyDocument=json.dumps({
            'Statement': [{
                'Action': 'sts:AssumeRole',
                'Effect': 'Allow',
                'Principal': {'Service': 'redshift.amazonaws.com'}
            }],
            'Version': '2012-10-17'
        }),
        Description='Allows Redshift cluster to call AWS services on you behalf',
    )
    print('Role creation successful!')
except ClientError as e:
    # Only ClientError carries `.response`; catching bare Exception and then
    # reading `e.response` would turn any other failure into an AttributeError.
    if e.response['Error']['Code'] != 'EntityAlreadyExists':
        # Unknown failure: surface it here rather than leaving `iam_role`
        # undefined (or stale) for the cells that follow.
        raise
    iam_role = iam.get_role(RoleName=DWH_ROLE_NAME)
    print('Role gotten')

# Attach the S3 read-only policy on both paths, so a role that already existed
# without the policy (e.g. after an earlier partial run) still ends up with it.
# Attaching a managed policy that is already attached is expected to succeed.
iam.attach_role_policy(
    RoleName=DWH_ROLE_NAME,
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess'
)
print('Role policy attached successfully!')

Creating IAM role for Redshift cluster...
Role gotten


In [1]:
# ARN of the IAM role created (or fetched) in the IAM cell; it is handed to the
# Redshift cluster at creation time via the IamRoles argument.
role_arn = iam_role['Role']['Arn']
role_arn

### Build the Redshift cluster

In [4]:
# Instantiating redshift client
redshift = boto3.client('redshift', region_name='us-east-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

# Launch the cluster described by the DWH_* settings. Creation is asynchronous:
# the call returns while the cluster is still being provisioned.
try:
    print('Creating Redshift cluster...')
    redshift_cluster = redshift.create_cluster(
        DBName=DWH_DB_NAME,
        ClusterIdentifier=DWH_CLUSTER_ID,
        NodeType=DWH_NODE_TYPE,
        MasterUsername=DWH_USER_NAME,
        MasterUserPassword=DWH_USER_PASSWORD,
        NumberOfNodes=DWH_NUMBER_0F_NODES,
        # Honour the port from the config instead of silently relying on the
        # service default.
        Port=DWH_PORT,
        IamRoles=[
            role_arn,
        ]
    )
    print('Redshift cluster creation successful!')
except ClientError as e:
    # Re-running the cell against an existing cluster is fine; anything else
    # (bad node type, quota, invalid password, ...) must not be swallowed.
    if e.response['Error']['Code'] != 'ClusterAlreadyExists':
        raise
    print(e)

In [2]:
# Checking cluster availability status
# Block until the cluster reports "available": the endpoint is read from
# cluster_props in the next cell, and a Run All would otherwise reach that cell
# while the cluster is still being created.
redshift.get_waiter('cluster_available').wait(ClusterIdentifier=DWH_CLUSTER_ID)

cluster_props = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_ID)['Clusters'][0]
cluster_props['ClusterAvailabilityStatus'], cluster_props['ClusterStatus']

In [3]:
# Obtaining cluster endpoint
# NOTE(review): the 'Endpoint' entry is presumably only populated once the
# cluster is available -- confirm the status cell reports "available" first.
DWH_ENDPOINT = cluster_props['Endpoint']['Address']
# Overwrites the DWH_PORT read from the config with the port the cluster reports.
DWH_PORT = int(cluster_props['Endpoint']['Port'])
print('Endpoint: {}\nPort: {}'.format(DWH_ENDPOINT, DWH_PORT))

In [4]:
# cluster_vars = pd.DataFrame({
#     'keys':['ClusterIdentifier', 'NodeType', 'ClusterStatus', 'Endpoint:Address', 'Endpoint:Port', 'IamRole', 'Vpc', 'NumberOfNodes'], 
#     'values':[cluster_props['ClusterIdentifier'], cluster_props['NodeType'], cluster_props['ClusterStatus'], 
#               cluster_props['Endpoint']['Address'], cluster_props['Endpoint']['Port'], cluster_props['IamRoles'][0]['IamRoleArn'],
#               cluster_props['VpcId'], cluster_props['NumberOfNodes']]
# })

# cluster_vars

### Open incoming TCP port to access the cluster endpoint

In [11]:
# Get an EC2 resource handle (same region and credentials as the other clients);
# it is used below to edit the security group in the cluster's VPC.
ec2 = boto3.resource('ec2', region_name='us-east-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

In [12]:
# Allow inbound TCP traffic on the cluster port so the notebook can connect.
try:
    # Use the security group actually attached to the cluster. Taking the
    # first group listed for the VPC picks an arbitrary group, which may not
    # be the one guarding the cluster.
    cluster_sg_id = cluster_props['VpcSecurityGroups'][0]['VpcSecurityGroupId']
    default_sg = ec2.SecurityGroup(cluster_sg_id)
    print(default_sg)

    # NOTE(review): 0.0.0.0/0 opens the port to the whole internet. Acceptable
    # for a short-lived demo cluster; restrict to a known CIDR otherwise.
    # GroupName is not passed: the resource already targets the group by id,
    # and names are only valid for groups in a default VPC.
    default_sg.authorize_ingress(
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=DWH_PORT,
        ToPort=DWH_PORT
    )
except ClientError as e:
    # The rule already existing is the expected outcome on a re-run; any other
    # API error is re-raised instead of being silently printed.
    if e.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(e)

## ETL

In [13]:
# Load the `sql` IPython extension that provides the %sql / %%sql magics used
# in the cells below.
%load_ext sql

In [5]:
conn_string = 'postgresql://{}:{}@{}:{}/{}'.format(DWH_USER_NAME, DWH_USER_PASSWORD, DWH_ENDPOINT, DWH_PORT, DWH_DB_NAME)
%sql $conn_string

In [71]:
%%sql
-- Reset step. Removes both staging tables so the CREATE cells below start clean.
DROP TABLE IF EXISTS "staging_events";
DROP TABLE IF EXISTS "staging_songs";

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
Done.
Done.


[]

In [18]:
%%sql
-- Staging table for the raw event logs, one column per field loaded by the COPY below.
-- length is DOUBLE PRECISION so it has the same type as staging_songs.duration
-- and the two can be compared without a float width mismatch.
-- sessionId and userId are INTEGER because SMALLINT tops out at 32767.
DROP TABLE IF EXISTS "staging_events";
CREATE TABLE IF NOT EXISTS "staging_events" (
    "artist" VARCHAR,
    "auth" VARCHAR,
    "firstName" VARCHAR,
    "gender" VARCHAR,
    "itemInSession" SMALLINT,
    "lastName" VARCHAR,
    "length" DOUBLE PRECISION,
    "level" VARCHAR,
    "location" VARCHAR,
    "method" VARCHAR,
    "page" VARCHAR,
    "registration" DOUBLE PRECISION,
    "sessionId" INTEGER,
    "song" VARCHAR,
    "status" SMALLINT,
    "ts" BIGINT,
    "userAgent" VARCHAR,
    "userId" INTEGER
);

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
Done.
Done.


[]

In [19]:
%%sql
-- Bulk load the raw event logs from S3 into staging_events.
-- The JSON option points at a JSONPaths file that maps log fields onto the table columns.
-- The source bucket is read from us-west-2 while the cluster itself runs in us-east-2.
-- NOTE(review) the IAM role ARN is hard coded here and should match role_arn from the IAM section.
COPY "staging_events" FROM 's3://udacity-dend/log-data'
CREDENTIALS 'aws_iam_role=arn:aws:iam::451737047229:role/redshift_s3_readonly'
COMPUPDATE OFF REGION 'us-west-2'
JSON 's3://udacity-dend/log_json_path.json';

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
Done.


[]

In [25]:
%%sql
-- Staging table for the song metadata files.
-- artist_name and artist_location are VARCHAR(MAX) because the default VARCHAR width
-- of 256 was too short for some source rows. The stl_load_errors output recorded
-- further down shows both columns failing with String length exceeds DDL length.
DROP TABLE IF EXISTS "staging_songs";
CREATE TABLE IF NOT EXISTS "staging_songs" (
    "artist_id" VARCHAR,
    "artist_latitude" DECIMAL(18,12),
    "artist_location" VARCHAR(MAX),
    "artist_longitude" DECIMAL(18,12),
    "artist_name" VARCHAR(MAX),
    "duration" DOUBLE PRECISION,
    "num_songs" SMALLINT,
    "song_id" VARCHAR,
    "title" VARCHAR,
    "year" SMALLINT
);

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
Done.
Done.


[]

In [None]:
%%sql
-- Bulk load the song metadata from S3 into staging_songs.
-- With the auto ignorecase option the JSON keys are matched to column names without regard to case.
-- NOTE(review) the IAM role ARN is hard coded here and should match role_arn from the IAM section.
-- NOTE(review) the recorded run of this cell shows no completion output. Check stl_load_errors if the load fails.
COPY "staging_songs" FROM 's3://udacity-dend/song-data'
CREDENTIALS 'aws_iam_role=arn:aws:iam::451737047229:role/redshift_s3_readonly'
COMPUPDATE OFF REGION 'us-west-2'
JSON 'auto ignorecase';

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb


In [24]:
%%sql
-- Inspect rows rejected by COPY, newest first.
-- Each row names the source file, the offending column and the reason for the failure.
SELECT *
FROM "stl_load_errors"
ORDER BY "starttime" DESC;

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
4 rows affected.


userid,slice,tbl,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset
100,0,101722,2021-12-26 19:56:25.229299,13364,225,s3://udacity-dend/song-data/C/A/R/TRCARJQ128F425A389.json,1,artist_name,varchar,256,0,"{""song_id"": ""SOLAUEC12A8AE476BB"", ""num_songs"": 1, ""title"": ""Medication"", ""artist_name"": ""Spiritualized;Jason;Jason - Dulcimer/;Kate Radley - Vox continental/Farfisa/Tones/Drones/Tremeloes/;Sean Cook - Wha-monica/;Mark Refoy;Jon Mattock;Icon Hunt;Stuart Gordon;Balanescu Quartet;Rico;Rico - Tam Tam/;Bammie;Tim Sanders;Roddy Lorimar;Steve Sidwell;Chris Sharrack;Caroline Crawley;Marilyn McFarlane;Helen White"", ""artist_latitude"": null, ""year"": 0, ""duration"": 498.72934, ""artist_id"": ""ARVHQMD1269FB25AE7"", ""artist_longitude"": null, ""artist_location"": """"}",,1204,String length exceeds DDL length,0,0
100,7,101722,2021-12-26 19:56:25.229299,13364,225,s3://udacity-dend/song-data/B/K/O/TRBKOTN128F425A38C.json,1,artist_name,varchar,256,0,"{""song_id"": ""SOCCKQH12A8AE476C3"", ""num_songs"": 1, ""title"": ""Born Never Asked"", ""artist_name"": ""Spiritualized;Jason;Jason - Dulcimer/;Kate Radley - Vox continental/Farfisa/Tones/Drones/Tremeloes/;Sean Cook - Wha-monica/;Mark Refoy;Jon Mattock;Icon Hunt;Stuart Gordon;Balanescu Quartet;Rico;Rico - Tam Tam/;Bammie;Tim Sanders;Roddy Lorimar;Steve Sidwell;Chris Sharrack;Caroline Crawley;Marilyn McFarlane;Helen White"", ""artist_latitude"": null, ""year"": 0, ""duration"": 125.20444, ""artist_id"": ""ARVHQMD1269FB25AE7"", ""artist_longitude"": null, ""artist_location"": """"}",,1204,String length exceeds DDL length,0,0
100,4,101722,2021-12-26 19:56:25.229299,13364,225,s3://udacity-dend/song-data/C/W/V/TRCWVDW128F425A38A.json,1,artist_name,varchar,256,0,"{""song_id"": ""SOORTJE12A8AE476BD"", ""num_songs"": 1, ""title"": ""Electric Phase"", ""artist_name"": ""Spiritualized;Jason;Jason - Dulcimer/;Kate Radley - Vox continental/Farfisa/Tones/Drones/Tremeloes/;Sean Cook - Wha-monica/;Mark Refoy;Jon Mattock;Icon Hunt;Stuart Gordon;Balanescu Quartet;Rico;Rico - Tam Tam/;Bammie;Tim Sanders;Roddy Lorimar;Steve Sidwell;Chris Sharrack;Caroline Crawley;Marilyn McFarlane;Helen White"", ""artist_latitude"": null, ""year"": 0, ""duration"": 93.75302, ""artist_id"": ""ARVHQMD1269FB25AE7"", ""artist_longitude"": null, ""artist_location"": """"}",,1204,String length exceeds DDL length,0,0
100,3,101722,2021-12-26 19:56:25.229299,13364,225,s3://udacity-dend/song-data/A/Y/F/TRAYFUW128F428F618.json,1,artist_location,varchar,256,0,"{""song_id"": ""SORMAXQ12A8C139224"", ""num_songs"": 1, ""title"": ""Landmines"", ""artist_name"": ""St. Vincent"", ""artist_latitude"": 19.40904, ""year"": 2007, ""duration"": 307.53914, ""artist_id"": ""AR0JBXL1187FB52810"", ""artist_longitude"": -99.14977, ""artist_location"": ""ORDER &#039;ACTOR&#039; ON INSOUND: <a href=\\""http://www.insound.com/search/searchmain.jsp?query=st.+vincent+actor\\"" target=\\""_blank\\"" rel=\\""nofollow\\"" onmousedown='UntrustedLink.bootstrap($(this), \\""\\"", event)'>http://www.insound.com/search/searchmain.jsp?query=st.+vincent+actor</a>""}",,1204,String length exceeds DDL length,0,0


In [138]:
%%sql
-- Hand-written sample rows for exercising the staging_songs table.
-- A single quote inside a string literal is written as two single quotes, the
-- standard SQL form, instead of relying on backslash escaping.
INSERT INTO "staging_songs" ("song_id", "num_songs", "title", "artist_name", "artist_latitude", "year", "duration", "artist_id", "artist_longitude", "artist_location")
VALUES ('SOXZYWX12A6310ED0C', 1, 'It''s About Time', 'Jamie Cullum', 51.50632, 0, 246.9873, 'ARC1IHZ1187FB4E920', -0.12714, ''), 
       ('SODZYPO12A8C13A91E', 1, 'Burn My Body (Album Version)', 'Broken Spindles', null, 0, 177.99791, 'AR1C2IX1187B99BF74', null, ''), 
       ('SOQPWCR12A6D4FB2A3', 1, 'A Poor Recipe For Civic Cohesion', 'Western Addiction', 37.77916, 2005, 118.07302, 'AR73AIO1187B9AD57B', -122.42005, 'San Francisco, CA');

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
3 rows affected.


[]

In [17]:
%%sql
-- Peek at the rows currently held in staging_songs.
SELECT * 
FROM staging_songs;

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
1 rows affected.


artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
ARJNIUY12298900C91,,,,Adelitas Way,213.9424,1,SOBLFFE12AF72AA5BA,Scream,2009


In [127]:
%%sql
-- Looks for registration values larger than their BIGINT cast, presumably to spot fractional values.
-- NOTE(review) confirm whether the cast rounds or truncates before relying on an empty result.
SELECT *
FROM staging_events
WHERE registration > CAST(registration AS BIGINT);

 * postgresql://mike:***@sparkify-cluster.csc0efk6rxxc.us-east-2.redshift.amazonaws.com:5439/sparkifydb
0 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid


## Cleaning up

In [6]:
# response = redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_ID, SkipFinalClusterSnapshot=True)
# response

In [None]:
# iam.detach_role_policy(RoleName=DWH_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
# iam.delete_role(RoleName=DWH_ROLE_NAME)