In [1]:
import pandas as pd
import boto3
import json
import configparser

# Load parameters

In [2]:
# Parse dwh.cfg and expose every setting the notebook needs as a module-level constant.
config = configparser.ConfigParser()
config.read('dwh.cfg')

# AWS credentials used to build the boto3 clients.
KEY, SECRET = (config.get('AWS', option) for option in ('KEY', 'SECRET'))

# Cluster hardware settings (all values are read as strings).
DWH_CLUSTER_TYPE, DWH_NODE_TYPE, DWH_NUM_NODES = (
    config.get('DWH', option)
    for option in ('DWH_CLUSTER_TYPE', 'DWH_NODE_TYPE', 'DWH_NUM_NODES')
)

# Database identity and login details.
DB_NAME = config.get('CLUSTER', 'DB_NAME')
DWH_CLUSTER_IDENTIFIER = config.get('DWH', 'DWH_CLUSTER_IDENTIFIER')
DB_USER, DB_PASSWORD, DB_PORT = (
    config.get('CLUSTER', option)
    for option in ('DB_USER', 'DB_PASSWORD', 'DB_PORT')
)

# Name of the IAM role created and attached to the cluster later on.
DWH_IAM_ROLE_NAME = config.get('DWH', 'DWH_IAM_ROLE_NAME')

# Create clients for EC2, S3, IAM and Redshift

In [3]:
# boto3 is already imported in the first cell, so it is not re-imported here.
# Every handle below shares one region and one credential pair; keeping them in a
# single mapping stops the four copies from drifting apart.
AWS_REGION = "us-west-2"
AWS_CLIENT_KWARGS = {
    "region_name": AWS_REGION,
    "aws_access_key_id": KEY,
    "aws_secret_access_key": SECRET,
}

# Resource-style handles for EC2 and S3.
ec2 = boto3.resource('ec2', **AWS_CLIENT_KWARGS)
s3 = boto3.resource('s3', **AWS_CLIENT_KWARGS)

# Low-level clients for IAM and Redshift.
iam = boto3.client('iam', **AWS_CLIENT_KWARGS)
redshift = boto3.client('redshift', **AWS_CLIENT_KWARGS)

## Sample buckets

In [4]:
# Handle to the 'udacity-dend' bucket.
sampleDbBucket = s3.Bucket('udacity-dend')

# Print one summary per object whose key starts with 'log_data'.
logDataObjects = sampleDbBucket.objects.filter(Prefix='log_data')
for obj in logDataObjects:
    print(obj)

s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-02-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-03-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-04-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-05-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-06-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-07-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-08-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-09-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-10-events.json')
s3.ObjectSummary(b

# Create IAM Role

In [5]:
from botocore.exceptions import ClientError

# 1.1 Create the role; its trust policy lets the Redshift service assume it.
try:
    print("1.1 Creating a new IAM Role")
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description="Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
                            'Effect': 'Allow',
                            'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )
except ClientError as e:
    # Re-running this cell against an existing role is harmless, so report it and
    # carry on. Any other failure (bad credentials, denied access, ...) must stop
    # the notebook here instead of surfacing later as a confusing follow-up error.
    if e.response['Error']['Code'] != 'EntityAlreadyExists':
        raise
    print(e)


# 1.2 Grant the role read-only access to S3 via the AWS-managed policy.
print("1.2 Attaching Policy")
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")

# 1.3 Look up the role's ARN; the cluster-creation cell passes it as IamRoles.
print("1.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

1.1 Creating a new IAM Role
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::030982041430:role/dwhRole


# Create Redshift cluster

In [6]:
# Request the Redshift cluster described by the settings loaded from dwh.cfg.
try:
    response = redshift.create_cluster(
        # Hardware
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        # Identifiers & credentials
        DBName=DB_NAME,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,
        # The connection string built later uses DB_PORT, so the cluster must
        # listen on that same port rather than on the service default.
        Port=int(DB_PORT),

        # Role (with S3 read access) that the cluster may assume
        IamRoles=[roleArn]
    )
except ClientError as e:
    # An already-existing cluster is fine when the cell is re-run; every other
    # error is re-raised so the notebook does not continue without a cluster.
    if e.response['Error']['Code'] != 'ClusterAlreadyExists':
        raise
    print(e)

## *Describe* the cluster to see its status

In [10]:
def prettyRedshiftProps(props):
    """Summarise a cluster description as a two-column DataFrame.

    Args:
        props: Mapping of cluster property names to values.

    Returns:
        A DataFrame with columns "Key" and "Value" holding only the selected
        properties, in the order they appear in ``props``.

    Side effect: sets pandas' ``display.max_colwidth`` option to ``None`` so long
    values are not truncated when displayed.
    """
    pd.set_option('display.max_colwidth', None)

    wantedKeys = ("ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername",
                  "DBName", "Endpoint", "NumberOfNodes", 'VpcId')

    selectedRows = []
    for propName, propValue in props.items():
        if propName in wantedKeys:
            selectedRows.append((propName, propValue))

    return pd.DataFrame(data=selectedRows, columns=["Key", "Value"])

# Cluster creation is asynchronous. Block until the cluster is available so this
# cell does not have to be re-run by hand and the following cell can read
# myClusterProps['Endpoint'] on a plain Run All.
redshift.get_waiter('cluster_available').wait(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)

# Keep the description of the (single) matching cluster for later cells and show a summary.
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0f638647f8c8f6869
7,NumberOfNodes,4


## Get endpoint and ARN

In [11]:
# Pull the host address and the first attached role's ARN out of the cluster description.
endpointInfo = myClusterProps['Endpoint']
DWH_ENDPOINT = endpointInfo['Address']
firstIamRole = myClusterProps['IamRoles'][0]
DWH_ROLE_ARN = firstIamRole['IamRoleArn']

print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::030982041430:role/dwhRole


# Connect to the cluster

In [12]:
# Load the `sql` IPython extension; the %sql / %%sql magics in the cells below rely on it.
%load_ext sql

In [None]:
conn_string ="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD,
                                                 DWH_ENDPOINT, DB_PORT,
                                                 DB_NAME)
%sql $conn_string

# Create the tables

In [15]:
# Execute the create_tables.py script from inside the notebook.
# NOTE(review): presumably it (re)creates the staging and analytics tables — see the script.
%run "create_tables.py"

In [17]:
# Check that the tables have been created: this SELECT only succeeds if the
# artists table exists. No rows are expected yet because the load runs in a later cell.
%sql SELECT * FROM artists;

 * postgresql://dwhuser:***@dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com:5439/dwh
0 rows affected.


artist_id,name,location,latitude,longitude


# Load data into staging and analytics tables

In [18]:
# Execute the etl.py script from inside the notebook.
# NOTE(review): presumably it loads the staging tables and then fills the analytics tables — see the script.
%run "etl.py"

# Data Quality Checks

In [19]:
# Check for any loading errors by listing every row of stl_load_errors.
# An empty result means no load errors were recorded.
%sql select * from stl_load_errors;

 * postgresql://dwhuser:***@dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com:5439/dwh
0 rows affected.


userid,slice,tbl,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset


In [20]:
# Show the rows of songplays, ordered by songplay_id.
%sql select * from songplays order by songplay_id;

 * postgresql://dwhuser:***@dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com:5439/dwh
319 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
0,2018-11-30 10:42:09,36,paid,SOTNHIP12AB0183131,ARD46C811C8A414F3F,998,"Janesville-Beloit, WI","""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
1,2018-11-24 13:55:51,95,paid,SOARUPP12AB01842E0,ARD46C811C8A414F3F,564,"Winston-Salem, NC","""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53"""
2,2018-11-16 16:27:21,90,free,SOMUJKC12AB01865AD,AR9RYZP1187FB36C6A,148,"Pensacola-Ferry Pass-Brent, FL",Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0
3,2018-11-28 08:18:57,58,paid,SOJWCWM12A8C13B664,ARM6T8I1187FB36CC8,887,"Augusta-Richmond County, GA-SC","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
4,2018-11-23 18:11:01,86,free,SOQDMXT12A6D4F8255,ART5MUE1187B98C961,869,"La Crosse-Onalaska, WI-MN","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
5,2018-11-19 08:32:12,24,paid,SOSMTXQ12A6D4F721D,ARS927Z1187B9ACA29,672,"Lake Havasu City-Kingman, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
6,2018-11-15 14:31:08,97,paid,SOVOZSC12A8C144E73,ART0ETO1187B9AB519,605,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
7,2018-11-14 06:19:41,80,paid,SOACRBY12AB017C757,ARVGCRM11F50C496F4,548,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
8,2018-11-29 02:36:13,54,free,SOTNHIP12AB0183131,ARD46C811C8A414F3F,951,"Yuba City, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0
9,2018-11-23 15:29:23,53,free,SOARUPP12AB01842E0,ARD46C811C8A414F3F,860,"Klamath Falls, OR","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.103 Safari/537.36"""


In [27]:
%%sql
-- Row count per user_id in users, smallest counts first, top five only.
-- NOTE(review): ascending order surfaces the lowest counts; if the goal is to spot
-- duplicated user_ids, ORDER BY COUNT(*) DESC would show them first -- confirm intent.
SELECT user_id,
       COUNT(*)
FROM users
GROUP BY user_id
ORDER BY COUNT(*)
LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


user_id,count
22,1
30,1
23,1
41,1
24,1


In [28]:
%%sql
-- Peek at five rows of the artists table.
SELECT *
FROM artists
LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


artist_id,name,location,latitude,longitude
ARZJIAN1187FB4CF8D,Alpinestars,,,
ARNXMP21187B9A8B4D,The Mahavishnu Orchestra,,,
AR57POM1187B992EDD,The Johnny Dankworth Seven,"Southall, Middlesex, England",51.51138,-0.37833
ARFWXAS1187FB38486,Sinergia,,,
AR7NTTO1187B995A0C,Manhattan Transfer,"New York, NY",40.71455,-74.00712


In [26]:
%%sql
-- Number of plays per (level, song_id) pair, most-played first.
SELECT level,
       song_id,
       COUNT(*)
FROM songplays
GROUP BY (level, song_id)
ORDER BY COUNT(*) DESC;

 * postgresql://dwhuser:***@dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com:5439/dwh
223 rows affected.


level,song_id,count
paid,SOBONKR12A58A7A7E0,29
paid,SOUNZHU12A8AE47481,8
free,SOBONKR12A58A7A7E0,8
paid,SOULTKQ12AB018A183,8
paid,SOHTKMO12AB01843B0,5
paid,SOLZOBD12AB0185720,5
paid,SOIZLKI12A6D4F7B61,4
paid,SOTNHIP12AB0183131,4
paid,SOIOESO12A6D4F621D,4
paid,SONQEYS12AF72AABC9,4


# Delete cluster

In [29]:
# Request deletion of the cluster identified by DWH_CLUSTER_IDENTIFIER.
# SkipFinalClusterSnapshot=True asks AWS not to take a final snapshot first.
# The call's response is the cell's output.
redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
                      SkipFinalClusterSnapshot=True)

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'dwhuser',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.ccghvw7gmopq.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2022, 9, 29, 12, 17, 59, 812000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-0cfe6ea2c880b7d34',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-0f638647f8c8f6869',
  'AvailabilityZone': 'us-west-2a',
  'PreferredMaintenanceWindow': 'sat:12:00-sat:12:30',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  '

In [34]:
# Deletion is asynchronous: block until AWS reports the cluster gone, so this
# cell does not have to be re-run by hand while the status is still 'deleting'.
redshift.get_waiter('cluster_deleted').wait(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)

# Confirm the result. "Cluster not found" is the expected outcome after a delete,
# so it is shown as the cell's result instead of leaving a raised exception in the
# notebook.
try:
    myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER
                                                )['Clusters'][0]
    deletionCheck = prettyRedshiftProps(myClusterProps)
except redshift.exceptions.ClusterNotFoundFault as e:
    deletionCheck = str(e)

deletionCheck

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster dwhcluster not found.

# Delete IAM Role

In [33]:
# Clean up the IAM role: detach the S3 read-only managed policy, then delete the role.
# NOTE(review): the detach comes first presumably because IAM refuses to delete a role
# that still has managed policies attached — keep this order.
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
# The delete_role response is the cell's output.
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)

{'ResponseMetadata': {'RequestId': '796c21c8-78be-4dfe-bc58-d8a4e9a3d990',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '796c21c8-78be-4dfe-bc58-d8a4e9a3d990',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Thu, 29 Sep 2022 12:31:13 GMT'},
  'RetryAttempts': 0}}