# Create Cluster and Role

In [2]:
from create_cluster import ConfigureRedshift
%load_ext autoreload
%autoreload 2

import logging
logging.basicConfig(level=logging.INFO)

# Reads config to create cluster and IAM Role
configurer = ConfigureRedshift("./aws.cfg")

In [2]:
# create the IAM role and keep what create_role() returns
# (presumably the role ARN: the log below ends with "Get the IAM role ARN")
arn = configurer.create_role()

INFO:spam.auxiliary.ConfigureRedshift:1.1 Creating a new IAM Role
INFO:spam.auxiliary.ConfigureRedshift:1.2 Attaching Policy
INFO:spam.auxiliary.ConfigureRedshift:1.3 Get the IAM role ARN


In [3]:
# create cluster; the value returned by the call is kept for inspection
# (variable name fixed: it was misspelled `custer_response`)
cluster_response = configurer.create_redshift_cluster()

INFO:spam.auxiliary.ConfigureRedshift:CREATE CLUSTER


In [3]:
# get cluster information and format it in a DataFrame
# (the rendered frame has a Key and a Value column, one row per property)
props = configurer.get_cluster_props()
format_props = configurer.prettyRedshiftProps(props)
# last expression of the cell, so the frame is displayed as the cell output
format_props

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.cakoxduzammt.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-07ac452326e3d6af5
7,NumberOfNodes,2


In [4]:
# Add TCP inbound to the redshift cluster
# NOTE(review): the logged rule is "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439",
# i.e. the port is opened to any IP address; consider restricting the CIDR.
# When the rule already exists, the call logs InvalidPermission.Duplicate as an ERROR
# (no traceback appears in the cell output).
configurer.create_tcp_cluster()

INFO:spam.auxiliary.ConfigureRedshift:Add TCP to Cluster
ERROR:spam.auxiliary.ConfigureRedshift:An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


# Data Quality

Before executing this part, make sure to create and load the tables by running:

```bash
python -m create_tables
python -m etl
```

In [5]:
# Run the create_tables module in a shell; per its log it drops and then creates the tables
!python -m create_tables

INFO:root:Dropping Tables...
INFO:root:Creating Tables...


In [6]:
# Run the etl module in a shell; per its log it loads the staging tables and then inserts into the final tables
!python -m etl

INFO:root:Staging Tables...
INFO:root:Inserting Tables...


### Connect to Redshift

In [20]:
import configparser
import psycopg2
import pandas as pd

# creates config from file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so fail early with a clear message instead of a
# confusing KeyError on the 'CLUSTER' section further down
if not config.read('dwh.cfg'):
    raise FileNotFoundError("Could not read configuration file 'dwh.cfg'")

# gets the important parameters to connect to the Redshift
DWH_DB_USER = config['CLUSTER']['DB_USER']
DWH_DB_PASSWORD = config['CLUSTER']['DB_PASSWORD']
DWH_ENDPOINT = config['CLUSTER']['HOST']
DWH_PORT = config['CLUSTER']['DB_PORT']
DWH_DB = config['CLUSTER']['DB_NAME']

# creates the connection to execute queries in the data warehouse.
# The parameters are passed by name rather than by formatting a DSN string
# from config['CLUSTER'].values(): that form silently depended on the order
# of the keys in dwh.cfg and broke on values containing spaces.
conn = psycopg2.connect(
    host=DWH_ENDPOINT,
    dbname=DWH_DB,
    user=DWH_DB_USER,
    password=DWH_DB_PASSWORD,
    port=DWH_PORT,
)
cur = conn.cursor()


### Total Rows per Table

In [21]:

# names of every table queried in the dwh
tables = ["staging_events", "staging_songs", "songplay",
          "users", "songs", "artists", "time"]

# run count(*) once per table and collect the results as
# one single-row column per table
row_counts = {}
for table in tables:
    result = pd.read_sql(f"""SELECT count(*) FROM {table}""", conn)
    row_counts[table] = [result.iloc[0, 0]]
df = pd.DataFrame(row_counts)

# render the counts with "_" as the thousands separator
df.style.format("{:_}".format)


Unnamed: 0,staging_events,staging_songs,songplay,users,songs,artists,time
0,8_056,14_896,319,105,14_896,10_025,8_023


### Table Duplicates

Since the tables do not have many rows, it is possible to select all columns of each table and count the duplicated rows in pandas.

In [22]:
tables = ["songplay", "users", "songs", "artists", "time"]

# load each table in full and count the rows that repeat an earlier row
# in every column; one single-row column per table
duplicate_counts = {}
for table in tables:
    print(table)
    rows = pd.read_sql(f"""SELECT * FROM {table}""", conn)
    duplicate_counts[table] = [rows.duplicated().sum()]
df = pd.DataFrame(duplicate_counts)

# render the counts with "_" as the thousands separator
df.style.format("{:_}".format)

songplay
users
songs
artists
time


Unnamed: 0,songplay,users,songs,artists,time
0,0,0,0,0,0


### Tables sample

In [23]:
# preview five rows of songplay (no ORDER BY, so the choice of rows is arbitrary)
df = pd.read_sql("SELECT * FROM songplay LIMIT 5;", conn)
df

Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
0,124,2018-11-01 21:11:13.796,8,free,SOEIQUY12AF72A086A,ARHUC691187B9AD27F,139,"Phoenix-Mesa-Scottsdale, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"""
1,224,2018-11-02 16:35:00.796,50,free,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,156,"New Haven-Milford, CT","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
2,259,2018-11-03 01:12:26.796,53,free,SOSELMV12A6D4FCF5A,ARWD25M1187FB4C563,52,"Klamath Falls, OR","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.103 Safari/537.36"""
3,6,2018-11-04 16:25:54.796,69,free,SOARUPP12AB01842E0,ARD46C811C8A414F3F,235,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
4,172,2018-11-04 19:35:15.796,73,paid,SOULTKQ12AB018A183,ARKQQZA12086C116FC,72,"Tampa-St. Petersburg-Clearwater, FL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""


In [24]:
# preview five rows of users (no ORDER BY, so the choice of rows is arbitrary)
df = pd.read_sql("SELECT * FROM users LIMIT 5;", conn)
df

Unnamed: 0,user_id,first_name,last_name,gender,level
0,10,Sylvie,Cruz,F,free
1,95,Sara,Johnson,F,paid
2,54,Kaleb,Cook,M,free
3,37,Jordan,Hicks,F,free
4,16,Rylan,George,M,free


In [25]:
# preview five rows of songs (no ORDER BY, so the choice of rows is arbitrary)
df = pd.read_sql("SELECT * FROM songs LIMIT 5;", conn)
df

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOAPERH12A58A787DC,The One And Only (Edited),ARZ5H0P1187B98A1DD,0,230.42567
1,SOUJRGA12A6310EC37,Bad Omen (24-Bit Digitally Remastered 04) (),AR98JLC1187B9ADE23,0,243.06893
2,SOIOVAN12AAF3B50A7,Maronna Nera,ARHEOP21187B99AEF6,0,205.322
3,SOKFMQE12A6D4F41B7,Tina (I Held You In My Arms) (1989 Digital Remaster),AR1G6WM1187FB5C014,0,125.46567
4,SODMVJR12A6D4F985D,If I...,ARDI88R1187B98DAB2,0,222.92853


In [26]:
# preview five rows of artists (no ORDER BY, so the choice of rows is arbitrary)
df = pd.read_sql("SELECT * FROM artists LIMIT 5;", conn)
df

Unnamed: 0,artist_id,name,location,latitude,longitude
0,ARTW7I31187B9A4CA9,!!!,,,
1,AR2ZNXL1187B9B87A1,1 Giant Leap,,,
2,AR4WA5Y1187B9AC434,1000 Homo DJs,"Chicago, IL",,
3,AR9GUZF1187FB4D1BC,10000 Maniacs,"Jamestown, NY",,
4,AR30QTD1187FB3DAD3,1090 Club,"Billings, MT",45.0,-108.50552


In [27]:
# preview five rows of time (no ORDER BY, so the choice of rows is arbitrary)
df = pd.read_sql("SELECT * FROM time LIMIT 5;", conn)
df

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,2018-11-01 20:57:10.796,20,1,44,11,2018,4
1,2018-11-01 21:01:46.796,21,1,44,11,2018,4
2,2018-11-01 21:02:12.796,21,1,44,11,2018,4
3,2018-11-01 21:05:52.796,21,1,44,11,2018,4
4,2018-11-01 21:08:16.796,21,1,44,11,2018,4


## Analytics Query Example

In [38]:
# selecting all the songs of Red Hot Chili Peppers together with
# the song id and artist id; songs are joined to artists on artist_id
# and the artist name is compared case-insensitively via lower()
query = """
    SELECT s.title
         , s.song_id
         , a.artist_id
         , a.name as artist
         , s.year
         , s.duration as song_length
    FROM songs as s
    JOIN artists a ON s.artist_id = a.artist_id
    WHERE lower(a.name) = 'red hot chili peppers';
"""
df = pd.read_sql(query, conn)
df


Unnamed: 0,title,song_id,artist_id,artist,year,song_length
0,Emit Remmus (Album Version),SONAEER12A67020459,ARE8GLF1187FB52532,Red Hot Chili Peppers,1999,240.19546
1,Get On Top (Album Version),SOPNFTX12A67020455,ARE8GLF1187FB52532,Red Hot Chili Peppers,1999,198.05995
2,The Greeting Song (Album Version),SOQLJFO12A6D4F7503,ARE8GLF1187FB52532,Red Hot Chili Peppers,1991,193.43628
3,Naked In The Rain (Album Version),SOBANAT12A6D4F7501,ARE8GLF1187FB52532,Red Hot Chili Peppers,1991,265.63873
4,Venice Queen (Album Version),SOAZIRE12A6702046F,ARE8GLF1187FB52532,Red Hot Chili Peppers,2002,367.01995


In [47]:
# counting, per year, the number of unique users that played songs
# from a given artist_id; songplay is joined to time on start_time
# to obtain the year of each play
query = """
    SELECT t.year, 
           count(distinct s.user_id) as nunique_users
    FROM songplay as s
    JOIN time as t ON s.start_time = t.start_time
    WHERE s.artist_id = 'AR5E44Z1187B9A1D74'
    GROUP BY t.year;
"""
df = pd.read_sql(query, conn)
df


Unnamed: 0,year,nunique_users
0,2018,22


### Closing Connection

In [48]:
# close the connection to the data warehouse once the queries are done
conn.close()

# Delete Cluster and Role

In [17]:
# delete the Redshift cluster that was created at the top of this notebook
configurer.delete_cluster()

INFO:spam.auxiliary.ConfigureRedshift:Deleting Cluster


In [18]:
# delete the IAM role that was created at the top of this notebook
configurer.delete_role()

INFO:spam.auxiliary.ConfigureRedshift:Deleting Role
