# Project: Data Warehouse

## How to use this notebook

- Create user, roles, security groups and Redshift cluster
- Fill the variables in dwh.cfg
- Run all the cells below

### Parse the config file

In [22]:
import pandas as pd
import boto3
import json

In [23]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY = config.get('AWS','KEY')
SECRET = config.get('AWS','SECRET')

HOST = config.get("CLUSTER","HOST")
DB_NAME = config.get("CLUSTER","DB_NAME")
DB_USER = config.get("CLUSTER","DB_USER")
DB_PASSWORD = config.get("CLUSTER","DB_PASSWORD")
DB_PORT = config.get("CLUSTER","DB_PORT")

IAM_ROLE = config.get("IAM_ROLE", "ARN")

### Test if the S3 connection works

In [24]:
import boto3

s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

In [25]:
bucket=s3.Bucket('udacity-dend')
log_data_files = [filename.key for filename in bucket.objects.filter(Prefix='log-data')]
log_data_files[:10]

['log-data/',
 'log-data/2018/11/2018-11-01-events.json',
 'log-data/2018/11/2018-11-02-events.json',
 'log-data/2018/11/2018-11-03-events.json',
 'log-data/2018/11/2018-11-04-events.json',
 'log-data/2018/11/2018-11-05-events.json',
 'log-data/2018/11/2018-11-06-events.json',
 'log-data/2018/11/2018-11-07-events.json',
 'log-data/2018/11/2018-11-08-events.json',
 'log-data/2018/11/2018-11-09-events.json']

In [26]:
song_data_files = [filename.key for filename in bucket.objects.filter(Prefix='song-data/A/A')]
song_data_files[:10]

['song-data/A/A/A/TRAAAAK128F9318786.json',
 'song-data/A/A/A/TRAAAAV128F421A322.json',
 'song-data/A/A/A/TRAAABD128F429CF47.json',
 'song-data/A/A/A/TRAAACN128F9355673.json',
 'song-data/A/A/A/TRAAAEA128F935A30D.json',
 'song-data/A/A/A/TRAAAED128E0783FAB.json',
 'song-data/A/A/A/TRAAAEM128F93347B9.json',
 'song-data/A/A/A/TRAAAEW128F42930C0.json',
 'song-data/A/A/A/TRAAAFD128F92F423A.json',
 'song-data/A/A/A/TRAAAGR128F425B14B.json']

### Connect to DB

In [27]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [28]:
import os 
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

postgresql://****@redshift-cluster.ciepxrdcqquc.us-west-2.redshift.amazonaws.com:5439/dev


'Connected: awsuser@dev'

### Drop and Create Tables

In [29]:
%run create_tables.py

### Load data from S3 to Staging Tables and Insert to Final Tables

In [30]:
%run etl.py

In [31]:
### Run Queries to Test

In [32]:
%%sql
count_user << select count(*) as total_user from dim_users

 * postgresql://awsuser:***@redshift-cluster.ciepxrdcqquc.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
Returning data to local variable count_user


In [33]:
%%sql
count_song << select count(*) as total_song from dim_songs

 * postgresql://awsuser:***@redshift-cluster.ciepxrdcqquc.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
Returning data to local variable count_song


In [34]:
%%sql
count_artist << select count(*) as total_artist from dim_artists

 * postgresql://awsuser:***@redshift-cluster.ciepxrdcqquc.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
Returning data to local variable count_artist


In [35]:
%%sql
count_time << select count(*) as total_time from dim_time

 * postgresql://awsuser:***@redshift-cluster.ciepxrdcqquc.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
Returning data to local variable count_time


In [36]:
%%sql
count_songplay << select count(*) as total_songplay from fact_songplays

 * postgresql://awsuser:***@redshift-cluster.ciepxrdcqquc.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
Returning data to local variable count_songplay


In [37]:
print("Total User = " + str(count_user))
print("Total Song = " + str(count_song))
print("Total Artist = " + str(count_artist))
print("Total Time = " + str(count_time))
print("Total Songplay = " + str(count_songplay))

Total User = +------------+
| total_user |
+------------+
|    105     |
+------------+
Total Song = +------------+
| total_song |
+------------+
|   14896    |
+------------+
Total Artist = +--------------+
| total_artist |
+--------------+
|    10025     |
+--------------+
Total Time = +------------+
| total_time |
+------------+
|    8023    |
+------------+
Total Songplay = +----------------+
| total_songplay |
+----------------+
|      326       |
+----------------+


In [38]:
%%sql
stag_event << select count(*) as stag_event from staging_events

 * postgresql://awsuser:***@redshift-cluster.ciepxrdcqquc.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
Returning data to local variable stag_event


In [39]:
%%sql
stag_song << select count(*) as stag_song from staging_songs

 * postgresql://awsuser:***@redshift-cluster.ciepxrdcqquc.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
Returning data to local variable stag_song


In [40]:
stag_event

stag_event
8056


In [41]:
stag_song

stag_song
14896
