# Steps for the ETL process

In [1]:
# Load the `sql` IPython extension; later cells rely on its %sql magic.
%load_ext sql

In [2]:
import pandas as pd 
import boto3 
import json
import psycopg2

### 1. Get the configurations to connect to Redshift Cluster

In [3]:
import configparser

# Parse dwh.cfg. The context manager closes the file handle as soon as parsing
# is done instead of leaving it open for the lifetime of the kernel.
config = configparser.ConfigParser()
with open('dwh.cfg') as cfg_file:
    config.read_file(cfg_file)

# NOTE(review): this unpacking is positional — it assumes the [CLUSTER] section
# holds exactly five options in the order host, db name, user, password, port.
# Confirm against dwh.cfg.
(HOST,DB_NAME, DB_USER, DB_PASSWORD, DB_PORT) = config['CLUSTER'].values()

# ARN of the IAM role that the COPY statements further down pass as credentials.
IAM_ROLE_ARN   = config.get("IAM_ROLE","ARN")

#### Connecting to Redshift Cluster

In [None]:
conn="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT,DB_NAME)
print(conn)
%sql $conn

### 2. Creating Tables
- Write the `CREATE` statements for all the tables in the `create_tables.py`
- Run `create_tables.py`

In [5]:
# Execute create_tables.py from within this notebook.
%run create_tables.py

##### Check the Sample data in S3

In [6]:
# Check which data objects exist in S3 for one log file and one song file.
# NOTE(review): KEY and SECRET are not assigned in any cell visible above this
# one — confirm where they are defined; on a fresh kernel this cell would
# otherwise fail with NameError.
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

sampleDbBucket =  s3.Bucket("udacity-dend")

# Print every object whose key starts with the given log-data prefix.
for obj in sampleDbBucket.objects.filter(Prefix="log_data/2018/11/2018-11-01"):
    print(obj)

# Print every object whose key starts with the given song-data key.
for obj in sampleDbBucket.objects.filter(Prefix="song_data/A/A/A/TRAAANK128F428B515.json"):
    print(obj)

s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAANK128F428B515.json')


### 3. Copying Sample data from S3 to the stage tables using <span style = "color:red"> COPY </span>
 - Write the `COPY` statements in the `sql_queries.py`
 ***NOTE:*** The following `COPY` statements load only a few records, for testing.

###  <span style = "color:orange"> ST_Events </span>

In [None]:
%%time
# Build the COPY statement for ST_Events. The three placeholders are filled, in
# order, with the S3 LOG_DATA setting, the IAM role ARN and the S3 LOG_JSONPATH
# setting.
# NOTE(review): the source and JSONPath placeholders are not quoted here, so
# the values in dwh.cfg presumably already carry their own single quotes —
# confirm.
qry = """
     COPY ST_Events from {} 
     credentials 'aws_iam_role={}' 
     format as json {}
     compupdate off
     region 'us-west-2';""".format(config.get("S3","LOG_DATA"),IAM_ROLE_ARN,config.get("S3","LOG_JSONPATH"))

# Run the statement through the %sql magic.
%sql $qry

### <span style = "color:orange"> ST_Songs </span>

In [None]:
%%time
# Build the COPY statement for ST_songs. The source is hard-coded to a single
# song file rather than read from the config; only the IAM role ARN is
# substituted.
qry = """
    COPY ST_songs from 's3://udacity-dend/song_data/A/A/A/TRAAANK128F428B515.json' 
     credentials 'aws_iam_role={}' 
     format json 'auto'
     compupdate off
     region 'us-west-2';
""".format(IAM_ROLE_ARN)

# Run the statement through the %sql magic.
%sql $qry
# Alternative source kept for reference: config.get("S3","SONG_DATA")

### 4. Inserting into Sparkify schema tables

In [20]:
# Open a psycopg2 connection and cursor for the remaining cells.
# NOTE(review): the DSN fields are filled positionally from the [CLUSTER]
# section, so this assumes its options are ordered host, dbname, user,
# password, port — confirm against dwh.cfg.
# This rebinds `conn`, which previously held the %sql connection URL string.
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
cur = conn.cursor()

In [None]:
from sql_queries import copy_table_queries, insert_table_queries
import re

# Run each statement in insert_table_queries, committing one at a time so that
# a failure in one statement does not discard the ones that already succeeded.
for query in insert_table_queries:
    # The table name is extracted only for the log message. Tolerate any
    # whitespace or casing, and fall back to a placeholder when the statement
    # does not match instead of raising IndexError.
    match = re.search(r'INSERT\s+INTO\s+(\S+)', query, flags=re.IGNORECASE)
    table_name = match.group(1) if match else "<unknown table>"
    try:
        cur.execute(query)
        conn.commit()
        print("'{}'  Insert Successful...!!!".format(table_name))
    except psycopg2.Error as e:
        # A failed statement leaves the transaction aborted; without a rollback
        # every following execute() on this connection would fail as well.
        conn.rollback()
        print("'{}'  Insert Failed: {}".format(table_name, e))

In [None]:
# Print one (user_id, count) row for every user_id that occurs more than once
# in users.
cur.execute("select * from (SELECT user_id,count(*) c FROM users group by user_id ) where c>1")
# fetchone() returns None once the result set is exhausted, which ends the loop.
for duplicate_row in iter(cur.fetchone, None):
    print(duplicate_row)

### 5. Steps for Deleting the duplicate records and Inserting with latest data in `Users` table

#### 5.1. Getting the userIds with duplicates

In [21]:
import pandas as pd

# Collect the user_ids that occur more than once in users.
# (The former bare `df.count` line was dropped: without parentheses it only
# referenced the method and did nothing.)
query = """select user_id from (SELECT user_id,count(*) c FROM users group by user_id ) where c>1"""
df = pd.read_sql_query(query, conn)
# Later cells use this tuple as the value list of their IN clauses.
user_ids = tuple(df['user_id'])
print("Duplicate recodrs for userIds :",user_ids)

Duplicate recodrs for userIds : ('15', '16', '29', '36', '49', '80', '85', '88')


##### Following is the query to get the most recent data of the user

In [None]:
# Latest raw event timestamp per duplicated user.
# user_ids is passed as a bound parameter: psycopg2 renders a Python tuple as a
# valid SQL IN list, whereas str.format() produced "('15',)" — invalid SQL —
# whenever exactly one id was present.
# NOTE: an empty user_ids tuple still cannot be used with IN; skip this cell if
# no duplicates were found.
query = """select MAX(ts) as max_time, \
            userId FROM ST_Events WHERE userId IN %s \
            GROUP BY userId ORDER BY userId"""
df = pd.read_sql_query(query, conn, params=[user_ids])
df

#####  Query with converted timestamp

In [None]:
# Latest event time per duplicated user, with the epoch-millisecond `ts`
# converted to a timestamp. (An earlier variant of this query additionally
# selected and grouped by level.)
# user_ids is passed as a bound parameter so psycopg2 renders a valid IN list
# even for a single id; str.format() on a one-element tuple yields "('15',)".
max_time = """select MAX(TIMESTAMP 'epoch' +  ts/1000 * interval '1 second') as max_time, \
                userId FROM ST_Events WHERE userId IN %s \
                GROUP BY userId ORDER BY userId"""
recent_time_df = pd.read_sql_query(max_time, conn, params=[user_ids])
recent_time_df

#### 5.2. Following snippet is to fetch the most recent data of the user

In [23]:
# Sub-query: latest event timestamp for each duplicated user.
max_ts = """SELECT userId, MAX(ts) AS max_time
            FROM ST_Events
            WHERE userId IN %s
            GROUP BY userId"""

# Fetch each duplicated user's attributes from the event that carries that
# user's own latest timestamp.
#  * The join is on (userId, ts). The previous "ts IN (...) AND userId IN (...)"
#    filter also matched rows whose ts merely equalled another user's maximum.
#  * DISTINCT collapses identical rows when several events share the latest ts,
#    so the re-insert in 5.4 does not recreate duplicates.
#  * user_ids is bound as a parameter so a single-element tuple still renders
#    as a valid IN list.
updated_level = """SELECT DISTINCT e.userId,
                          e.firstName,
                          e.lastName,
                          e.gender,
                          e.level
                   FROM ST_Events e
                   JOIN ({}) latest
                     ON e.userId = latest.userId
                    AND e.ts = latest.max_time
                   ORDER BY e.userId""".format(max_ts)
cur.execute(updated_level, (user_ids,))
recent_level = cur.fetchall()
recent_level

[('15', 'Lily', 'Koch', 'F', 'paid'),
 ('16', 'Rylan', 'George', 'M', 'paid'),
 ('29', 'Jacqueline', 'Lynch', 'F', 'paid'),
 ('36', 'Matthew', 'Jones', 'M', 'paid'),
 ('49', 'Chloe', 'Cuevas', 'F', 'paid'),
 ('80', 'Tegan', 'Levine', 'F', 'paid'),
 ('85', 'Kinsley', 'Young', 'F', 'paid'),
 ('88', 'Mohammad', 'Rodriguez', 'M', 'paid')]

#### Checking for the duplicate records

In [35]:
# Re-run the duplicate check on users: one (user_id, count) row per user_id
# that occurs more than once.
cur.execute("select * from (SELECT user_id,count(*) c FROM users group by user_id ) where c>1")
cur.fetchall()

[('15', 2),
 ('16', 2),
 ('29', 2),
 ('36', 2),
 ('49', 2),
 ('80', 2),
 ('85', 2),
 ('88', 2)]

#### 5.3. <span style = "color:orange">Deleting </span>the duplicate records

In [36]:
# Remove every row of the duplicated users. user_ids is bound as a parameter so
# psycopg2 renders a valid IN list; str.format() on a one-element tuple
# produced "('15',)", which is not valid SQL.
# This cell does not commit: the DELETE stays pending in the current
# transaction until conn.commit() is called.
delete_query = "DELETE FROM users WHERE user_id IN %s"
cur.execute(delete_query, (user_ids,))

#### 5.4. Inserting with the latest data

In [37]:
users_insert_query = "INSERT INTO users (user_id, first_name,last_name,gender, level) VALUES  (%s, %s,%s,%s, %s)"

# Re-insert one row per user from recent_level, then commit. The commit also
# makes any still-pending statements on this connection (such as the preceding
# DELETE) durable, so delete and re-insert land together.
try:
    for row in recent_level:
        cur.execute(users_insert_query,row)
    conn.commit()
except psycopg2.Error:
    # Undo the whole pending transaction so users is not left half-updated,
    # then surface the error.
    conn.rollback()
    raise

In [38]:
# Show the rows now stored for the previously duplicated users.
print(user_ids)
# user_ids is bound as a parameter so a single-element tuple still renders as a
# valid SQL IN list.
query = """SELECT * FROM users WHERE user_id IN %s;"""
cur.execute(query, (user_ids,))
cur.fetchall()

('15', '16', '29', '36', '49', '80', '85', '88')


[('15', 'Lily', 'Koch', 'F', 'paid'),
 ('16', 'Rylan', 'George', 'M', 'paid'),
 ('29', 'Jacqueline', 'Lynch', 'F', 'paid'),
 ('36', 'Matthew', 'Jones', 'M', 'paid'),
 ('49', 'Chloe', 'Cuevas', 'F', 'paid'),
 ('80', 'Tegan', 'Levine', 'F', 'paid'),
 ('85', 'Kinsley', 'Young', 'F', 'paid'),
 ('88', 'Mohammad', 'Rodriguez', 'M', 'paid')]

In [None]:
# VACUUM cannot run inside a transaction block. Commit whatever is still
# pending first, so switching modes cannot discard it, then run the command in
# autocommit mode.
conn.commit()
old_isolation_level = conn.isolation_level
print("old_isolation_level :",old_isolation_level)
previous_autocommit = conn.autocommit
conn.autocommit = True
try:
    # The command is spelled VACUUM; the former "VACCUM" was a syntax error.
    cur.execute("VACUUM FULL users TO 100 PERCENT;")
finally:
    # Put the connection back into its previous transactional mode.
    conn.autocommit = previous_autocommit