This notebook checks that the data looks correct in the tabes and runs some sample analytics queries.

In [1]:
import configparser
import psycopg2

In [2]:
config = configparser.ConfigParser()

In [3]:
# reload config file with ARN and endpoint updated
config.read_file(open('dwh.cfg'))

In [4]:
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
cur = conn.cursor()

In [5]:
tables = ['staging_events', 'staging_songs', 'songplays', 'users', 'songs', 'artists', 'time']

In [6]:
for t in tables:
    print('table: {}'.format(t))
    query = 'SELECT COUNT(*) from {};'.format(t)
    cur.execute(query)
    print('number of rows: ', cur.fetchone()[0])
    query = 'SELECT * from {} LIMIT 5;'.format(t)
    cur.execute(query)
    for c in cur.fetchall():
        print(c)
    
    print('\n')

table: staging_events
number of rows:  8056
(None, 'Logged In', 'Adler', 'M', 0, 'Barrera', None, 'free', 'New York-Newark-Jersey City, NY-NJ-PA', 'GET', 'Home', Decimal('1540835983796'), 248, None, 200, 1541470364796, '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"', 100)
('Gustavo Cerati', 'Logged In', 'Adler', 'M', 1, 'Barrera', Decimal('249'), 'free', 'New York-Newark-Jersey City, NY-NJ-PA', 'PUT', 'NextSong', Decimal('1540835983796'), 248, 'Uno Entre 1000', 200, 1541470383796, '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"', 100)
('Limp Bizkit', 'Logged In', 'Adler', 'M', 2, 'Barrera', Decimal('270'), 'free', 'New York-Newark-Jersey City, NY-NJ-PA', 'PUT', 'NextSong', Decimal('1540835983796'), 248, 'Behind Blue Eyes', 200, 1541470632796, '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.

# Some example analytics queries

### Get the top 10 most-played songs.

In [7]:
query = """
SELECT songs.title, COUNT(songs.title) as play_count
FROM songplays
JOIN songs
ON songplays.song_id = songs.song_id
GROUP BY songs.title
ORDER BY play_count DESC LIMIT 10;
"""

cur.execute(query)
for c in cur.fetchall():
    print(c)

("You're The One", 37)
("I CAN'T GET STARTED", 9)
('Catch You Baby (Steve Pitron & Max Sanna Radio Edit)', 9)
("Nothin' On You [feat. Bruno Mars] (Album Version)", 8)
("Hey Daddy (Daddy's Home)", 6)
('Make Her Say', 5)
('Up Up & Away', 5)
('Unwell (Album Version)', 4)
('Mr. Jones', 4)
('Supermassive Black Hole (Album Version)', 4)


### Get the number of users in different groups (e.g. paid/free).

In [8]:
query = """
SELECT level, COUNT(level) level_count
FROM songplays
GROUP BY level
ORDER BY level_count DESC;
"""

cur.execute(query)
for c in cur.fetchall():
    print(c)

('paid', 271)
('free', 62)


In [9]:
conn.close()