In [37]:
import psycopg2

In [38]:
# Open a single autocommit session against the local sparkifydb database.
# The `conn` / `cur` pair created here is what the query cells below use.
SPARKIFY_DSN = "host=127.0.0.1 dbname=sparkifydb user=student password=student"

conn = psycopg2.connect(SPARKIFY_DSN)
conn.set_session(autocommit=True)  # no explicit commit() calls are made in this notebook
cur = conn.cursor()

Count the number of songplays in the database from paid users

In [39]:
# Songplays recorded at the 'paid' level. A bare COUNT query returns exactly
# one single-column row, so it is unpacked straight into a scalar.
paid_plays_sql = "SELECT COUNT (*) FROM songplays WHERE songplays.level = 'paid';"
cur.execute(paid_plays_sql)
(paid_plays,) = cur.fetchone()
print(paid_plays)

5591


Count the total number of songplays in the database

In [40]:
# Total number of rows in songplays, regardless of level.
total_plays_sql = "SELECT COUNT(*) FROM songplays;"
cur.execute(total_plays_sql)
(total_plays,) = cur.fetchone()
print(total_plays)

6820


Count the number of songplays that occur on a Monday

In [41]:
# Songplays whose start_time maps to a 'Monday' row in the time table.
monday_plays_sql = "SELECT COUNT(*) FROM songplays JOIN time ON songplays.start_time = time.start_time WHERE time.weekday = 'Monday';"
cur.execute(monday_plays_sql)
(results,) = cur.fetchone()  # COUNT always yields one single-column row
print(results)

1014


Find the total number of songplays in the database by day of week

In [42]:
# Songplay totals keyed by weekday name, in Monday..Sunday order.
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
plays_on_day_sql = "SELECT COUNT(*) FROM songplays JOIN time ON songplays.start_time = time.start_time WHERE time.weekday = %s;"

count_by_day = {}
for day in days:
    # The weekday is bound as a query parameter rather than formatted into the SQL.
    cur.execute(plays_on_day_sql, (day,))
    (plays,) = cur.fetchone()
    count_by_day[day] = plays

print(count_by_day)

{'Monday': 1014, 'Tuesday': 1071, 'Wednesday': 1364, 'Thursday': 1052, 'Friday': 1295, 'Saturday': 628, 'Sunday': 396}


Number of songplays by day of week for 'free' level users

In [43]:
# Songplay totals per weekday, restricted to rows logged at the 'free' level.
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
free_plays_on_day_sql = "SELECT COUNT(*) FROM songplays JOIN time ON songplays.start_time = time.start_time WHERE time.weekday = %s AND songplays.level = 'free';"

count_by_day_free = {}
for day in days:
    cur.execute(free_plays_on_day_sql, (day,))
    (plays,) = cur.fetchone()  # single-row, single-column COUNT result
    count_by_day_free[day] = plays

print(count_by_day_free)

{'Monday': 201, 'Tuesday': 199, 'Wednesday': 195, 'Thursday': 148, 'Friday': 257, 'Saturday': 108, 'Sunday': 121}


Number of songplays by day of week for 'paid' level users

In [44]:
# Songplay totals per weekday, restricted to rows logged at the 'paid' level.
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
paid_plays_on_day_sql = "SELECT COUNT(*) FROM songplays JOIN time ON songplays.start_time = time.start_time WHERE time.weekday = %s AND songplays.level = 'paid';"

count_by_day_paid = {}
for day in days:
    cur.execute(paid_plays_on_day_sql, (day,))
    (plays,) = cur.fetchone()  # single-row, single-column COUNT result
    count_by_day_paid[day] = plays

print(count_by_day_paid)

{'Monday': 813, 'Tuesday': 872, 'Wednesday': 1169, 'Thursday': 904, 'Friday': 1038, 'Saturday': 520, 'Sunday': 275}


Number of songs in songs table by year

In [45]:
# Number of rows in the songs table for each release year, ascending by year.
songs_per_year_sql = "SELECT songs.year as year, COUNT(*) as count_all FROM songs GROUP BY songs.year ORDER BY songs.year ASC;"
cur.execute(songs_per_year_sql)

# Each fetched row is a (year, count_all) pair; fold them into a year -> count mapping.
count_songs_by_year = {year: count_all for year, count_all in cur.fetchall()}
print(count_songs_by_year)

{1961: 1, 1964: 1, 1969: 1, 1972: 1, 1985: 1, 1987: 1, 1992: 1, 1994: 1, 1997: 1, 1999: 1, 2000: 1, 2003: 1, 2004: 1, 2005: 1}


Find the top 3 locations by total songplays; return the total songplays and the location for each

In [46]:
# Three locations with the most songplays, as (count_all, location) rows,
# highest count first. The LIMIT is applied by the database.
top_locations_sql = """SELECT COUNT(*) as count_all, songplays.location as location FROM songplays 
            GROUP BY songplays.location ORDER BY count_all DESC LIMIT 3;"""
cur.execute(top_locations_sql)
top3_locations_by_songplays = cur.fetchall()
print(top3_locations_by_songplays)

[(691, 'San Francisco-Oakland-Hayward, CA'), (665, 'Portland-South Portland, ME'), (557, 'Lansing-East Lansing, MI')]


Find the top 3 locations by total songplays for each day of week

In [47]:
# Top 3 locations by songplay count for each weekday.
# Result: {weekday name: [(count_all, location), ...]} with at most three
# rows per day, highest count first.
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

# LIMIT 3 lets the database return only the rows that are kept, instead of
# fetching every location for the day and slicing the list in Python.
top3_on_day_sql = """
    SELECT COUNT(*) AS count_all, songplays.location AS location
    FROM songplays
    JOIN time ON songplays.start_time = time.start_time
    WHERE time.weekday = %s
    GROUP BY songplays.location
    ORDER BY count_all DESC
    LIMIT 3;
"""

top3_locations_by_day = {}
for day in days:
    cur.execute(top3_on_day_sql, (day,))
    top3_locations_by_day[day] = cur.fetchall()

print(top3_locations_by_day)

{'Monday': [(152, 'Lake Havasu City-Kingman, AZ'), (114, 'Waterloo-Cedar Falls, IA'), (99, 'Tampa-St. Petersburg-Clearwater, FL')], 'Tuesday': [(150, 'Lansing-East Lansing, MI'), (106, 'Atlanta-Sandy Springs-Roswell, GA'), (97, 'San Francisco-Oakland-Hayward, CA')], 'Wednesday': [(224, 'Chicago-Naperville-Elgin, IL-IN-WI'), (193, 'Lansing-East Lansing, MI'), (181, 'Portland-South Portland, ME')], 'Thursday': [(142, 'San Francisco-Oakland-Hayward, CA'), (140, 'Portland-South Portland, ME'), (127, 'San Jose-Sunnyvale-Santa Clara, CA')], 'Friday': [(183, 'San Francisco-Oakland-Hayward, CA'), (158, 'Portland-South Portland, ME'), (137, 'Janesville-Beloit, WI')], 'Saturday': [(97, 'Winston-Salem, NC'), (96, 'Tampa-St. Petersburg-Clearwater, FL'), (80, 'San Francisco-Oakland-Hayward, CA')], 'Sunday': [(119, 'Atlanta-Sandy Springs-Roswell, GA'), (54, 'Waterloo-Cedar Falls, IA'), (37, 'San Francisco-Oakland-Hayward, CA')]}


Find the top 3 locations by number of unique users; return the user count and the location for each

In [48]:
# Top 3 locations ranked by how many distinct users have songplays there.
# Result rows are (count_all, location), highest count first.
#
# COUNT(DISTINCT user_id) per location replaces the earlier
# `SELECT DISTINCT ON (user_id) ...` CTE. Without an ORDER BY, DISTINCT ON
# keeps an arbitrary row per user, so the location a user was attributed to
# was not guaranteed to be stable if that user ever appears with more than
# one location. Here every user is counted once in each location they played from.
# NOTE(review): rows with a NULL user_id are not counted by COUNT(DISTINCT ...);
# confirm songplays.user_id is NOT NULL in the schema.
cur.execute("""
    SELECT COUNT(DISTINCT songplays.user_id) AS count_all, songplays.location
    FROM songplays
    GROUP BY songplays.location
    ORDER BY count_all DESC
    LIMIT 3;
""")
top3_locations_by_unique_users = cur.fetchall()
print(top3_locations_by_unique_users)

[(10, 'New York-Newark-Jersey City, NY-NJ-PA'), (4, 'Houston-The Woodlands-Sugar Land, TX'), (3, 'Chicago-Naperville-Elgin, IL-IN-WI')]


Number of unique users by day of week for 'free' level

In [49]:
# Distinct 'free'-level users with at least one songplay on each weekday.
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

# The CTE keeps one row per user_id for the given weekday; the outer query counts those rows.
free_users_on_day_sql = """WITH unique_users AS (SELECT DISTINCT on (user_id) user_id, location, level, time.weekday FROM songplays JOIN time
                ON songplays.start_time = time.start_time WHERE level = 'free' AND weekday = %s) 
                SELECT COUNT (*) as count_all FROM unique_users;"""

unique_users_by_day_free = {}
for day in days:
    cur.execute(free_users_on_day_sql, (day,))
    (user_total,) = cur.fetchone()
    unique_users_by_day_free[day] = user_total
print(unique_users_by_day_free)

{'Monday': 47, 'Tuesday': 45, 'Wednesday': 50, 'Thursday': 45, 'Friday': 52, 'Saturday': 36, 'Sunday': 35}


Number of unique users by day of week for 'paid' level

In [50]:
# Distinct 'paid'-level users with at least one songplay on each weekday.
# The result is stored in `unique_users_by_day_paid`; this cell previously
# assigned to `unique_users_by_day_free`, which overwrote the free-tier
# dictionary with paid-tier numbers.
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

# The CTE keeps one row per user_id for the given weekday; the outer query counts those rows.
paid_users_on_day_sql = """WITH unique_users AS (SELECT DISTINCT on (user_id) user_id, location, level, time.weekday FROM songplays JOIN time
                ON songplays.start_time = time.start_time WHERE level = 'paid' AND weekday = %s) 
                SELECT COUNT (*) as count_all FROM unique_users;"""

unique_users_by_day_paid = {}
for day in days:
    cur.execute(paid_users_on_day_sql, (day,))
    (user_total,) = cur.fetchone()  # COUNT yields one single-column row
    unique_users_by_day_paid[day] = user_total
print(unique_users_by_day_paid)

{'Monday': 16, 'Tuesday': 16, 'Wednesday': 16, 'Thursday': 14, 'Friday': 16, 'Saturday': 11, 'Sunday': 6}


Top 3 paid users by total number of songs played

In [51]:
# Top 3 users by number of 'paid'-level songplays.
# Result rows are (count_all, first_name, last_name, gender), highest count first.
#
# users.user_id is part of the GROUP BY so that two different users who share
# the same first name, last name and gender are ranked separately instead of
# having their play counts merged into one row.
cur.execute("""
    SELECT COUNT(*) AS count_all, users.first_name, users.last_name, users.gender
    FROM songplays
    JOIN users ON songplays.user_id = users.user_id
    WHERE songplays.level = 'paid'
    GROUP BY users.user_id, users.first_name, users.last_name, users.gender
    ORDER BY count_all DESC
    LIMIT 3;
""")
top3_paid_users_by_songplays = cur.fetchall()
print(top3_paid_users_by_songplays)

[(650, 'Chloe', 'Cuevas', 'F'), (648, 'Tegan', 'Levine', 'F'), (557, 'Kate', 'Harrell', 'F')]


## REMEMBER: Restart this notebook to close connection to `sparkifydb`
Each time you run the cells above, remember to restart this notebook to close the connection to your database. Otherwise, you won't be able to run your code in `create_tables.py`, `etl.py`, or `etl.ipynb` files since you can't make multiple connections to the same database (in this case, sparkifydb).

In [52]:
# Release the cursor first, then the connection it belongs to, so nothing
# from this notebook is left holding a session on sparkifydb.
cur.close()
conn.close()