# Project 3 - Data Warehouse Demo Queries
Example database queries to demonstrate that Sparkify analytics tables loaded on Sparkify Redshift cluster can be queried by the analytics team to generate insight.

In [1]:
import pandas as pd
import configparser
import psycopg2

In [2]:
# Load configuration data to connect to Redshift cluster
config = configparser.ConfigParser()
config.read('dwh.cfg')

['dwh.cfg']

In [3]:
## Establish connection to Redshift cluster
try:
    conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
    cur = conn.cursor()
except Exception as e:
    print(e)

Count the number of songplays in the database from paid users

In [4]:
try:  
    cur.execute("SELECT COUNT (*) FROM fact_songplays WHERE fact_songplays.level = 'paid';")
    print(cur.fetchone()[0])
except Exception as e:
    print(e)

246


Count the total number of songplays in the database

In [5]:
try:  
    cur.execute("SELECT COUNT(*) FROM fact_songplays;")
    print(cur.fetchone()[0])
except Exception as e:
    print(e)

315


Count the number of songplays that occur on a Monday

In [6]:
try:    
    cur.execute("SELECT COUNT(*) FROM fact_songplays JOIN dim_time ON fact_songplays.start_time = dim_time.start_time WHERE dim_time.weekday = 'Monday';")
    results = cur.fetchall()[0][0]
    print(results)
except Exception as e:
    print(e)

102


Find the total number of songplays in the database by day of week

In [7]:
try:   
    days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    count = []

    for i, day in enumerate(days):
        cur.execute("SELECT COUNT(*) FROM fact_songplays JOIN dim_time ON fact_songplays.start_time = dim_time.start_time WHERE dim_time.weekday = %s;",(day,))
        count.append(cur.fetchall()[0][0]) 

    count_by_day = dict(zip(days,count))
    print(count_by_day)
except Exception as e:
    print(e)

{'Monday': 102, 'Tuesday': 96, 'Wednesday': 120, 'Thursday': 138, 'Friday': 90, 'Saturday': 48, 'Sunday': 36}


Number of songplays by day of week for 'free' level users

In [8]:
try:    
    days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    count = []

    for i, day in enumerate(days):
        cur.execute("SELECT COUNT(*) FROM fact_songplays JOIN dim_time ON fact_songplays.start_time = dim_time.start_time WHERE dim_time.weekday = %s AND fact_songplays.level = 'free';",(day,))
        count.append(cur.fetchall()[0][0]) 

    count_by_day_free = dict(zip(days,count))
    print(count_by_day_free)
except Exception as e:
    print(e)

{'Monday': 24, 'Tuesday': 24, 'Wednesday': 18, 'Thursday': 12, 'Friday': 42, 'Saturday': 6, 'Sunday': 12}


Number of songplays by day of week for 'paid' level users

In [9]:
try:    
    days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    count = []

    for i, day in enumerate(days):
        cur.execute("SELECT COUNT(*) FROM fact_songplays JOIN dim_time ON fact_songplays.start_time = dim_time.start_time WHERE dim_time.weekday = %s AND fact_songplays.level = 'paid';",(day,))
        count.append(cur.fetchall()[0][0]) 

    count_by_day_paid = dict(zip(days,count))
    print(count_by_day_paid)
except Exception as e:
    print(e)

{'Monday': 78, 'Tuesday': 72, 'Wednesday': 102, 'Thursday': 126, 'Friday': 48, 'Saturday': 42, 'Sunday': 24}


Number of songs in songs table by year

In [10]:
try:
    cur.execute("SELECT dim_songs.year as year, COUNT(*) as count_all FROM dim_songs GROUP BY dim_songs.year ORDER BY dim_songs.year ASC;")
    results = cur.fetchall()
    count_songs_by_year = dict(results)
    print(count_songs_by_year)
except Exception as e:
    print(e)

{1927: 3, 1944: 3, 1954: 9, 1956: 6, 1957: 9, 1958: 6, 1959: 12, 1960: 6, 1961: 12, 1962: 12, 1963: 21, 1964: 21, 1965: 6, 1966: 30, 1967: 27, 1968: 48, 1969: 39, 1970: 69, 1971: 36, 1972: 45, 1973: 48, 1974: 45, 1975: 54, 1976: 39, 1977: 63, 1978: 60, 1979: 51, 1980: 69, 1981: 75, 1982: 102, 1983: 63, 1984: 69, 1985: 72, 1986: 123, 1987: 105, 1988: 126, 1989: 150, 1990: 168, 1991: 165, 1992: 282, 1993: 192, 1994: 285, 1995: 309, 1996: 294, 1997: 315, 1998: 246, 1999: 387, 2000: 411, 2001: 393, 2002: 441, 2003: 555, 2004: 636, 2005: 723, 2006: 816, 2007: 858, 2008: 792, 2009: 612, 2010: 216}


Find the top 3 locations by total songplays, return total songplays and location for each

In [11]:
try:    
    cur.execute("""SELECT COUNT(*) as count_all, fact_songplays.location as location FROM fact_songplays 
                GROUP BY fact_songplays.location ORDER BY count_all DESC LIMIT 3;""")
    top3_locations_by_songplays = cur.fetchall()
    print(top3_locations_by_songplays)
except Exception as e:
    print(e)

[(33, 'Lansing-East Lansing, MI'), (33, 'San Francisco-Oakland-Hayward, CA'), (21, 'Chicago-Naperville-Elgin, IL-IN-WI')]


Find the top 3 locations by total songplays for each day of week

In [12]:
try:
    days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    top3 = []

    for i, day in enumerate(days):
        cur.execute("SELECT COUNT(*) as count_all, fact_songplays.location as location FROM fact_songplays JOIN dim_time ON fact_songplays.start_time = dim_time.start_time WHERE dim_time.weekday = %s GROUP BY fact_songplays.location ORDER BY count_all DESC;",(day,))
        top3.append(cur.fetchall()[0:3]) 

    top3_locations_by_day = dict(zip(days,top3))
    print(top3_locations_by_day)
except Exception as e:
    print(e)

{'Monday': [(18, 'Lansing-East Lansing, MI'), (18, 'Lake Havasu City-Kingman, AZ'), (12, 'Waterloo-Cedar Falls, IA')], 'Tuesday': [(24, 'Marinette, WI-MI'), (12, 'Atlanta-Sandy Springs-Roswell, GA'), (12, 'Lansing-East Lansing, MI')], 'Wednesday': [(24, 'Chicago-Naperville-Elgin, IL-IN-WI'), (18, 'Tampa-St. Petersburg-Clearwater, FL'), (18, 'Portland-South Portland, ME')], 'Thursday': [(24, 'Lansing-East Lansing, MI'), (24, 'San Francisco-Oakland-Hayward, CA'), (18, 'Portland-South Portland, ME')], 'Friday': [(18, 'San Francisco-Oakland-Hayward, CA'), (12, 'New Haven-Milford, CT'), (6, 'Eugene, OR')], 'Saturday': [(12, 'Winston-Salem, NC'), (12, 'Waterloo-Cedar Falls, IA'), (12, 'Tampa-St. Petersburg-Clearwater, FL')], 'Sunday': [(6, 'Marinette, WI-MI'), (6, 'Indianapolis-Carmel-Anderson, IN'), (6, 'Waterloo-Cedar Falls, IA')]}


Top 3 paid users by total number of songs played

In [13]:
try:
    cur.execute("""SELECT COUNT (*) as count_all,  dim_users.first_name, dim_users.last_name, dim_users.gender FROM fact_songplays JOIN dim_users
                ON fact_songplays.user_id = dim_users.user_id WHERE fact_songplays.level = 'paid' GROUP BY first_name, last_name, gender
                ORDER BY count_all DESC LIMIT 3;""")
    top3_paid_users_by_songplays = cur.fetchall()
    print(top3_paid_users_by_songplays)
except Exception as e:
    print(e)

[(99, 'Kate', 'Harrell', 'F'), (90, 'Chloe', 'Cuevas', 'F'), (63, 'Jacob', 'Klein', 'M')]


### Demonstrate use of Pandas read_sql to ingest data into DataFrames

In [14]:
pd.read_sql("SELECT COUNT(*) as count_all, fact_songplays.location as location FROM fact_songplays GROUP BY fact_songplays.location ORDER BY count_all DESC LIMIT 3;",conn)

Unnamed: 0,count_all,location
0,33,"Lansing-East Lansing, MI"
1,33,"San Francisco-Oakland-Hayward, CA"
2,21,"Chicago-Naperville-Elgin, IL-IN-WI"


In [15]:
pd.read_sql("SELECT dim_songs.year as year, COUNT(*) as count_all FROM dim_songs GROUP BY dim_songs.year ORDER BY dim_songs.year ASC;",conn)

Unnamed: 0,year,count_all
0,1927,3
1,1944,3
2,1954,9
3,1956,6
4,1957,9
5,1958,6
6,1959,12
7,1960,6
8,1961,12
9,1962,12


## REMEMBER: Close connection when finished

In [16]:
conn.close()