# Part 2.3: Summarizing Data in SQL

Importing pandas and sqlite3 to run SQL queries from Python:

In [8]:
import pandas as pd
import sqlite3

Connecting to the database and creating a cursor object:

In [12]:
# Open the SQLite database file; every query cell below reads through
# this shared connection object.
connection = sqlite3.connect(database='chinook.db')

# Cursor for executing statements directly on the connection.
cursor = connection.cursor()

## Summary Statistics with SQL

Combining Aggregate and Scalar Functions:

In [4]:
# Average track runtime in minutes, shown both raw and rounded to two
# decimals. The scalar arithmetic (milliseconds -> minutes) is applied to
# each row first, AVG then aggregates, and ROUND is applied to the aggregate.
# Dividing by the float literal 1000.0 keeps the arithmetic in floating point
# (SQLite truncates the result when both operands are integers).
query = """

SELECT
    AVG(milliseconds / 1000.0 / 60) AS avg_runtime_minutes,
    ROUND(AVG(milliseconds / 1000.0 / 60), 2) AS avg_runtime_minutes_rounded
FROM
    track;

"""

pd.read_sql_query(query, connection)

Unnamed: 0,avg_runtime_minutes,avg_runtime_minutes_rounded
0,6.559987,6.56


Summary Statistics Under Conditions:

In [5]:
# Row count, smallest, largest and rounded mean invoice total, restricted to
# invoices with a total above 10 that were billed to the USA. WHERE filters
# the rows before any aggregate is computed.
query = """

SELECT
    COUNT(*) AS num_row,
    MIN(total) AS min_total,
    MAX(total) AS max_total,
    ROUND(AVG(total), 2) AS avg_total_rounded
FROM
    invoice
WHERE
    total > 10
    AND billing_country = 'USA';

"""

pd.read_sql_query(query, connection)

Unnamed: 0,num_row,min_total,max_total,avg_total_rounded
0,34,10.89,18.81,12.67


## Group Summary Statistics with SQL

Counting Rows by Group:

In [14]:
# Number of invoices per billing country, previewing the first five
# countries alphabetically.
# The explicit ORDER BY matters: without it SQL does not define which five
# groups LIMIT keeps, so the preview could change between runs or engines.
query = """

SELECT
    billing_country,
    COUNT(*) AS num_row
FROM
    invoice
GROUP BY
    billing_country
ORDER BY
    billing_country
LIMIT
    5;

"""

pd.read_sql_query(query, connection)

Unnamed: 0,billing_country,num_row
0,Argentina,5
1,Australia,10
2,Austria,9
3,Belgium,7
4,Brazil,61


Summary Statistics by Group Under Conditions:

In [16]:
# Invoice count and average invoice total per billing state, for invoices
# billed to the USA only. WHERE filters rows before they are grouped.
# ORDER BY makes the row order explicit; GROUP BY alone does not guarantee
# that groups come back sorted.
query = """

SELECT
    billing_state,
    COUNT(*) AS num_row,
    AVG(total) AS avg_sale
FROM
    invoice
WHERE
    billing_country = 'USA'
GROUP BY
    billing_state
ORDER BY
    billing_state;

"""

pd.read_sql_query(query, connection)

Unnamed: 0,billing_state,num_row,avg_sale
0,AZ,9,9.35
1,CA,29,7.715172
2,FL,12,7.6725
3,IL,8,8.91
4,MA,10,6.633
5,NV,11,8.28
6,NY,8,9.9
7,TX,12,7.1775
8,UT,10,7.227
9,WA,12,8.1675


Summary Statistics by Ordered Groups:

In [17]:
# Top five tracks by total sales, where a track's sales are the sum of
# unit_price * quantity over its invoice lines.
# Several tracks tie on both overall_sale and num_row, so track_id is added
# as a final sort key; otherwise the rows that survive LIMIT 5 (and their
# relative order) would be arbitrary.
query = """

SELECT
    track_id,
    COUNT(*) AS num_row,
    SUM(unit_price * quantity) AS overall_sale
FROM
    invoice_line
GROUP BY
    track_id
ORDER BY
    overall_sale DESC,
    num_row DESC,
    track_id
LIMIT
    5;

"""

pd.read_sql_query(query, connection)

Unnamed: 0,track_id,num_row,overall_sale
0,3336,31,30.69
1,1489,14,13.86
2,1495,14,13.86
3,6,13,12.87
4,1487,13,12.87


Summary Statistics by Ordered Groups Under Conditions:

In [20]:
# Top three billing cities by total sales among invoices billed to Canada
# or France, with count, sum, min, mean and max of the invoice totals.
# IN (...) replaces the chain of OR comparisons on the same column.
# billing_city is the last sort key so that ties on overall_sale and num_row
# cannot make the top-3 cut arbitrary.
query = """

SELECT
    billing_city,
    COUNT(*) AS num_row,
    SUM(total) AS overall_sale,
    MIN(total) AS min_sale,
    AVG(total) AS avg_sale,
    MAX(total) AS max_sale
FROM
    invoice
WHERE
    billing_country IN ('Canada', 'France')
GROUP BY
    billing_city
ORDER BY
    overall_sale DESC,
    num_row DESC,
    billing_city
LIMIT
    3;

"""

pd.read_sql_query(query, connection)

Unnamed: 0,billing_city,num_row,overall_sale,min_sale,avg_sale,max_sale
0,Paris,18,151.47,1.98,8.415,17.82
1,Bordeaux,11,99.99,0.99,9.09,23.76
2,Montréal,9,99.99,2.97,11.11,19.8


## Multiple Group Summary Statistics

Grouping over Several Columns:

In [22]:
# Invoice count and average invoice total for every (country, state) pair,
# previewing the first five pairs.
# Ordering by the grouping columns makes the LIMIT 5 preview deterministic;
# without ORDER BY the five groups returned are unspecified.
query = """

SELECT
    billing_country,
    billing_state,
    COUNT(*) AS num_row,
    AVG(total) AS avg_sale
FROM
    invoice
GROUP BY
    billing_country,
    billing_state
ORDER BY
    billing_country,
    billing_state
LIMIT
    5;

"""

pd.read_sql_query(query, connection)

Unnamed: 0,billing_country,billing_state,num_row,avg_sale
0,Argentina,,5,7.92
1,Australia,NSW,10,8.118
2,Austria,,9,7.7
3,Belgium,,7,8.627143
4,Brazil,DF,15,7.128


Combining WHERE and HAVING Clauses:

In [24]:
# Smallest and largest invoice total per (country, state) pair, keeping only
# pairs whose average invoice total is below 10.
# WHERE drops individual rows before grouping; HAVING drops whole groups
# after the aggregates have been computed.
# NOTE(review): the filter compares against the literal text 'None', which
# presumably is how missing states are stored in this database - confirm.
# Rows whose state is a genuine NULL are excluded too, because a comparison
# with NULL is never true.
# ORDER BY makes the LIMIT 5 preview deterministic.
query = """

SELECT
    billing_country,
    billing_state,
    MIN(total) AS min_sale,
    MAX(total) AS max_sale
FROM
    invoice
WHERE
    billing_state != 'None'
GROUP BY
    billing_country,
    billing_state
HAVING
    AVG(total) < 10
ORDER BY
    billing_country,
    billing_state
LIMIT
    5;

"""

pd.read_sql_query(query, connection)

Unnamed: 0,billing_country,billing_state,min_sale,max_sale
0,Australia,NSW,1.98,17.82
1,Brazil,DF,0.99,14.85
2,Brazil,RJ,1.98,16.83
3,Brazil,SP,0.99,17.82
4,Canada,AB,0.99,8.91
