In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account

# We will import a customized function called client which actually returns an authorized bigquery client object with right credentials
# this will cost us an extra pair of () each time we call the client object which is now called by the function client we define in bq_sa_auth.py 

from bq_sa_auth import client

### [ORDER BY, EXTRACT](https://www.kaggle.com/code/dansbecker/order-by/tutorial)

#### Order your results to focus on the most important data for your use case.

-------------------------------------------------------------------------------


### Keywords: ORDER BY, EXTRACT

- to change the order of your results using the ORDER BY clause

- to look at part of a date, like the year or the day. You can do this with EXTRACT.

#### Examples using US Traffic Fatality Records database, which contains information on traffic accidents in the US where at least one person died.

In [3]:
# Construct a reference to the "nhtsa_traffic_fatalities" dataset
dataset_ref = client().dataset("nhtsa_traffic_fatalities", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client().get_dataset(dataset_ref)

# Construct a reference to the "accident_2015" table
table_ref = dataset_ref.table("accident_2015")

# API request - fetch the table
table = client().get_table(table_ref)

# Preview the first five lines of the "accident_2015" table
client().list_rows(table, max_results=5).to_dataframe()

Unnamed: 0,state_number,state_name,consecutive_number,number_of_vehicle_forms_submitted_all,number_of_motor_vehicles_in_transport_mvit,number_of_parked_working_vehicles,number_of_forms_submitted_for_persons_not_in_motor_vehicles,number_of_persons_not_in_motor_vehicles_in_transport_mvit,number_of_persons_in_motor_vehicles_in_transport_mvit,number_of_forms_submitted_for_persons_in_motor_vehicles,...,minute_of_ems_arrival_at_hospital,related_factors_crash_level_1,related_factors_crash_level_1_name,related_factors_crash_level_2,related_factors_crash_level_2_name,related_factors_crash_level_3,related_factors_crash_level_3_name,number_of_fatalities,number_of_drunk_drivers,timestamp_of_crash
0,30,Montana,300019,5,5,0,0,0,7,7,...,45,0,,0,,0,,1,0,2015-03-28 14:58:00+00:00
1,39,Ohio,390099,7,7,0,0,0,15,15,...,24,27,Backup Due to Prior Crash,0,,0,,1,0,2015-02-14 11:19:00+00:00
2,49,Utah,490123,16,16,0,0,0,28,28,...,99,0,,0,,0,,1,0,2015-04-14 12:24:00+00:00
3,48,Texas,481184,6,5,1,0,5,5,10,...,99,0,,0,,0,,1,0,2015-05-27 16:40:00+00:00
4,41,Oregon,410333,11,11,0,0,0,14,14,...,99,0,,0,,0,,1,0,2015-11-17 18:17:00+00:00


Let's use the table to determine how the number of accidents varies with the day of the week. Since:

- the `consecutive_number` column contains a unique ID for each accident, and
- the `timestamp_of_crash` column contains the date of the accident in DATETIME format,

we can:

- EXTRACT the day of the week (as `day_of_week` in the query below) from the `timestamp_of_crash` column, and
- GROUP BY the day of the week, before we COUNT the `consecutive_number` column to determine the number of accidents for each day of the week.
Then we sort the table with an ORDER BY clause, so the days with the most accidents are returned first.

In [4]:
query_crash = """
            SELECT COUNT(consecutive_number) as num_accidents, EXTRACT(DAYOFWEEK FROM timestamp_of_crash) as day_of_week
            FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
            GROUP BY day_of_week
            ORDER BY num_accidents DESC
        """

ONE_GB = 1000*1000*1000

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed = ONE_GB)

query_job = client().query(query_crash, job_config=safe_config)

crash_num = query_job.to_dataframe()

crash_num.head()

Unnamed: 0,num_accidents,day_of_week
0,5659,7
1,5298,1
2,4916,6
3,4460,5
4,4182,4


#### To map the numbers returned for the `day_of_week` column to the actual day, you might consult the BigQuery documentation on the DAYOFWEEK function. It says that it returns "an integer between 1 (Sunday) and 7 (Saturday), inclusively". So, in 2015, most fatal motor accidents in the US occured on Sunday and Saturday, while the fewest happened on Tuesday.

### **Exercises:** 

#### The World Bank has made tons of interesting education data available through BigQuery. Run the following cell to see the first few rows of the `international_education` table from the `world_bank_intl_education` dataset.

In [5]:
# Construct a reference to the "world_bank_intl_education" dataset
dataset_ref = client().dataset("world_bank_intl_education", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client().get_dataset(dataset_ref)

# Construct a reference to the "international_education" table
table_ref = dataset_ref.table("international_education")

# API request - fetch the table
table = client().get_table(table_ref)

# Preview the first five lines of the "international_education" table
client().list_rows(table, max_results=5).to_dataframe()

Unnamed: 0,country_name,country_code,indicator_name,indicator_code,value,year
0,Chad,TCD,"Enrolment in lower secondary education, both s...",UIS.E.2,321921.0,2012
1,Chad,TCD,"Enrolment in upper secondary education, both s...",UIS.E.3,68809.0,2006
2,Chad,TCD,"Enrolment in upper secondary education, both s...",UIS.E.3,30551.0,1999
3,Chad,TCD,"Enrolment in upper secondary education, both s...",UIS.E.3,79784.0,2007
4,Chad,TCD,"Repeaters in primary education, all grades, bo...",UIS.R.1,282699.0,2006


The value in the `indicator_code` column describes what type of data is shown in a given row.

One interesting indicator code is `SE.XPD.TOTL.GD.ZS`, which corresponds to "Government expenditure on education as % of GDP (%)".

### 1) Government expenditure on education

Which countries spend the largest fraction of GDP on education?  

To answer this question, consider only the rows in the dataset corresponding to indicator code `SE.XPD.TOTL.GD.ZS`, and write a query that returns the average value in the `value` column for each country in the dataset between the years 2010-2017 (including 2010 and 2017 in the average). 

Requirements:
- Your results should have the country name rather than the country code. You will have one row for each country.
- The aggregate function for average is **AVG()**.  Use the name `avg_ed_spending_pct` for the column created by this aggregation.
- Order the results so the countries that spend the largest fraction of GDP on education show up first.

In [6]:
query_wb = """
            SELECT country_name, AVG(value) as avg_ed_spending_pct
            FROM `bigquery-public-data.world_bank_intl_education.international_education`
            WHERE indicator_code = 'SE.XPD.TOTL.GD.ZS' and year >= 2010 and year <= 2017
            GROUP BY country_name
            ORDER BY avg_ed_spending_pct DESC
        """

ONE_GB = 1000*1000*1000

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed = ONE_GB)

query_job = client().query(query_wb, job_config=safe_config)

wb_ed = query_job.to_dataframe()

wb_ed.head(20)

Unnamed: 0,country_name,avg_ed_spending_pct
0,Cuba,12.83727
1,"Micronesia, Fed. Sts.",12.46775
2,Solomon Islands,10.00108
3,Moldova,8.372153
4,Namibia,8.34961
5,Denmark,8.2743
6,Timor-Leste,7.975114
7,Iceland,7.48021
8,Sweden,7.233168
9,Malta,7.134535


### 2) Identify interesting codes to explore

The last question started by telling you to focus on rows with the code `SE.XPD.TOTL.GD.ZS`. But how would you find more interesting indicator codes to explore?

There are 1000s of codes in the dataset, so it would be time consuming to review them all. But many codes are available for only a few countries. When browsing the options for different codes, you might restrict yourself to codes that are reported by many countries.

Write a query below that selects the indicator code and indicator name for all codes with at least 175 rows in the year 2016.

Requirements:
- You should have one row for each indicator code.
- The columns in your results should be called `indicator_code`, `indicator_name`, and `num_rows`.
- Only select codes with 175 or more rows in the raw database (exactly 175 rows would be included).
- To get both the `indicator_code` and `indicator_name` in your resulting DataFrame, you need to include both in your **SELECT** statement (in addition to a **COUNT()** aggregation). This requires you to include both in your **GROUP BY** clause.
- Order from results most frequent to least frequent.

In [7]:
query_wb2 = """
            SELECT indicator_code, indicator_name, COUNT(1) as num_rows
            FROM `bigquery-public-data.world_bank_intl_education.international_education`
            WHERE year = 2016 
            GROUP BY indicator_code, indicator_name
            HAVING num_rows >= 175
            ORDER BY num_rows DESC
        """

ONE_GB = 1000*1000*1000

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed = ONE_GB)

query_job = client().query(query_wb2, job_config=safe_config)

int_ed = query_job.to_dataframe()

int_ed.head(20)

Unnamed: 0,indicator_code,indicator_name,num_rows
0,SP.POP.GROW,Population growth (annual %),232
1,SP.POP.TOTL,"Population, total",232
2,IT.NET.USER.P2,Internet users (per 100 people),223
3,SH.DYN.MORT,"Mortality rate, under-5 (per 1,000)",213
4,SP.POP.0014.TO,"Population, ages 0-14, total",213
5,SP.POP.0014.TO.ZS,"Population, ages 0-14 (% of total)",213
6,SP.POP.1564.TO.ZS,"Population, ages 15-64 (% of total)",213
7,SP.POP.TOTL.MA.ZS,"Population, male (% of total)",213
8,SP.POP.1564.TO,"Population, ages 15-64, total",213
9,SP.POP.TOTL.MA.IN,"Population, male",213
