# 7. INTRODUCTION TO SQL

# 7.4. ORDER BY

# 7.4.1. COURS

In [1]:
# This lesson shows how to change the order of your results using the
# ORDER BY clause, and explores a popular use case: applying ordering to dates.

# 7.4.1.1. ORDER BY

## 7.4.1.1.1. Identifiant de la table

In [None]:
# Return every pet's ID and Animal, sorted by the ID column.
# The table name is quoted with backticks: in BigQuery, single quotes would
# turn it into a string literal, which is not a valid FROM target. The
# project ID is spelled `bigquery-public-data`, as in the other queries of
# this notebook.
query = """
        SELECT ID, Animal
        FROM `bigquery-public-data.pet_records.pets`
        ORDER BY ID
        """

## 7.4.1.1.2. Colonne contenant du texte

In [None]:
# Return every pet's ID and Animal, sorted by the Animal column (a text
# column, per this section's heading).
# The table name is quoted with backticks: in BigQuery, single quotes would
# turn it into a string literal, which is not a valid FROM target. The
# project ID is spelled `bigquery-public-data`, as in the other queries of
# this notebook.
query = """
        SELECT ID, Animal
        FROM `bigquery-public-data.pet_records.pets`
        ORDER BY Animal
        """

## 7.4.1.1.3. ASCENDANT, DESCENDANT

In [None]:
# Same query as the text-column example, but DESC reverses the sort order
# of the Animal column.
# The table name is quoted with backticks: in BigQuery, single quotes would
# turn it into a string literal, which is not a valid FROM target. The
# project ID is spelled `bigquery-public-data`, as in the other queries of
# this notebook.
query = """
        SELECT ID, Animal
        FROM `bigquery-public-data.pet_records.pets`
        ORDER BY Animal DESC
        """

## 7.4.1.2. DATES

In [None]:
# We'll talk about dates, because they come up very frequently in 
# real-world databases. 
# There are two ways that dates can be stored in BigQuery: as a DATE or 
# as a DATETIME.

# The DATE format has the year first, then the month, and then the day. 

# It looks like this:
#  YYYY-[M]M-[D]D
#  YYYY: Four-digit year
#  [M]M: One or two digit month
#  [D]D: One or two digit day

# So 2019-01-10 is interpreted as January 10, 2019.

# The DATETIME format is like the date format ... but with time added at 
# the end.

## 7.4.1.3. EXTRACT

In [None]:
# Pull the day-of-month out of the Date column and expose it as `Day`.
# The table name is quoted with backticks: in BigQuery, single quotes would
# turn it into a string literal, which is not a valid FROM target. The
# project ID is spelled `bigquery-public-data`, as in the other queries of
# this notebook.
query = """
        SELECT Name, EXTRACT(DAY from Date) AS Day
        FROM `bigquery-public-data.pet_records.pets_with_date`
        """
# If Date contains a row '2019-04-18', the new Day field will contain 18.

In [None]:
# Pull the week number out of the Date column.
# The result is aliased `Week` (it was previously labelled `Day`, which
# mislabelled the column since the extracted part is WEEK).
# The table name is quoted with backticks: in BigQuery, single quotes would
# turn it into a string literal, which is not a valid FROM target. The
# project ID is spelled `bigquery-public-data`, as in the other queries of
# this notebook.
query = """
        SELECT Name, EXTRACT(WEEK from Date) AS Week
        FROM `bigquery-public-data.pet_records.pets_with_date`
        """

## 7.4.1.4. EXEMPLES

In [None]:
# Query to find out the number of accidents for each day of the week.
# - COUNT(consecutive_number) counts the rows in each group and is exposed
#   as num_accidents.
# - EXTRACT(DAYOFWEEK FROM timestamp_of_crash) derives the weekday from the
#   crash timestamp and is exposed as day_of_week.
#   NOTE(review): per the BigQuery docs DAYOFWEEK should be 1 (Sunday)
#   through 7 (Saturday) — confirm before interpreting the numbers.
# - Rows are grouped by that weekday and sorted so the weekday with the
#   most accidents comes first.
query = """
        SELECT COUNT(consecutive_number) AS num_accidents, 
               EXTRACT(DAYOFWEEK FROM timestamp_of_crash) AS day_of_week
        FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
        GROUP BY day_of_week
        ORDER BY num_accidents DESC
        """

# 	state_number	state_name	consecutive_number	number_of_vehicle_forms_submitted_all	number_of_motor_vehicles_in_transport_mvit	number_of_parked_working_vehicles	number_of_forms_submitted_for_persons_not_in_motor_vehicles	number_of_persons_not_in_motor_vehicles_in_transport_mvit	number_of_persons_in_motor_vehicles_in_transport_mvit	number_of_forms_submitted_for_persons_in_motor_vehicles	...	minute_of_ems_arrival_at_hospital	related_factors_crash_level_1	related_factors_crash_level_1_name	related_factors_crash_level_2	related_factors_crash_level_2_name	related_factors_crash_level_3	related_factors_crash_level_3_name	number_of_fatalities	number_of_drunk_drivers	timestamp_of_crash
# 0	19	Iowa	190204	1	1	0	0	0	1	1	...	2	0	None	0	None	0	None	1	1	2015-09-11 20:20:00+00:00
# 1	19	Iowa	190233	1	1	0	0	0	1	1	...	88	0	None	0	None	0	None	1	1	2015-11-01 00:30:00+00:00
# 2	19	Iowa	190179	1	1	0	0	0	2	2	...	1	0	None	0	None	0	None	1	0	2015-05-04 16:18:00+00:00
# 3	19	Iowa	190248	1	1	0	0	0	4	4	...	99	0	None	0	None	0	None	2	0	2015-11-17 12:26:00+00:00
# 4	19	Iowa	190231	1	1	0	0	0	1	1	...	88	0	None	0	None	0	None	1	0	2015-10-31 

In [None]:
# Create the BigQuery client in this cell. `client` is otherwise first
# assigned in the exercises section further down, so on a fresh kernel
# (Restart & Run All) this cell would fail with a NameError.
# NOTE(review): `bigquery` is assumed to be imported already (presumably
# `from google.cloud import bigquery`); no import is visible in this
# notebook — confirm and add it to a top-of-notebook imports cell if missing.
client = bigquery.Client()

# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 1 GB, i.e. 10**9 bytes)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**9)
query_job = client.query(query, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
accidents_by_day = query_job.to_dataframe()

# Display the DataFrame (bare last expression, so the notebook renders it)
accidents_by_day

# 	num_accidents	day_of_week
# 0	5659	7
# 1	5298	1
# 2	4916	6
# 3	4460	5
# 4	4182	4
# 5	4038	2
# 6	3985	3

# 7.4.2. EXERCICES

In [None]:
# Exercise setup: connect to BigQuery and preview the table used by the
# exercises below.
# NOTE(review): `bigquery` is assumed to be imported already (presumably
# `from google.cloud import bigquery`); no import is visible here — confirm.

# Create a "Client" object
client = bigquery.Client()

# Construct a reference to the "world_bank_intl_education" dataset
# NOTE(review): `Client.dataset` appears to be deprecated in newer
# google-cloud-bigquery releases in favour of `bigquery.DatasetReference`
# — confirm against the installed version.
dataset_ref = client.dataset("world_bank_intl_education", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the "international_education" table
table_ref = dataset_ref.table("international_education")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five lines of the "international_education" table
# (max_results=5 caps the number of rows requested)
client.list_rows(table, max_results=5).to_dataframe()

In [None]:
# 	country_name	country_code	indicator_name	indicator_code	value	year
# 0	St. Kitts and Nevis	KNA	Population of the official age for upper secon...	SP.SEC.UTOT.FE.IN	949.0	2016
# 1	St. Lucia	LCA	Unemployment, female (% of female labor force)	SL.UEM.TOTL.FE.ZS	28.5	2016
# 2	Suriname	SUR	Population of the official age for pre-primary...	SP.PRE.TOTL.FE.IN	9249.0	2016
# 3	Tajikistan	TJK	Official entrance age to upper secondary educa...	UIS.THAGE.3.A.GPV	16.0	2016
# 4	Turkmenistan	TKM	Population of the official age for primary edu...	SP.PRM.TOTL.MA.IN	201184.0	2016

## 7.4.2.1. Government expenditure on education

In [None]:
# Which countries spend the largest fraction of GDP on education?

# For each country, average the `value` column over the rows with indicator
# code 'SE.XPD.TOTL.GD.ZS' and a year from 2010 through 2017 (inclusive),
# then sort so the highest average comes first.
# NOTE(review): the indicator code is presumably "government expenditure on
# education as % of GDP", going by this section's heading — confirm.
country_spend_pct_query = """
                          SELECT country_name, AVG(value) AS avg_ed_spending_pct
                          FROM `bigquery-public-data.world_bank_intl_education.international_education`
                          WHERE indicator_code = 'SE.XPD.TOTL.GD.ZS' and year >= 2010 and year <= 2017
                          GROUP BY country_name
                          ORDER BY avg_ed_spending_pct DESC
                          """

# Set up the query (cancel the query if it would use too much of
# your quota). The limit here is 10**10 bytes, i.e. 10 GB — ten times the
# 1 GB (10**9) limit used for the accidents query earlier in this notebook.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
country_spend_pct_query_job = client.query(country_spend_pct_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
country_spending_results = country_spend_pct_query_job.to_dataframe()

# View top few rows of results
print(country_spending_results.head())

#             country_name  avg_ed_spending_pct
# 0                   Cuba            12.837270
# 1  Micronesia, Fed. Sts.            12.467750
# 2        Solomon Islands            10.001080
# 3                Moldova             8.372153
# 4                Namibia             8.349610

## 7.4.2.2. Identify interesting codes to explore

In [None]:
# The last question started by telling you to focus on rows with the code 
# SE.XPD.TOTL.GD.ZS. But how would you find more interesting indicator 
# codes to explore?
# There are 1000s of codes in the dataset, so it would be time consuming to
# review them all. But many codes are available for only a few countries. 
# When browsing the options for different codes, you might restrict 
# yourself to codes that are reported by many countries.
# Write a query below that selects the indicator code and indicator name 
# for all codes with at least 175 rows in the year 2016.

# Count the 2016 rows per (indicator_code, indicator_name) pair, keep only
# the pairs with at least 175 rows, and list the most frequent first.
# HAVING filters on the aggregated count (after grouping), whereas WHERE
# filters individual rows (before grouping).
code_count_query = """
                   SELECT indicator_code, indicator_name, COUNT(1) AS num_rows
                   FROM `bigquery-public-data.world_bank_intl_education.international_education`
                   WHERE year = 2016
                   GROUP BY indicator_name, indicator_code
                   HAVING COUNT(1) >= 175
                   ORDER BY COUNT(1) DESC
                   """

# Set up the query (the job is limited to 10**10 billed bytes, i.e. 10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
code_count_query_job = client.query(code_count_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
code_count_results = code_count_query_job.to_dataframe()

# View top few rows of results
print(code_count_results.head())

#       indicator_code                      indicator_name  num_rows
# 0        SP.POP.TOTL                   Population, total       232
# 1        SP.POP.GROW        Population growth (annual %)       232
# 2     IT.NET.USER.P2     Internet users (per 100 people)       223
# 3  SP.POP.TOTL.FE.ZS     Population, female (% of total)       213
# 4  SP.POP.0014.TO.ZS  Population, ages 0-14 (% of total)       213
