In [1]:
import yaml
from google.cloud import bigquery


In [2]:
# Read the notebook's local settings file. The only key used below is
# "credentials_path", which is passed to BigQuery as the service-account JSON key file.
# The file is opened in a `with` block so the handle is closed as soon as parsing
# finishes, instead of being left open for the lifetime of the kernel.
# NOTE(review): yaml.Loader can construct arbitrary Python objects from tagged YAML.
# That is acceptable for a trusted local file; if credentials.yml could ever come from
# an untrusted source, switch to yaml.safe_load.
with open("credentials.yml") as config_file:
    config = yaml.load(config_file, yaml.Loader)

# Authenticate the BigQuery client with the service-account key named in the config.
client = bigquery.Client.from_service_account_json(config["credentials_path"])


In [3]:
# Point at the `accident_2015` table inside the public
# `nhtsa_traffic_fatalities` dataset.
dataset_ref = bigquery.DatasetReference(
    project="bigquery-public-data",
    dataset_id="nhtsa_traffic_fatalities",
)
table_ref = dataset_ref.table("accident_2015")

# Look the table up through the client, then display its first five rows as a DataFrame.
table = client.get_table(table_ref)
(
    client
    .list_rows(table, max_results=5)
    .to_dataframe()
)


Unnamed: 0,state_number,state_name,consecutive_number,number_of_vehicle_forms_submitted_all,number_of_motor_vehicles_in_transport_mvit,number_of_parked_working_vehicles,number_of_forms_submitted_for_persons_not_in_motor_vehicles,number_of_persons_not_in_motor_vehicles_in_transport_mvit,number_of_persons_in_motor_vehicles_in_transport_mvit,number_of_forms_submitted_for_persons_in_motor_vehicles,...,minute_of_ems_arrival_at_hospital,related_factors_crash_level_1,related_factors_crash_level_1_name,related_factors_crash_level_2,related_factors_crash_level_2_name,related_factors_crash_level_3,related_factors_crash_level_3_name,number_of_fatalities,number_of_drunk_drivers,timestamp_of_crash
0,30,Montana,300019,5,5,0,0,0,7,7,...,45,0,,0,,0,,1,0,2015-03-28 14:58:00+00:00
1,39,Ohio,390099,7,7,0,0,0,15,15,...,24,27,Backup Due to Prior Crash,0,,0,,1,0,2015-02-14 11:19:00+00:00
2,49,Utah,490123,16,16,0,0,0,28,28,...,99,0,,0,,0,,1,0,2015-04-14 12:24:00+00:00
3,48,Texas,481184,6,5,1,0,5,5,10,...,99,0,,0,,0,,1,0,2015-05-27 16:40:00+00:00
4,41,Oregon,410333,11,11,0,0,0,14,14,...,99,0,,0,,0,,1,0,2015-11-17 18:17:00+00:00


In [6]:
# How does the number of accidents change across the days of the week?
# Per the BigQuery documentation, DAYOFWEEK yields 1 (Sunday) through 7 (Saturday).
query = """
    SELECT COUNT(consecutive_number) AS num_accidents, EXTRACT(DAYOFWEEK FROM timestamp_of_crash) AS day_of_week
    FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
    GROUP BY day_of_week
    ORDER BY num_accidents DESC
"""

# Cap the billable bytes for this query so an unexpectedly large scan is not charged.
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 1_000_000_000  # 10**9 bytes = 1 GB

# Submit the query under that cap and pull the result set into a DataFrame.
query_job = client.query(query, job_config=safe_config)
query_results = query_job.to_dataframe()
query_results


Unnamed: 0,num_accidents,day_of_week
0,5659,7
1,5298,1
2,4916,6
3,4460,5
4,4182,4
5,4038,2
6,3985,3


In [7]:
# Point at the `international_education` table inside the public
# `world_bank_intl_education` dataset (this rebinds the reference names used earlier).
dataset_ref = bigquery.DatasetReference(
    project="bigquery-public-data",
    dataset_id="world_bank_intl_education",
)
table_ref = dataset_ref.table("international_education")

# Look the table up through the client, then display its first five rows as a DataFrame.
table = client.get_table(table_ref)
(
    client
    .list_rows(table, max_results=5)
    .to_dataframe()
)

Unnamed: 0,country_name,country_code,indicator_name,indicator_code,value,year
0,Chad,TCD,"Enrolment in lower secondary education, both s...",UIS.E.2,321921.0,2012
1,Chad,TCD,"Enrolment in upper secondary education, both s...",UIS.E.3,68809.0,2006
2,Chad,TCD,"Enrolment in upper secondary education, both s...",UIS.E.3,30551.0,1999
3,Chad,TCD,"Enrolment in upper secondary education, both s...",UIS.E.3,79784.0,2007
4,Chad,TCD,"Repeaters in primary education, all grades, bo...",UIS.R.1,282699.0,2006


In [11]:
# Which countries devote the largest share of GDP to education?
# Only rows with indicator code `SE.XPD.TOTL.GD.ZS` are considered. For each country,
# the `value` column is averaged over the years 2010 through 2017 (both endpoints
# included), and countries are listed from highest average to lowest.

query = """
    SELECT country_name, AVG(value) AS avg_ed_spending_pct
    FROM `bigquery-public-data.world_bank_intl_education.international_education`
    WHERE indicator_code = 'SE.XPD.TOTL.GD.ZS' AND year >= 2010 AND year <= 2017
    GROUP BY country_name
    ORDER BY avg_ed_spending_pct DESC
"""

# Cap the billable bytes for this query so an unexpectedly large scan is not charged.
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 1_000_000_000  # 10**9 bytes = 1 GB

# Submit the query under that cap and pull the result set into a DataFrame.
query_job = client.query(query, job_config=safe_config)
query_results = query_job.to_dataframe()
query_results

Unnamed: 0,country_name,avg_ed_spending_pct
0,Cuba,12.837270
1,"Micronesia, Fed. Sts.",12.467750
2,Solomon Islands,10.001080
3,Moldova,8.372153
4,Namibia,8.349610
...,...,...
152,Cambodia,1.706404
153,West Bank and Gaza,1.503760
154,South Sudan,1.409726
155,Monaco,1.409606


In [15]:
# List the indicator code and indicator name of every indicator that has
# at least 175 rows for the year 2016, most frequent first.

query = """
    SELECT indicator_code, indicator_name, COUNT(1) AS num_rows
    FROM `bigquery-public-data.world_bank_intl_education.international_education`
    WHERE year = 2016
    GROUP BY indicator_code, indicator_name
    HAVING num_rows >= 175
    ORDER BY num_rows DESC
"""

# Cap the billable bytes for this query so an unexpectedly large scan is not charged.
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 1_000_000_000  # 10**9 bytes = 1 GB

# Submit the query under that cap and pull the result set into a DataFrame.
query_job = client.query(query, job_config=safe_config)
query_results = query_job.to_dataframe()
query_results


Unnamed: 0,indicator_code,indicator_name,num_rows
0,SP.POP.TOTL,"Population, total",232
1,SP.POP.GROW,Population growth (annual %),232
2,IT.NET.USER.P2,Internet users (per 100 people),223
3,SP.POP.1564.TO,"Population, ages 15-64, total",213
4,SP.POP.1564.FE.IN,"Population, ages 15-64, female",213
5,SP.POP.0014.FE.IN,"Population, ages 0-14, female",213
6,SP.POP.1564.TO.ZS,"Population, ages 15-64 (% of total)",213
7,SP.POP.0014.MA.IN,"Population, ages 0-14, male",213
8,SP.POP.TOTL.FE.ZS,"Population, female (% of total)",213
9,SP.POP.0014.TO,"Population, ages 0-14, total",213
