In [1]:
import yaml
from google.cloud import bigquery


In [2]:
config = yaml.load(open("credentials.yml"), yaml.Loader)
client = bigquery.Client.from_service_account_json(config["credentials_path"])


In [8]:
# Reference to the dataset.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "openaq")
# Print names of all tables in the dataset (there's only one!)
print([table.table_id for table in client.list_tables(dataset_ref)])


['global_air_quality']


In [9]:
# Construct a reference to the "global_air_quality" table
table_ref = bigquery.TableReference(dataset_ref, "global_air_quality")
# API request - fetch the table
table = client.get_table(table_ref)
# Preview the first five lines of the "global_air_quality" table
client.list_rows(table, max_results=5).to_dataframe()


Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Borówiec, ul. Drapałka",Borówiec,PL,bc,0.85217,2022-04-28 07:00:00+00:00,µg/m³,GIOS,1.0,52.276794,17.074114,POINT(52.276794 1)
1,"Kraków, ul. Bulwarowa",Kraków,PL,bc,0.91284,2022-04-27 23:00:00+00:00,µg/m³,GIOS,1.0,50.069308,20.053492,POINT(50.069308 1)
2,"Płock, ul. Reja",Płock,PL,bc,1.41,2022-03-30 04:00:00+00:00,µg/m³,GIOS,1.0,52.550938,19.709791,POINT(52.550938 1)
3,"Elbląg, ul. Bażyńskiego",Elbląg,PL,bc,0.33607,2022-05-03 13:00:00+00:00,µg/m³,GIOS,1.0,54.167847,19.410942,POINT(54.167847 1)
4,"Piastów, ul. Pułaskiego",Piastów,PL,bc,0.51,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,52.191728,20.837489,POINT(52.191728 1)


In [10]:
# Query to select all the items from the "city" column where the "country" column is 'US'
query = """
    SELECT city
    FROM `bigquery-public-data.openaq.global_air_quality`
    WHERE country = 'US'
"""

# Set up the query
query_job = client.query(query)
# API request - run the query, and return a pandas DataFrame
us_cities = query_job.to_dataframe()
# What five cities have the most measurements?
us_cities.city.value_counts().head()


city
Phoenix-Mesa-Scottsdale                     39414
Los Angeles-Long Beach-Santa Ana            27479
Riverside-San Bernardino-Ontario            26887
New York-Northern New Jersey-Long Island    25417
San Francisco-Oakland-Fremont               22710
Name: count, dtype: int64

In [None]:
query = """
    SELECT city, country
    FROM `bigquery-public-data.openaq.global_air_quality`
    WHERE country = 'US'
"""

query_job = client.query(query)
cities_countries = query_job.to_dataframe()
cities_countries.head()


In [15]:
# Query to get the score column from every row where the type column has value "job"
query = """
    SELECT score, title
    FROM `bigquery-public-data.hacker_news.full`
    WHERE type = 'job' 
"""

# Create a QueryJobConfig object to estimate size of query without running it
dry_run_config = bigquery.QueryJobConfig(dry_run=True)
# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)
print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))


This query will process 553320240 bytes.


In [16]:
# Only run the query if it's less than 1 MB
ONE_MB = 1000 * 1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)
# Set up the query (will only run if it's less than 1 MB)
safe_query_job = client.query(query, job_config=safe_config)
# API request - try to run the query, and return a pandas DataFrame
safe_query_job.to_dataframe()


InternalServerError: 500 Query exceeded limit for bytes billed: 1000000. 553648128 or higher required.

Location: US
Job ID: dc19e7bc-74eb-4f6a-88b2-71df9da5ac73


In [17]:
# Only run the query if it's less than 1 GB
ONE_GB = 1000 * 1000 * 1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_GB)
# Set up the query (will only run if it's less than 1 GB)
safe_query_job = client.query(query, job_config=safe_config)
# API request - try to run the query, and return a pandas DataFrame
job_post_scores = safe_query_job.to_dataframe()
# Print average score for job posts
job_post_scores.score.mean()


1.7267060367454068

In [18]:
job_post_scores

Unnamed: 0,score,title
0,1,HelloFax (YC W11) is seeking its first hire. H...
1,1,ContainIQ (YC S21) Is Hiring an SDR – Founding...
2,1,Android Engineer? SendHub (YC W12) is growing ...
3,1,Rescale is hiring an applications engineer
4,1,Be a Performance Data Hacker at Bump Technologies
...,...,...
15843,5,Backend Developer at Listia
15844,7,[Chicago] Rails developer @ Inkling
15845,9,Mixpanel is looking for a great frontend devel...
15846,15,[SouthBay] Buxfer: looking for smart hackers t...


In [19]:
# Query to select countries with units of "ppm"
first_query = """
    SELECT DISTINCT country
    FROM `bigquery-public-data.openaq.global_air_quality`
    WHERE unit = 'ppm'
"""

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)  # 10 GB
first_query_job = client.query(first_query, job_config=safe_config)
first_results = first_query_job.to_dataframe()
first_results.head()


Unnamed: 0,country
0,AR
1,IL
2,TW
3,CO
4,EC


In [20]:
# Query to select all columns where pollution levels are exactly 0
zero_pollution_query = """
    SELECT *
    FROM `bigquery-public-data.openaq.global_air_quality`
    WHERE value = 0
"""

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(zero_pollution_query, job_config=safe_config)
zero_pollution_results = query_job.to_dataframe()
zero_pollution_results.head()


Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Zielonka, Bory Tucholskie",Zielonka,PL,bc,0.0,2022-04-29 14:00:00+00:00,µg/m³,GIOS,1.0,53.662136,17.933986,POINT(53.662136 1)
1,"Toruń, ul. Przy Kaszowniku",Toruń,PL,bc,0.0,2022-04-19 04:00:00+00:00,µg/m³,GIOS,1.0,53.017628,18.612808,POINT(53.017628 1)
2,"Kielce, ul. Targowa",Kielce,PL,bc,0.0,2022-05-07 17:00:00+00:00,µg/m³,GIOS,1.0,50.878998,20.633692,POINT(50.878998 1)
3,"Zielonka, Bory Tucholskie",Zielonka,PL,bc,0.0,2022-05-19 14:00:00+00:00,µg/m³,GIOS,1.0,53.662136,17.933986,POINT(53.662136 1)
4,"Koszalin, ul. Armii Krajowej",Koszalin,PL,bc,0.0,2022-05-12 20:00:00+00:00,µg/m³,GIOS,1.0,54.193986,16.172544,POINT(54.193986 1)
