In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Import a custom helper function, `client`, which returns an authorized BigQuery client object with the right credentials.
# Because `client` is a function (defined in bq_sa_auth.py), every use of the client needs an extra pair of parentheses: client().

from bq_sa_auth import client

### *Foundational components of SQL: SELECT, FROM, WHERE*

### **[Intro to SQL](https://www.kaggle.com/code/dansbecker/select-from-where)**

#### Let's work with the OpenAQ dataset about air quality.

In [2]:
# Build a reference to the public "openaq" dataset and make an API request to fetch it.
# The reference is constructed directly with DatasetReference(project, dataset_id)
# because Client.dataset() is deprecated in google-cloud-bigquery.
OAQ_ref = bigquery.DatasetReference("bigquery-public-data", "openaq")
OAQ_DS = client().get_dataset(OAQ_ref)

# List the tables in the dataset so they can be counted and named.
tables = list(client().list_tables(OAQ_DS))

# Report how many tables there are, then print each table's id on its own line.
print(f'The number of tables in the OpenAQ dataset are: {len(tables)}')
print("\n")
print('The name(s) of the tables in the OpenAq are:')
print("\n")
for tab in tables:
    print(tab.table_id)

The number of tables in the OpenAQ dataset are: 1


The name(s) of the tables in the OpenAq are:


global_air_quality


In [5]:
# The dataset holds a single table, so fetch it and inspect its schema.

# Reference to the table inside the OpenAQ dataset reference.
GAQ_tab_ref = OAQ_ref.table("global_air_quality")

# API request - fetch the table itself.
GAQ_table = client().get_table(GAQ_tab_ref)

# Trailing expression: the notebook displays the table's schema fields.
GAQ_table.schema

[SchemaField('location', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('city', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('country', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('pollutant', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('value', 'FLOAT', 'NULLABLE', None, None, (), None),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('unit', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('source_name', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('latitude', 'FLOAT', 'NULLABLE', None, None, (), None),
 SchemaField('longitude', 'FLOAT', 'NULLABLE', None, None, (), None),
 SchemaField('averaged_over_in_hours', 'FLOAT', 'NULLABLE', None, None, (), None),
 SchemaField('location_geom', 'GEOGRAPHY', 'NULLABLE', None, None, (), None)]

In [6]:
# Preview the first five rows of the table as a pandas DataFrame.
preview_rows = client().list_rows(GAQ_table, max_results=5)
preview_rows.to_dataframe()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Borówiec, ul. Drapałka",Borówiec,PL,bc,0.85217,2022-04-28 07:00:00+00:00,µg/m³,GIOS,1.0,52.276794,17.074114,POINT(52.276794 1)
1,"Kraków, ul. Bulwarowa",Kraków,PL,bc,0.91284,2022-04-27 23:00:00+00:00,µg/m³,GIOS,1.0,50.069308,20.053492,POINT(50.069308 1)
2,"Płock, ul. Reja",Płock,PL,bc,1.41,2022-03-30 04:00:00+00:00,µg/m³,GIOS,1.0,52.550938,19.709791,POINT(52.550938 1)
3,"Elbląg, ul. Bażyńskiego",Elbląg,PL,bc,0.33607,2022-05-03 13:00:00+00:00,µg/m³,GIOS,1.0,54.167847,19.410942,POINT(54.167847 1)
4,"Piastów, ul. Pułaskiego",Piastów,PL,bc,0.51,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,52.191728,20.837489,POINT(52.191728 1)


In [9]:
# Select every entry of the "city" column for rows whose "country" column is 'ES' (Spain).

query = """
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'ES'
        """

# Submit the query to BigQuery.
query_job = client().query(query)

# API request - run the query and collect the result as a pandas DataFrame.
es_cities = query_job.to_dataframe()

# Show the first rows of the result.
es_cities.head()

Unnamed: 0,city
0,Zamora
1,Zamora
2,Guadalajara
3,Zamora
4,Zamora


In [10]:
# Count the rows per Spanish city and show the cities with the most air quality measurements.

es_cities["city"].value_counts().head()

Madrid                    10328
Santa Cruz de Tenerife     6831
Valencia/València          6787
Asturias                   6402
Castellón/Castelló         5117
Name: city, dtype: int64

In [12]:
# To select several columns, list their names separated by commas: SELECT city, location, ... .
# Use * to retrieve all the columns.

es_query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'ES'
        """

# Submit the query to BigQuery.
query_job_es = client().query(es_query)

# API request - run the query and collect the result as a pandas DataFrame.
es_oaq_df = query_job_es.to_dataframe()

# Show the first rows of the result.
es_oaq_df.head()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,ES1927A,Zamora,ES,co,300.0,2022-04-30 06:00:00+00:00,µg/m³,EEA Spain,1.0,41.509722,-5.746389,POINT(41.509722219467 1)
1,ES1535A,Albacete,ES,co,500.0,2021-12-30 00:00:00+00:00,µg/m³,EEA Spain,1.0,38.97928,-1.85213,POINT(38.9792799994747 1)
2,ES1537A,Guadalajara,ES,co,230.0,2022-05-09 04:00:00+00:00,µg/m³,EEA Spain,1.0,40.62984,-3.17159,POINT(40.6298399994692 1)
3,ES1537A,Guadalajara,ES,co,350.0,2022-04-20 01:00:00+00:00,µg/m³,EEA Spain,1.0,40.62984,-3.17159,POINT(40.6298399994692 1)
4,ES1537A,Guadalajara,ES,co,240.0,2022-05-13 05:00:00+00:00,µg/m³,EEA Spain,1.0,40.62984,-3.17159,POINT(40.6298399994692 1)


In [13]:
# The size of a query can be estimated before running it by submitting it with a dry-run job configuration.
# The query below selects six columns for the rows whose "country" column is 'ES'.

query = """
        SELECT city, pollutant, value,unit, latitude, longitude
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'ES'
        """

# Job configuration that asks for a dry run, i.e. a size estimate without running the query.
dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# API request - dry-run the query to estimate its cost.
dry_run_query_job = client().query(query, job_config=dry_run_config)

# Convert the estimated byte count to megabytes (10**6 bytes) for the report.
megabytes_processed = dry_run_query_job.total_bytes_processed / 10**6
print(f"This query will process {megabytes_processed} Mega bytes.")

This query will process 303.71036 Mega bytes.


### The dry run estimates that this query will process roughly 304 MB. We can also set a parameter when running a query to limit how much data we are willing to scan. If the query would exceed that limit, it is cancelled!

In [14]:
# Cap the bytes billed at 1 MB so the query only runs if it stays below that size.
ONE_MB = 10**6

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)

# Submit the query under the 1 MB cap.
safe_query_job = client().query(query, job_config=safe_config)

# API request - try to run the query and return a pandas DataFrame.
# This call is expected to raise here: the query needs far more than 1 MB,
# so it is rejected for exceeding the bytes-billed limit.
safe_query_job.to_dataframe()

InternalServerError: 500 Query exceeded limit for bytes billed: 1000000. 304087040 or higher required.

Location: US
Job ID: dc172c6c-8701-456b-8a5f-8359b2a918a7


In [15]:
# Raise the cap to 1 GB so the query only runs if it stays below that size.
ONE_GB = 10**9

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_GB)

# Submit the query under the 1 GB cap.
safe_query_job = client().query(query, job_config=safe_config)

# API request - try to run the query and keep the result as a pandas DataFrame.
es2_df = safe_query_job.to_dataframe()

# Preview the first rows of the result.
es2_df.head()

Unnamed: 0,city,pollutant,value,unit,latitude,longitude
0,Zamora,co,300.0,µg/m³,1.0,41.509722
1,Zamora,co,400.0,µg/m³,1.0,41.509722
2,Guadalajara,co,230.0,µg/m³,1.0,40.62984
3,Zamora,co,400.0,µg/m³,1.0,41.509722
4,Zamora,co,300.0,µg/m³,1.0,41.509722


In [16]:
# The frame mixes several pollutants, so a mean over all rows would not be meaningful.
# Restrict the calculation to the carbon monoxide ('co') rows.
co_rows = es2_df.loc[es2_df["pollutant"] == "co"]
mean_co = co_rows["value"].mean()

# Read the unit from a CO row rather than from row 0 of the whole frame,
# which is not guaranteed to be a CO measurement.
# NOTE(review): assumes every CO row is reported in the same unit - confirm before relying on the mean.
unit = co_rows["unit"].iloc[0]

print(f"The mean carbon monoxide level in Spain is around {mean_co:.2f} {unit}")

The mean carbon di-oxide level in Spain is around 292.44 µg/m³
