In [1]:
from google.cloud import bigquery

  from pkg_resources import get_distribution


In [3]:
# create clinet object
client = bigquery.Client("intsql-2025")



In [4]:
# construct ref to openaq dataset
dataset_ref = client.dataset("openaq", project="bigquery-public-data")

In [5]:
# Api req, fetch the dataset
dataset = client.get_dataset(dataset_ref)

In [7]:
# List all tables in openaq dataset
tables = list(client.list_tables(dataset))

# and print all tables' name in the dataset
for table in tables:
    print(table.table_id)

global_air_quality


In [11]:
# construct a ref to the table "global_air_quality"
table_ref = dataset_ref.table("global_air_quality")

# api req, fetch the table
table = client.get_table(table_ref)

# preview the first five lines
client.list_rows(table, max_results=5).to_dataframe()

  client.list_rows(table, max_results=5).to_dataframe()


Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Borówiec, ul. Drapałka",Borówiec,PL,bc,0.85217,2022-04-28 07:00:00+00:00,µg/m³,GIOS,1.0,52.276794,17.074114,POINT(52.276794 1)
1,"Kraków, ul. Bulwarowa",Kraków,PL,bc,0.91284,2022-04-27 23:00:00+00:00,µg/m³,GIOS,1.0,50.069308,20.053492,POINT(50.069308 1)
2,"Płock, ul. Reja",Płock,PL,bc,1.41,2022-03-30 04:00:00+00:00,µg/m³,GIOS,1.0,52.550938,19.709791,POINT(52.550938 1)
3,"Elbląg, ul. Bażyńskiego",Elbląg,PL,bc,0.33607,2022-05-03 13:00:00+00:00,µg/m³,GIOS,1.0,54.167847,19.410942,POINT(54.167847 1)
4,"Piastów, ul. Pułaskiego",Piastów,PL,bc,0.51,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,52.191728,20.837489,POINT(52.191728 1)


In [12]:
# Query to select all the items from the "city" column where the "country" column is 'US
query = """ 
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

In [13]:
# setting up the query
query_job = client.query(query)

In [14]:
# API request - run the query, and return a pandas DataFrame
us_cities = query_job.to_dataframe()

In [17]:
us_cities.city.value_counts().head()

city
Phoenix-Mesa-Scottsdale                     39414
Los Angeles-Long Beach-Santa Ana            27479
Riverside-San Bernardino-Ontario            26887
New York-Northern New Jersey-Long Island    25417
San Francisco-Oakland-Fremont               22710
Name: count, dtype: int64

 multiple columns
```python
query = """
        SELECT city, country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """
```
or selecting all columns
```python
query = """
        SELECT * 
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """
```

#### Working with big datasets
It is better to know how to estimate the query's size. Her is how to do it:

In [18]:
# Query to get the score column from every row where the type column has value "job"
query = """ 
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job"
        """

# Create a QueryJobConfig object to estimate size of query without running it
dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)

print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))

This query will process 716584537 bytes.


#### Why we better know the size of a query before running it?
1. Cost Control (Most Significant Factor)
2. Performance and Efficiency
3. reventing Full Table Scans

In [19]:
# specify a parameter when running the query to limit how much data you are willing to scan
# only run query if it is less than 1 mb
ONE_MB = 1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)

# Set up the query (will only run if it's less than 1 MB)
safe_query_job = client.query(query, job_config=safe_config)

# API request - try to run the query, and return a pandas DataFrame
safe_query_job.to_dataframe()

InternalServerError: 500 Query exceeded limit for bytes billed: 1000000. 717225984 or higher required.

(job ID: 0f517f95-0ac0-4c3a-b2f6-8fef13e44c33)

             -----Query Job SQL Follows-----             

    |    .    |    .    |    .    |    .    |    .    |
   1: 
   2:        SELECT score, title
   3:        FROM `bigquery-public-data.hacker_news.full`
   4:        WHERE type = "job"
   5:        
    |    .    |    .    |    .    |    .    |    .    |

The query was cancelled, because the limit of 1 MB was exceeded. However, we can increase the limit to run the query successfully!

In [21]:
one_gb = 1000*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=one_gb)

safe_query_job = client.query(query, job_config=safe_config)

job_post_scores = safe_query_job.to_dataframe()

# Print average score for job posts
job_post_scores.score.mean()

np.float64(1.6455829151592951)

Which countries have reported pollution levels in units of "ppm"? In the code cell below, set first_query to an SQL query that pulls the appropriate entries from the country column.

In [22]:
ppm_query = """
            SELECT country
            FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE unit = "ppm"
            """

# Set up the query (cancel the query if it would use too much of 
# the quota, with the limit set to 10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
ppm_query_job = client.query(ppm_query, job_config=safe_config)


# API request - run the query, and return a pandas DataFrame
results = ppm_query_job.to_dataframe()

# View top few rows of results
print(results.head())

  country
0      AR
1      AR
2      AR
3      AR
4      CO
