# 7. INTRODUCTION TO SQL

# 7.2. SELECT, FROM & WHERE

# 7.2.1. LESSON

## 7.2.1.1. SELECT ... FROM

In [None]:
# The most basic SQL query selects a single column from a single table. To do this:
# - specify the column you want after the word SELECT, and then
# - specify the table after the word FROM.

In [None]:
# Template query: select the Name column from the pets table.
# The fully-qualified table id (project.dataset.table) is wrapped in
# backticks; single quotes would make it a string literal, not a table name.
# The "..." line is a placeholder for the rest of the query.
query = """
        SELECT Name
        FROM `bigquery-public-data.pet_records.pets`
        ...
        """

## 7.2.1.2. WHERE ...

In [None]:
# BigQuery datasets are large, so you'll usually want to return only the 
# rows meeting specific conditions. You can do this using the WHERE clause.

In [None]:
# Template query: select the Name column, keeping only rows whose Animal
# column equals 'Cat'.
# The fully-qualified table id (project.dataset.table) is wrapped in
# backticks; single quotes are reserved for string values such as 'Cat'.
# The "..." line is a placeholder for the rest of the query.
query = """
        SELECT Name
        FROM `bigquery-public-data.pet_records.pets`
        WHERE Animal = 'Cat'
        ...
        """

## 7.2.1.3. EXAMPLE

In [None]:
# Query to select every entry of the "city" column for the rows whose
# "country" column is 'US' (OpenAQ global_air_quality table).
# The table id is written in backticks as project.dataset.table.
query = """
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

## 7.2.1.4. Submitting the query to the dataset

In [None]:
# Create a "Client" object; every query in this notebook is sent through it
client = bigquery.Client()

In [None]:
# Set up the query: build a query job from the SQL text held in `query`
query_job = client.query(query)

In [None]:
# API request - run the query, and return the result as a pandas DataFrame
us_cities = query_job.to_dataframe()
# us_cities is now an ordinary pandas DataFrame, so it can be used like
# any other DataFrame.

In [None]:
# Which five cities have the most measurements?
# Count how often each city occurs and keep the five most frequent.
us_cities["city"].value_counts().head(5)

# Phoenix-Mesa-Scottsdale                     88
# Houston                                     82
# Los Angeles-Long Beach-Santa Ana            68
# Riverside-San Bernardino-Ontario            60
# New York-Northern New Jersey-Long Island    60
# Name: city, dtype: int64

## 7.2.1.5. More queries

In [None]:
# If you want multiple columns, list them after SELECT with a comma between
# the names:
query = """
        SELECT city, country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

In [None]:
# You can select all columns at once by writing * instead of column names:
query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

## 7.2.1.6. Q&A: Notes on formatting

In [2]:
# Question: What's up with the triple quotation marks (""")?
# Answer: These tell Python that everything inside them is a single string,
# even though we have line breaks in it. The line breaks aren't necessary, 
# but they make it easier to read your query.

In [3]:
# Question: Do you need to capitalize SELECT and FROM?
# Answer: No, SQL doesn't care about capitalization. However, it's 
# customary to capitalize your SQL commands, and it makes your queries a 
# bit easier to read.

## 7.2.1.7. Working with big datasets

In [4]:
# Here is how to avoid scanning too much data at once, so that you don't 
# run over your limit.
# To begin, you can estimate the size of any query before running it. 
# Here is an example using the (very large!) Hacker News dataset. 
# To see how much data a query will scan, we create a QueryJobConfig 
# object and set the dry_run parameter to True.

In [None]:
# Query to get the score and title columns from every row where the type
# column has value "job"
query = """
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job" 
        """

# Create a QueryJobConfig object to estimate size of query without running it
# (dry_run=True asks only for the estimate)
dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)

# Report how many bytes the query would scan
print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))

# Output:
# This query will process 429036722 bytes.

In [None]:
# You can also specify a parameter when running the query to limit how
# much data you are willing to scan. Here's an example with a low limit.

# Only run the query if it's less than 1 MB (decimal: 10**6 bytes)
ONE_MB = 1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)

# Set up the query (will only run if it's less than 1 MB)
safe_query_job = client.query(query, job_config=safe_config)

# API request - try to run the query, and return a pandas DataFrame.
# With this low limit the call fails; the recorded error is shown below.
safe_query_job.to_dataframe()

# Output:
# InternalServerError: 500 Query exceeded limit for bytes billed: 1000000. 
# 429916160 or higher required.

In [None]:
# In the previous cell the query was cancelled, because the limit of 1 MB
# was exceeded.
# However, we can increase the limit to run the query successfully!

# New limit: 1 GB (decimal: 10**9 bytes)
ONE_GB = 1000*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_GB)

# Set up the query (will only run if it's less than 1 GB)
safe_query_job = client.query(query, job_config=safe_config)

# API request - try to run the query, and return a pandas DataFrame
job_post_scores = safe_query_job.to_dataframe()

# Average of the "score" column over the returned job posts
job_post_scores.score.mean()

# Output:
# 1.874250078939059

# 7.2.2. EXERCISES

In [None]:
# Try writing some SELECT statements of your own to explore a large 
# dataset of air pollution measurements.

In [None]:
# Create a "Client" object
client = bigquery.Client()

# Construct a reference to the "openaq" dataset in the
# "bigquery-public-data" project. The DatasetReference constructor is used
# because Client.dataset() is deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "openaq")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the "global_air_quality" table
table_ref = dataset_ref.table("global_air_quality")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five lines of the "global_air_quality" table
client.list_rows(table, max_results=5).to_dataframe()

# 	location	city	country	pollutant	value	timestamp	unit	source_name	latitude	longitude	averaged_over_in_hours
# 0	BTM Layout, Bengaluru - KSPCB	Bengaluru	IN	co	910.00	2018-02-22 03:00:00+00:00	µg/m³	CPCB	12.912811	77.60922	0.25
# 1	BTM Layout, Bengaluru - KSPCB	Bengaluru	IN	no2	131.87	2018-02-22 03:00:00+00:00	µg/m³	CPCB	12.912811	77.60922	0.25
# 2	BTM Layout, Bengaluru - KSPCB	Bengaluru	IN	o3	15.57	2018-02-22 03:00:00+00:00	µg/m³	CPCB	12.912811	77.60922	0.25
# 3	BTM Layout, Bengaluru - KSPCB	Bengaluru	IN	pm25	45.62	2018-02-22 03:00:00+00:00	µg/m³	CPCB	12.912811	77.60922	0.25
# 4	BTM Layout, Bengaluru - KSPCB	Bengaluru	IN	so2	4.49	2018-02-22 03:00:00+00:00	µg/m³	CPCB	12.912811	77.60922	0.25

## 7.2.2.1. Units of measurement

In [None]:
# Which countries have reported pollution levels in units of "ppm"?
# In the code cell below, set first_query to an SQL query that pulls the
# appropriate entries from the country column.

# One row per matching measurement, so a country can appear many times
first_query = """
              SELECT country
              FROM `bigquery-public-data.openaq.global_air_quality`
              WHERE unit = "ppm"
              """

# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 10 GB, i.e. 10**10 bytes)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
first_query_job = client.query(first_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
first_results = first_query_job.to_dataframe()

# View top few rows of results
print(first_results.head())

#   country
# 0      US
# 1      US
# 2      US
# 3      US
# 4      US

In [None]:
# OR
# To get each country just once, you could use SELECT DISTINCT:
first_query = """
              SELECT DISTINCT country
              FROM `bigquery-public-data.openaq.global_air_quality`
              WHERE unit = "ppm"
              """
# Sample output:
#   country
# 0      US
# 1      CL
# 2      AU
# 3      BM
# 4      MX

## 7.2.2.2. High air quality

In [None]:
# Which pollution levels were reported to be exactly 0?
# - Set zero_pollution_query to select all columns of the rows where the
#   value column is 0.
# - Set zero_pollution_results to a pandas DataFrame containing the query
#   results.

# Query to select all columns of the rows where the value column is exactly 0
zero_pollution_query = """
                       SELECT *
                       FROM `bigquery-public-data.openaq.global_air_quality`
                       WHERE value = 0
                       """
# Set up the query, capping the bytes billed at 10**10 (10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(zero_pollution_query, job_config=safe_config)

# API request - run the query and return a pandas DataFrame
zero_pollution_results = query_job.to_dataframe()

# Show the first rows of the result
print(zero_pollution_results.head())

#                                         location     city country pollutant  \
# 0                     Victoria Memorial - WBSPCB  Kolkata      IN      pm25   
# 1  Rabindra Bharati University, Kolkata - WBSPCB  Kolkata      IN       so2   
# 2                   Zamość ul. Hrubieszowska 69A   Zamość      PL       no2   
# 3                               Końskie, MOBILNA  Końskie      PL      pm10   
# 4                               Końskie, MOBILNA  Końskie      PL      pm25   

#    value                 timestamp   unit source_name   latitude  longitude  \
# 0    0.0 2017-10-16 20:45:00+00:00  µg/m³        CPCB  22.572645  88.363890   
# 1    0.0 2017-10-28 14:30:00+00:00  µg/m³        CPCB  22.627874  88.380400   
# 2    0.0 2020-05-19 05:00:00+00:00  µg/m³        GIOS  50.716630  23.290247   
# 3    0.0 2018-12-21 13:00:00+00:00  µg/m³        GIOS  51.189526  20.408892   
# 4    0.0 2018-12-21 13:00:00+00:00  µg/m³        GIOS  51.189526  20.408892   

#    averaged_over_in_hours  
# 0                    0.25  
# 1                    0.25  
# 2                     NaN  
# 3                     NaN  
# 4                     NaN  