# 7. INTRODUCTION TO SQL

# 7.5. AS & WITH

# 7.5.1. LESSON

## 7.5.1.1. AS

In [1]:
# Use AS to rename the columns returned by your queries; this renaming is
# also known as aliasing.

In [None]:
# Count the rows of each Animal value and expose the count under the alias
# "Identifiant". The table path is wrapped in backticks because BigQuery
# treats a single-quoted path as a string literal, which is not valid in a
# FROM clause; the project id is spelled "bigquery-public-data" like the
# other queries in this file.
query = """
        SELECT Animal, COUNT(ID) As Identifiant
        FROM `bigquery-public-data.pet_records.pets`
        GROUP BY Animal
        """

## 7.5.1.2. WITH ... AS

In [None]:
# On its own, AS is a convenient way to clean up the data returned by your 
# query. 
# It's even more powerful when combined with WITH in what's called a 
# "common table expression".
# A common table expression (or CTE) is a temporary table that you return 
# within your query. CTEs are helpful for splitting your queries into 
# readable chunks, and you can write queries against them.

In [None]:
# The CTE "Seniors" keeps the ID and Name of the rows whose Years_old is
# greater than 5; the outer query then reads only the ID column from it.
# The table path is wrapped in backticks because BigQuery treats a
# single-quoted path as a string literal, which is not valid in a FROM
# clause; the project id is spelled "bigquery-public-data" like the other
# queries in this file.
query = """
        WITH Seniors AS
        (
            SELECT ID, Name
            FROM `bigquery-public-data.pet_records.pets`
            WHERE Years_old >5
        )
        SELECT ID
        FROM Seniors
        """

In [None]:
# Make the cell self-contained: nothing earlier in this file imports the
# BigQuery library or creates a client, so do both here before using them.
from google.cloud import bigquery

client = bigquery.Client()

# Query to select the number of transactions per date, sorted by date.
# The CTE "time" reduces each block_timestamp to its calendar date; the
# outer query counts the rows for each date.
query_with_CTE = """ 
                 WITH time AS 
                 (
                     SELECT DATE(block_timestamp) AS trans_date
                     FROM `bigquery-public-data.crypto_bitcoin.transactions`
                 )
                 SELECT COUNT(1) AS transactions,
                        trans_date
                 FROM time
                 GROUP BY trans_date
                 ORDER BY trans_date
                 """

# Set up the query (cancel the query if it would use too much of 
# your quota, with the limit set to 10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(query_with_CTE, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
transactions_by_date = query_job.to_dataframe()

# Show the first five rows (displayed as the cell's output)
transactions_by_date.head()

# 	transactions	trans_date
# 0	1	2009-01-03
# 1	14	2009-01-09
# 2	61	2009-01-10
# 3	93	2009-01-11
# 4	101	2009-01-12

In [None]:
# Index the results by date so the plot uses trans_date on the x-axis,
# then draw the transaction counts
daily_transactions = transactions_by_date.set_index('trans_date')
daily_transactions.plot()

# 7.5.2. EXERCISES

In [None]:
from google.cloud import bigquery

# Create the "Client" object used for the BigQuery API calls below
client = bigquery.Client()

# Construct a reference to the "chicago_taxi_trips" dataset of the
# "bigquery-public-data" project. The DatasetReference is built directly
# because Client.dataset() is deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "chicago_taxi_trips")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

## 7.5.2.1. Find the data

In [None]:
# Collect every table entry of the dataset into a list
tables = [*client.list_tables(dataset)]

# Print the ID of each table, one per line
for table in tables:
    print(table.table_id)

# Keep the name of the table under a variable
table_name = 'taxi_trips'

## 7.5.2.2. Peek at the data

In [None]:
# Build a reference to the "taxi_trips" table from the dataset reference
table_ref = dataset_ref.table("taxi_trips")

# API request - fetch the table
table = client.get_table(table_ref)

# Request at most five rows of the table, then show them as a DataFrame
# (displayed as the cell's output)
preview_rows = client.list_rows(table, max_results=5)
preview_rows.to_dataframe()

## 7.5.2.3. Determine when this data is from

In [None]:
# Cap the bytes billed at 10**10 so the query is cancelled if it would use
# too much of your quota
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10 ** 10)

# Count the trips for each year of trip_start_timestamp, ordered by year
rides_per_year_query = """
                       SELECT EXTRACT(YEAR FROM trip_start_timestamp) AS year, 
                              COUNT(1) AS num_trips
                       FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                       GROUP BY year
                       ORDER BY year
                       """

# Submit the query with the billing cap applied
rides_per_year_query_job = client.query(
    rides_per_year_query,
    job_config=safe_config,
)

# API request - run the query and collect the rows into a pandas DataFrame
rides_per_year_result = rides_per_year_query_job.to_dataframe()

# Show the result
print(rides_per_year_result)

#    year  num_trips
# 0  2013   27217716
# 1  2014   37395436
# 2  2015   32385875
# 3  2016   31759339
# 4  2017   24988003
# 5  2018   20732088
# 6  2019   16477365
# 7  2020    3419510

## 7.5.2.4. Dive slightly deeper

In [None]:
# Cap the bytes billed at 10**10 so the query is cancelled if it would use
# too much of your quota
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10 ** 10)

# Count the trips for each month of 2017, ordered by month
rides_per_month_query = """
                        SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month, 
                               COUNT(1) AS num_trips
                        FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2017
                        GROUP BY month
                        ORDER BY month
                        """

# Submit the query with the billing cap applied
rides_per_month_query_job = client.query(
    rides_per_month_query,
    job_config=safe_config,
)

# API request - run the query and collect the rows into a pandas DataFrame
rides_per_month_result = rides_per_month_query_job.to_dataframe()

# Show the result
print(rides_per_month_result)

#     month  num_trips
# 0       1    1972071
# 1       2    1909802
# 2       3    2362105
# 3       4    2194702
# 4       5    2323386
# 5       6    2324472
# 6       7    2054299
# 7       8    2079861
# 8       9    1950631
# 9      10    2141197
# 10     11    1907997
# 11     12    1767480

## 7.5.2.5. Write the query

In [None]:
# Cap the bytes billed at 10**10 so the query is cancelled if it would use
# too much of your quota
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10 ** 10)

# The CTE "RelevantRides" keeps the rides that start after 2017-01-01 and
# before 2017-07-01 and have positive trip_seconds and trip_miles. The outer
# query then reports, for each hour of the day, the number of trips and the
# average speed as 3600 * total miles / total seconds.
speeds_query = """
               WITH RelevantRides AS
               (
                   SELECT EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day, 
                          trip_miles, 
                          trip_seconds
                   FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                   WHERE trip_start_timestamp > '2017-01-01' AND 
                         trip_start_timestamp < '2017-07-01' AND 
                         trip_seconds > 0 AND 
                         trip_miles > 0
               )
               SELECT hour_of_day, 
                      COUNT(1) AS num_trips, 
                      3600 * SUM(trip_miles) / SUM(trip_seconds) AS avg_mph
               FROM RelevantRides
               GROUP BY hour_of_day
               ORDER BY hour_of_day
               """

# Submit the query with the billing cap applied
speeds_query_job = client.query(
    speeds_query,
    job_config=safe_config,
)

# API request - run the query and collect the rows into a pandas DataFrame
speeds_result = speeds_query_job.to_dataframe()

# Show the result
print(speeds_result)

#     hour_of_day  num_trips    avg_mph
# 0             0     319339  20.230524
# 1             1     266529  18.937621
# 2             2     210147  18.777070
# 3             3     159668  20.158048
# 4             4     122183  26.736014
# 5             5     119312  30.769172
# 6             6     182738  24.588313
# 7             7     358406  17.735967
# 8             8     541775  15.079892
# 9             9     565548  16.543882
# 10           10     525120  18.539614
# 11           11     594603  18.928379
# 12           12     622324  17.838745
# 13           13     630181  17.671089
# 14           14     622465  16.974239
# 15           15     640430  15.688418
# 16           16     701435  14.283888
# 17           17     756627  12.462955
# 18           18     768251  13.646810
# 19           19     701064  16.642882
# 20           20     598614  19.536777
# 21           21     552726  20.433874
# 22           22     501095  19.531374
# 23           23     399587  19.877046