# 8. ADVANCED SQL

# 8.2. ANALYTIC FUNCTIONS

# 8.2.1. COURS

## 8.2.1.1. SYNTAX

In [None]:
query = """
        SELECT *,
               AVG(time) OVER(
                              PARTITION BY id
                              ORDER BY date
                              ROWS BETWWEN 1 PRECEDING AND CURRENT ROW
                             ) as avg_time
        FROM 'bigquery-public-data.runners_train_time'
        """
# OVER : définit les ensembles de lignes utilisés dans chaque calcul.
# PARTITION BY : divise les lignes de la table en différents groupes. 
# Dans la requête ci-dessus, nous divisons par id afin que les calculs 
# soient séparés par coureur.
# ORDER BY : défini un ordre dans chaque partition. 
# Dans l'exemple de requête, le classement par colonne de date garanti 
# que les sessions de formation précédentes apparaissent en premier.
# window frame : identifie l'ensemble de lignes utilisé dans chaque calcul.
# Nous pouvons désigner ce groupe de lignes comme une fenêtre. 
# (En fait, les fonctions analytiques sont parfois appelées fonctions de 
# fenêtre analytique ou simplement fonctions de fenêtre!)

## 8.2.1.2. WINDOWS FRAME

In [None]:
# There are many ways to write window frame clauses:

# ROWS BETWEEN 1 PRECEDING AND CURRENT ROW 
# - the previous row and the current row.

# ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING 
# - the 3 previous rows, the current row, and the following row.

# ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING 
# - all rows in the partition.

## 8.2.1.3. 3 TYPES ANALYTIC FUNCTIONS

## 8.2.1.3.1. ANALYTIC AGGREGATE FUNCTIONS

In [None]:
# Aggregate functions take all of the values within the window as input 
# # and return a single value.
# - MIN() (or MAX()) - Returns the minimum (or maximum) of input values
# - AVG() (or SUM()) - Returns the average (or sum) of input values
# - COUNT() - Returns the number of rows in the input

## 8.2.1.3.2. ANALYTIC NAVIGATION FUNCTIONS

In [None]:
# Navigation functions assign a value based on the value in a (usually) 
# different row than the current row.
# - FIRST_VALUE() (or LAST_VALUE()) - Returns the first (or last) value 
#   in the input
# - LEAD() (and LAG()) - Returns the value on a subsequent (or preceding) 
#   row

## 8.2.1.3.3. ANALYTIC NUMBERING FUNCTIONS

In [None]:
# Numbering functions assign integer values to each row based on the 
# ordering.
# - ROW_NUMBER() - Returns the order in which rows appear in the input 
#   (starting with 1)
# - RANK() - All rows with the same value in the ordering column receive 
#   the same rank value, where the next row receives a rank value which 
#   increments by the number of rows with the previous rank value.

## 8.2.1.3.4. EXEMPLES

In [2]:
# Query to track beginning and ending stations on October 25, 2015, for each bike
start_end_query = """
                  SELECT bike_number,
                      TIME(start_date) AS trip_time,
                      FIRST_VALUE(start_station_id)
                          OVER (
                               PARTITION BY bike_number
                               ORDER BY start_date
                               ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                               ) AS first_station_id,
                      LAST_VALUE(end_station_id)
                          OVER (
                               PARTITION BY bike_number
                               ORDER BY start_date
                               ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                               ) AS last_station_id,
                      start_station_id,
                      end_station_id
                  FROM `bigquery-public-data.san_francisco.bikeshare_trips`
                  WHERE DATE(start_date) = '2015-10-25' 
                  """

# Run the query, and return a pandas DataFrame
start_end_result = client.query(start_end_query).result().to_dataframe()
start_end_result.head()

# 	bike_number	trip_time	first_station_id	last_station_id	start_station_id	end_station_id
# 0	22	13:25:00	2	16	2	16
# 1	25	11:43:00	77	51	77	60
# 2	25	12:14:00	77	51	60	51
# 3	29	14:59:00	46	74	46	60
# 4	29	21:23:00	46	74	60	74

# Since there is no PARTITION BY clause, the entire table is treated as a 
# single partition.
# The ORDER BY clause orders the rows by date, where earlier dates appear 
# first.
# By setting the window frame clause to ROWS BETWEEN UNBOUNDED PRECEDING 
# AND CURRENT ROW, we ensure that all rows up to and including the current 
# date are used to calculate the (cumulative) sum. 
# (Note: If you read the documentation, you'll see that this is the default 
# behavior, and so the query would return the same result if we left out 
# this window frame clause.)

NameError: name 'client' is not defined

In [None]:
# Query to track beginning and ending stations on October 25, 2015, for each bike
start_end_query = """
                  SELECT bike_number,
                      TIME(start_date) AS trip_time,
                      FIRST_VALUE(start_station_id)
                          OVER (
                               PARTITION BY bike_number
                               ORDER BY start_date
                               ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                               ) AS first_station_id,
                      LAST_VALUE(end_station_id)
                          OVER (
                               PARTITION BY bike_number
                               ORDER BY start_date
                               ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                               ) AS last_station_id,
                      start_station_id,
                      end_station_id
                  FROM `bigquery-public-data.san_francisco.bikeshare_trips`
                  WHERE DATE(start_date) = '2015-10-25' 
                  """

# Run the query, and return a pandas DataFrame
start_end_result = client.query(start_end_query).result().to_dataframe()
start_end_result.head()

# 	bike_number	trip_time	first_station_id	last_station_id	start_station_id	end_station_id
# 0	22	13:25:00	2	16	2	16
# 1	25	11:43:00	77	51	77	60
# 2	25	12:14:00	77	51	60	51
# 3	29	14:59:00	46	74	46	60
# 4	29	21:23:00	46	74	60	74

# The query uses both FIRST_VALUE() and LAST_VALUE() as analytic functions.
# The PARTITION BY clause breaks the data into partitions based on the 
# bike_number column. Since this column holds unique identifiers for the 
# bikes, this ensures the calculations are performed separately for each
# bike.
# The ORDER BY clause puts the rows within each partition in chronological 
# order.
# Since the window frame clause is ROWS BETWEEN UNBOUNDED PRECEDING AND 
# UNBOUNDED FOLLOWING, for each row, its entire partition is used to 
# perform the calculation. (This ensures the calculated values for rows in 
# the same partition are identical.)

# 8.2.2. EXERCICES

In [None]:
from google.cloud import bigquery

# Create a "Client" object
client = bigquery.Client()

# Construct a reference to the "chicago_taxi_trips" dataset
dataset_ref = client.dataset("chicago_taxi_trips", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the "taxi_trips" table
table_ref = dataset_ref.table("taxi_trips")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five lines of the table
client.list_rows(table, max_results=5).to_dataframe()

## 8.2.2.1. How can you predict the demand for taxis?

In [None]:
# Say you work for a taxi company, and you're interested in predicting the 
# demand for taxis. Towards this goal, you'd like to create a plot that 
# shows a rolling average of the daily number of taxi trips. Amend the 
# (partial) query below to return a DataFrame with two columns:
# - trip_date - contains one entry for each date from January 1, 2016, 
#   to December 31, 2017.
# - avg_num_trips - shows the average number of daily trips, calculated 
#   over a window including the value for the current date, along with the
#   values for the preceding 15 days and the following 15 days, as long as 
#   the days fit within the two-year time frame. For instance, when 
#   calculating the value in this column for January 5, 2016, the window 
#   will include the number of trips for the preceding 4 days, the current 
#   date, and the following 15 days.

In [None]:
# Fill in the blank below
avg_num_trips_query = """
                      WITH trips_by_day AS
                      (
                      SELECT DATE(trip_start_timestamp) AS trip_date,
                          COUNT(*) as num_trips
                      FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                      WHERE trip_start_timestamp >= '2016-01-01' AND trip_start_timestamp < '2018-01-01'
                      GROUP BY trip_date
                      ORDER BY trip_date
                      )
                      SELECT trip_date,
                          AVG(num_trips) 
                          OVER (
                               ORDER BY trip_date
                               ROWS BETWEEN 15 PRECEDING AND 15 FOLLOWING
                               ) AS avg_num_trips
                      FROM trips_by_day
                      """

# 	trip_date	avg_num_trips
# 0	2016-01-01	80461.937500
# 1	2016-01-02	80150.647059
# 2	2016-01-03	79419.611111
# 3	2016-01-04	79810.421053
# 4	2016-01-05	80293.900000

## 8.2.2.2. Can you separate and order trips by community area?

In [None]:
# The query below returns a DataFrame with three columns from the table: 
# `pickup_community_area`, `trip_start_timestamp`, and `trip_end_timestamp`.  
# Amend the query to return an additional column called `trip_number` 
# which shows the order in which the trips were taken from their 
# respective community areas.  So, the first trip of the day originating
# from community area 1 should receive a value of 1; the second trip of 
# the day from the same area should receive a value of 2.  Likewise, the 
# first trip of the day from community area 2 should receive a value of 1, 
# and so on.
# Note that there are many numbering functions that can be used to solve 
# this problem (depending on how you want to deal with trips that started 
# at the same time from the same community area); to answer this question, 
# please use the **RANK()** function.

In [None]:
trip_number_query = """
                    SELECT pickup_community_area,
                        trip_start_timestamp,
                        trip_end_timestamp,
                        RANK()
                            OVER (
                                  PARTITION BY pickup_community_area
                                  ORDER BY trip_start_timestamp
                                 ) AS trip_number
                    FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                    WHERE DATE(trip_start_timestamp) = '2017-05-01' 
                    """

trip_number_result = client.query(trip_number_query).result().to_dataframe()


## 8.2.2.3. How much time elapses between trips?

In [None]:
# The (partial) query in the code cell below shows, for each trip in the 
# selected time frame, the corresponding taxi_id, trip_start_timestamp, 
# and trip_end_timestamp.
# Your task in this exercise is to edit the query to include an additional 
# prev_break column that shows the length of the break (in minutes) that 
# the driver had before each trip started (this corresponds to the time 
# between trip_start_timestamp of the current trip and trip_end_timestamp 
# of the previous trip). Partition the calculation by taxi_id, and order 
# the results within each partition by trip_start_timestamp.

In [None]:
# Fill in the blanks below
reak_time_query = """
                   SELECT taxi_id,
                       trip_start_timestamp,
                       trip_end_timestamp,
                       TIMESTAMP_DIFF(
                           trip_start_timestamp, 
                           LAG(trip_end_timestamp, 1) OVER (PARTITION BY taxi_id ORDER BY trip_start_timestamp), 
                           MINUTE) as prev_break
                   FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                   WHERE DATE(trip_start_timestamp) = '2017-05-01' 
                   """

# break_time_result = client.query(break_time_query).result().to_dataframe()
