# 8. ADVANCED SQL

# 8.1. JOINS AND UNIONS

# 8.1.1. COURS

## 8.1.1.1. JOINS

## 8.1.1.1.1. INNER JOIN

In [None]:
# INNER JOIN: keep only the rows that have a match in both the left and the
# right table.
# Table names are wrapped in backticks: in BigQuery, single quotes would turn
# them into string literals instead of table references.
query = """
        SELECT o.Name AS Owner_Name, p.Name AS Pet_Name
        FROM `bigquery-public-data.pet_records.owners` AS o
        INNER JOIN `bigquery-public-data.pet_records.pets` AS p
        ON p.ID = o.Pet_ID
        """
# Owner_Name       Pet_Name
# Aubrey Little    Dr Harris bonkers
# Magnus Burnsides Moon
# Chett Crawfich   Ripley
# Jules Spinner    Tom

## 8.1.1.1.2. LEFT JOIN

In [None]:
# LEFT JOIN: keep every row of the left table (owners); columns coming from
# the right table (pets) are NULL when there is no match.
# Table names are wrapped in backticks: in BigQuery, single quotes would turn
# them into string literals instead of table references.
query = """
        SELECT o.Name AS Owner_Name, p.Name AS Pet_Name
        FROM `bigquery-public-data.pet_records.owners` AS o
        LEFT JOIN `bigquery-public-data.pet_records.pets` AS p
        ON p.ID = o.Pet_ID
        """
# Owner_Name       Pet_Name
# Aubrey Little    Dr Harris bonkers
# Magnus Burnsides Moon
# Chett Crawfich   Ripley
# Jules Spinner    Tom
# Veronica Dunn    NULL

## 8.1.1.1.3. RIGHT JOIN

In [None]:
# RIGHT JOIN: keep every row of the right table (pets); columns coming from
# the left table (owners) are NULL when there is no match.
# Table names are wrapped in backticks: in BigQuery, single quotes would turn
# them into string literals instead of table references.
query = """
        SELECT o.Name AS Owner_Name, p.Name AS Pet_Name
        FROM `bigquery-public-data.pet_records.owners` AS o
        RIGHT JOIN `bigquery-public-data.pet_records.pets` AS p
        ON p.ID = o.Pet_ID
        """
# Owner_Name       Pet_Name
# Aubrey Little    Dr Harris bonkers
# Magnus Burnsides Moon
# Chett Crawfich   Ripley
# Jules Spinner    Tom
# NULL             Maisie

## 8.1.1.1.4. FULL JOIN

In [None]:
# Tous les éléments de toutes les tables


In [None]:
# FULL JOIN: keep every row of both tables; the columns of the side that has
# no match are NULL.
# Table names are wrapped in backticks: in BigQuery, single quotes would turn
# them into string literals instead of table references.
query = """
        SELECT o.Name AS Owner_Name, p.Name AS Pet_Name
        FROM `bigquery-public-data.pet_records.owners` AS o
        FULL JOIN `bigquery-public-data.pet_records.pets` AS p
        ON p.ID = o.Pet_ID
        """
# Owner_Name       Pet_Name
# Aubrey Little    Dr Harris bonkers
# Magnus Burnsides Moon
# Chett Crawfich   Ripley
# Jules Spinner    Tom
# Veronica Dunn    NULL
# NULL             Maisie

## 8.1.1.2. UNIONS

## 8.1.1.2.1. UNION ALL

In [None]:
# UNION ALL: stack the rows returned by both queries, duplicates included.
# Table names are wrapped in backticks: in BigQuery, single quotes would turn
# them into string literals instead of table references.
query = """
        SELECT Age FROM `bigquery-public-data.pet_records.owners`
        UNION ALL
        SELECT Age FROM `bigquery-public-data.pet_records.pets`
        """
# CAUTION: both SELECTs must return the same number of columns, with
# compatible data types at each position; the column names may differ.
# NOTE: the result may contain duplicate rows.

## 8.1.1.2.2. UNION DISTINCT

In [None]:
# UNION DISTINCT: stack the rows returned by both queries and drop the
# duplicates, so every value appears only once.
# Table names are wrapped in backticks: in BigQuery, single quotes would turn
# them into string literals instead of table references.
query = """
        SELECT Age FROM `bigquery-public-data.pet_records.owners`
        UNION DISTINCT
        SELECT Age FROM `bigquery-public-data.pet_records.pets`
        """
# CAUTION: both SELECTs must return the same number of columns, with
# compatible data types at each position; the column names may differ.

## 8.1.1.3. EXEMPLES

In [None]:
# Stories posted on January 1, 2012, each paired with its number of comments.
# The CTE `c` counts comment rows per `parent`; the LEFT JOIN keeps stories
# that have no row in `c`, so their num_comments stays NULL.
join_query = """
             WITH c AS
             (
             SELECT parent, COUNT(*) as num_comments
             FROM `bigquery-public-data.hacker_news.comments` 
             GROUP BY parent
             )
             SELECT s.id as story_id, s.by, s.title, c.num_comments
             FROM `bigquery-public-data.hacker_news.stories` AS s
             LEFT JOIN c
             ON s.id = c.parent
             WHERE EXTRACT(DATE FROM s.time_ts) = '2012-01-01'
             ORDER BY c.num_comments DESC
             """

# Submit the query, wait for its rows, then load them into a pandas DataFrame
join_job = client.query(join_query)
join_rows = join_job.result()
join_result = join_rows.to_dataframe()

# Show the first rows (the most commented stories, given the ORDER BY above)
join_result.head()

# 	story_id	by	title	num_comments
# 0	3412900	whoishiring	Ask HN: Who is Hiring? (January 2012)	154.0
# 1	3412901	whoishiring	Ask HN: Freelancer? Seeking freelancer? (Janua...	97.0
# 2	3412643	jemeshsu	Avoid Apress	30.0
# 3	3412891	Brajeshwar	There's no shame in code that is simply "good ...	27.0
# 4	3414012	ramanujam	Impress.js - a Prezi like implementation using...	27.0

In [None]:
# None of these stories received any comments: the query orders by
# num_comments DESC, so the last rows are the stories the LEFT JOIN kept
# without a matching comment count (displayed as NaN in the output below).
join_result.tail()

# 	story_id	by	title	num_comments
# 439	3413269	wireheadlance	On 2011: The Year of System Failure	NaN
# 440	3412995	benjlang	Android app search engine	NaN
# 441	3413806	jensnockert	Nyan Cat progress indicators for your Mac	NaN
# 442	3412769	syedmuddassar	Ballroom Dance Shoes	NaN
# 443	3413361	learnphp123	Easy AJAX Pagination Using JQuery CakePHP	NaN

In [None]:
# Users who posted at least one story or comment on January 1, 2014.
# Each SELECT returns the `by` column of one table for that date, and
# UNION DISTINCT merges both lists while removing duplicate user names.
union_query = """
              SELECT c.by
              FROM `bigquery-public-data.hacker_news.comments` AS c
              WHERE EXTRACT(DATE FROM c.time_ts) = '2014-01-01'
              UNION DISTINCT
              SELECT s.by
              FROM `bigquery-public-data.hacker_news.stories` AS s
              WHERE EXTRACT(DATE FROM s.time_ts) = '2014-01-01'
              """

# Submit the query, wait for its rows, then load them into a pandas DataFrame
union_job = client.query(union_query)
union_rows = union_job.result()
union_result = union_rows.to_dataframe()

# Show the first rows of the result
union_result.head()

# 	by
# 0	vitd
# 1	gpvos
# 2	mbell
# 3	znowi
# 4	GSimon

In [None]:
# Number of users who posted stories or comments on January 1, 2014
# (row count of the UNION DISTINCT result, i.e. one row per distinct user)
len(union_result)

# 2282

# 8.1.2. EXERCICES

In [None]:
from google.cloud import bigquery

# Create the "Client" object used for every BigQuery API call in this section
client = bigquery.Client()

# Build a reference to the "stackoverflow" dataset of the
# "bigquery-public-data" project. The DatasetReference is constructed
# directly because Client.dataset() is deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "stackoverflow")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Build a reference to the "posts_questions" table
table_ref = dataset_ref.table("posts_questions")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five rows of the table as a pandas DataFrame
client.list_rows(table, max_results=5).to_dataframe()

In [None]:
# Construct a reference to the "posts_answers" table.
# This rebinds `table_ref` and `table`, which previously pointed at
# "posts_questions"; `dataset_ref` comes from the setup cell above.
table_ref = dataset_ref.table("posts_answers")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five rows of the table as a pandas DataFrame
client.list_rows(table, max_results=5).to_dataframe()

## 8.1.2.1. How long does it take for questions to receive answers?

In [None]:
# You're interested in exploring the data to have a better understanding 
# of how long it generally takes for questions to receive answers. 
# Armed with this knowledge, you plan to use this information to better 
# design the order in which questions are presented to Stack Overflow users.

# With this goal in mind, you write the query below, which focuses on 
# questions asked in January 2018. 
# It returns a table with two columns:
#  - q_id - the ID of the question
#  - time_to_answer - how long it took (in seconds) for the question to 
#    receive an answer

In [None]:
# Starting query of the exercise: for each question asked in January 2018,
# the shortest delay (in seconds) between the question and one of its answers.
# NOTE: the INNER JOIN only keeps questions that have at least one answer,
# which is why the reported percentage of answered questions is 100%; the
# exercise asks to change the JOIN type so unanswered questions are included.
first_query = """
              SELECT q.id AS q_id,
                  MIN(TIMESTAMP_DIFF(a.creation_date, q.creation_date, SECOND)) as time_to_answer
              FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
                    INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
              ON q.id = a.parent_id
              WHERE q.creation_date >= '2018-01-01' and q.creation_date < '2018-02-01'
              GROUP BY q_id
              ORDER BY time_to_answer
              """

# Run the query and load the rows into a pandas DataFrame
first_result = client.query(first_query).result().to_dataframe()

# Share of rows whose time_to_answer is not null, expressed as a percentage
answered_flags = first_result["time_to_answer"].notnull()
question_count = len(first_result)
answered_pct = sum(answered_flags) / question_count * 100

print("Percentage of answered questions: %s%%" % (answered_pct,))
print("Number of questions:", question_count)
first_result.head()

# Percentage of answered questions: 100.0%
# Number of questions: 134526
    
# 	q_id	time_to_answer
# 0	48369687	0
# 1	48350506	0
# 2	48054907	0
# 3	48118024	0
# 4	48123689	0

In [1]:
# You're surprised at the results and strongly suspect that something is
# wrong with your query. In particular:
#  - According to the query, 100% of the questions from January 2018
#    received an answer. But you know that ~80% of the questions on the
#    site usually receive an answer.
#  - The total number of questions is surprisingly low. You expected to see
#    at least 150,000 questions represented in the table.
# Given these observations, you think that the type of JOIN you have chosen
# has inadvertently excluded unanswered questions. 
# Using the code cell below, can you figure out what type of JOIN to use 
# to fix the problem so that the table includes unanswered questions?

# Note: You need only amend the type of JOIN (i.e., INNER, LEFT, RIGHT, or 
# FULL) to answer the question successfully.

# 8.1.2.2. Initial questions and answers, Part 1

In [None]:
# You're interested in understanding the initial experiences that users 
# typically have with the Stack Overflow website. 
# Is it more common for users to first ask questions or provide answers? 
# After signing up, how long does it take for users to first interact with 
# the website? To explore this further, you draft the (partial) query in 
# the code cell below.

# The query returns a table with three columns:
#  - owner_user_id - the user ID
#  - q_creation_date - the first time the user asked a question
#  - a_creation_date - the first time the user contributed an answer

In [None]:
# For each user, the earliest question and the earliest answer created in
# January 2019 (MIN of creation_date per owner_user_id).
# NOTE(review): the WHERE clause filters on both q.creation_date and
# a.creation_date, so rows where either side is NULL are rejected. The
# unmatched rows produced by the FULL JOIN are therefore discarded and only
# users having both a question and an answer in January 2019 remain.
q_and_a_query = """
                SELECT q.owner_user_id AS owner_user_id,
                    MIN(q.creation_date) AS q_creation_date,
                    MIN(a.creation_date) AS a_creation_date
                FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
                    FULL JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                ON q.owner_user_id = a.owner_user_id 
                WHERE q.creation_date >= '2019-01-01' AND q.creation_date < '2019-02-01' 
                    AND a.creation_date >= '2019-01-01' AND a.creation_date < '2019-02-01'
                GROUP BY owner_user_id
                """

# owner_user_id	q_creation_date	a_creation_date
# 0	10938008	2019-01-19 16:27:53.953000+00:00	2019-01-19 19:17:06.910000+00:00
# 1	8986917	2019-01-27 18:50:10.753000+00:00	2019-01-18 19:10:26.380000+00:00
# 2	7840155	2019-01-08 09:10:22.257000+00:00	2019-01-19 22:30:48.383000+00:00
# 3	5772095	2019-01-16 11:27:00.510000+00:00	2019-01-16 23:23:56.940000+00:00
# 4	7675437	2019-01-15 10:26:14.527000+00:00	2019-01-01 02:39:23.947000+00:00

# 8.1.2.3.  Initial questions and answers, Part 2

In [None]:
# Write a query that returns the following columns:

# id - the IDs of all users who created Stack Overflow accounts in January
# 2019 (January 1, 2019, to January 31, 2019, inclusive)
# q_creation_date - the first time the user posted a question on the site;
# if the user has never posted a question, the value should be null
# a_creation_date - the first time the user posted an answer on the site;
# if the user has never posted an answer, the value should be null

In [None]:
# For each user whose account was created in January 2019: the earliest
# question and the earliest answer they posted (NULL when there is none).
# The users table is the left side of both LEFT JOINs, so users without any
# answer and/or question are still returned.
# NOTE(review): joining answers and questions on the same user id pairs every
# answer with every question of that user before the GROUP BY; the MIN values
# are unaffected, but the intermediate result can be large.
three_tables_query = """
                     SELECT u.id AS id,
                         MIN(q.creation_date) AS q_creation_date,
                         MIN(a.creation_date) AS a_creation_date
                     FROM `bigquery-public-data.stackoverflow.users` AS u
                         LEFT JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                             ON u.id = a.owner_user_id
                         LEFT JOIN `bigquery-public-data.stackoverflow.posts_questions` AS q
                             ON q.owner_user_id = u.id
                     WHERE u.creation_date >= '2019-01-01' and u.creation_date < '2019-02-01'
                     GROUP BY id
                     """

# 	id	q_creation_date	a_creation_date
# 0	10991255	2019-01-30 15:49:51.077000+00:00	2019-02-17 11:03:45.440000+00:00
# 1	10947844	NaT	NaT
# 2	10925137	NaT	NaT
# 3	10984435	NaT	NaT
# 4	10991739	NaT	NaT

# 8.1.2.4.  How many distinct users posted on January 1, 2019?

In [None]:
# In the code cell below, write a query that returns a table with a single 
# column:
# - owner_user_id - the IDs of all users who posted at least one question 
#   or answer on January 1, 2019. Each user ID should appear at most once.

# In the posts_questions (and posts_answers) tables, you can get the ID of 
# the original poster from the owner_user_id column. Likewise, the date of 
# the original posting can be found in the creation_date column.

# In order for your answer to be marked correct, your query must use a 
# UNION.

In [None]:
# IDs of the users who posted at least one question or answer on
# January 1, 2019. Each SELECT returns owner_user_id for one table on that
# date; UNION DISTINCT merges both lists so every ID appears at most once.
all_users_query = """
                  SELECT q.owner_user_id 
                  FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
                  WHERE EXTRACT(DATE FROM q.creation_date) = '2019-01-01'
                  UNION DISTINCT
                  SELECT a.owner_user_id
                  FROM `bigquery-public-data.stackoverflow.posts_answers` AS a
                  WHERE EXTRACT(DATE FROM a.creation_date) = '2019-01-01'
                  """

# owner_user_id
# 0	8607740.0
# 1	104080.0
# 2	7585847.0
# 3	1621517.0
# 4	44852.0