# 7. INTRODUCTION TO SQL

# 7.6. JOINING DATA

# 7.6.1. LESSON

## 7.6.1.1. JOIN

In [None]:
# Retrieve information from several tables with a single query.
#
# INNER JOIN keeps only the rows that have a match in both tables; the ON
# clause states which columns must be equal for two rows to be paired.
# Table names are wrapped in backticks: single quotes would turn them into
# plain string literals instead of table identifiers.
# NOTE(review): pet_records is an illustrative dataset; the join columns
# (p.ID / o.Pet_ID) are assumed from the lesson's example schema - confirm.

query = """
        SELECT p.Name AS Pet_Name, o.Name AS Owner_Name
        FROM `bigquery-public-data.pet_records.pets` AS p
        INNER JOIN `bigquery-public-data.pet_records.owners` AS o
            ON p.ID = o.Pet_ID
        """

In [None]:
# Example: count the files in the GitHub sample for each license.

# The query pairs every row of sample_files with the licenses row that has
# the same repo_name, groups the pairs by license, counts the rows in each
# group, and sorts the licenses from most to fewest files.
query = """
        SELECT L.license, COUNT(1) AS number_of_files
        FROM `bigquery-public-data.github_repos.sample_files` AS sf
        INNER JOIN `bigquery-public-data.github_repos.licenses` AS L 
            ON sf.repo_name = L.repo_name
        GROUP BY L.license
        ORDER BY number_of_files DESC
        """

# Set up the query (cancel the query if it would use too much of
# your quota; maximum_bytes_billed=10**10 sets the limit to 10 GB).
# `bigquery` and `client` are defined outside this section.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(query, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
file_count_by_license = query_job.to_dataframe()

# Show the DataFrame (a bare expression on the last line of a notebook cell
# is displayed as the cell's output).
file_count_by_license

# 	license	number_of_files
# 0	mit	20432437
# 1	gpl-2.0	16866825
# 2	apache-2.0	7114007
# 3	gpl-3.0	4936479
# 4	bsd-3-clause	2926576
# 5	agpl-3.0	1293773
# 6	lgpl-2.1	793054
# 7	bsd-2-clause	694767
# 8	lgpl-3.0	564433
# 9	mpl-2.0	473078
# 10	cc0-1.0	405925
# 11	epl-1.0	320203
# 12	unlicense	208912
# 13	artistic-2.0	148414
# 14	isc	117709

# 7.6.2. EXERCISES

## 7.6.2.1. Explore the data

In [None]:
# Get a list of available tables.
# `client` and `dataset` are defined outside this section.
# list_tables() is wrapped in list() so every table entry is collected
# before the names are extracted.
tables = list(client.list_tables(dataset))
# Keep only each table's identifier (its name within the dataset).
list_of_tables = [table.table_id for table in tables] 

# Print your answer
print(list_of_tables)

# ['badges', 'comments', 'post_history', 'post_links', 'posts_answers', 
#  'posts_moderator_nomination', 'posts_orphaned_tag_wiki', 
#  'posts_privilege_wiki', 'posts_questions', 'posts_tag_wiki', 
#  'posts_tag_wiki_excerpt', 'posts_wiki_placeholder', 'stackoverflow_posts', 
#  'tags', 'users', 'votes']

## 7.6.2.2. Review relevant tables

In [None]:
# If you are interested in people who answer questions on a given topic, 
# the posts_answers table is a natural place to look. 

# Construct a reference to the "posts_answers" table
# (`dataset_ref` is defined outside this section).
answers_table_ref = dataset_ref.table("posts_answers")

# API request - fetch the table
answers_table = client.get_table(answers_table_ref)

# Preview the first five lines of the "posts_answers" table.
# As the last expression in the cell, the DataFrame is shown as its output.
client.list_rows(answers_table, max_results=5).to_dataframe()

In [None]:
# Look at posts_questions: the answers are later joined to this table.

# Construct a reference to the "posts_questions" table
# (`dataset_ref` is defined outside this section).
questions_table_ref = dataset_ref.table("posts_questions")

# API request - fetch the table
questions_table = client.get_table(questions_table_ref)

# Preview the first five lines of the "posts_questions" table.
# As the last expression in the cell, the DataFrame is shown as its output.
client.list_rows(questions_table, max_results=5).to_dataframe()

## 7.6.2.3. Selecting the right questions

In [None]:
# Write a query that selects the id, title and owner_user_id columns from 
# the posts_questions table.

# Restrict the results to rows that contain the word "bigquery" in the tags
# column.
# Include rows where there is other text in addition to the word "bigquery"
# (e.g., if a row has a tag "bigquery-sql", your results should include 
# that too).

# Answer: the % wildcards on both sides of "bigquery" let LIKE match the
# word anywhere inside the tags value.
questions_query =  """
                  SELECT id, title, owner_user_id
                  FROM `bigquery-public-data.stackoverflow.posts_questions`
                  WHERE tags LIKE '%bigquery%'
                  """

# Set up the query (cancel the query if it would use too much of 
# your quota). The limit is 10**10 bytes, i.e. 10 GB - not 1 GB.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
questions_query_job = client.query(questions_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
questions_results = questions_query_job.to_dataframe()

# Preview results
print(questions_results.head())

#          id                                              title  owner_user_id
# 0   4702789        How to Analyze and Query big chunks of data       435951.0
# 1  63628724  Will the Firebase BigQuery export still work i...      7590610.0
# 2  63758052  Scheduled query stopped writing to table in Bi...     13961038.0
# 3  63716014  Inconsistent delay in firebase analytics event...       858350.0
# 4  63604155       Dynamic Project names in BigQuery Procedures     10739327.0

## 7.6.2.4. Your first join

In [None]:
# Write a query that returns the id, body and owner_user_id columns from 
# the posts_answers table for answers to "bigquery"-related questions.

# You should have one row in your results for each answer to a question 
# that has "bigquery" in the tags.
# Remember you can get the tags for a question from the tags column in the 
# posts_questions table.

# Answer: each answer is matched to its question through
# q.id = a.parent_id, and the WHERE clause filters on the question's tags.
answers_query = """
                SELECT a.id, a.body, a.owner_user_id
                FROM `bigquery-public-data.stackoverflow.posts_questions` AS q 
                INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                    ON q.id = a.parent_id
                WHERE q.tags LIKE '%bigquery%'
                """

# Set up the query (cancel the query if it would use too much of 
# your quota). The limit is 10**10 bytes, i.e. 10 GB - not 1 GB.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
answers_query_job = client.query(answers_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
answers_results = answers_query_job.to_dataframe()

# Preview results
print(answers_results.head())

#          id                                               body  owner_user_id
# 0  61365135  <p>import os\nfrom google.cloud.bigquery.clien...      6016731.0
# 1  61366305  <p>As other answers were to hard for me I made...     10187000.0
# 2  61389343  <p>I decided to poll with the command line gcl...      4355203.0
# 3  61394597  <p>I did as SANN3 taught me, like a following:...      1332841.0
# 4  61402047  <p>You can pass the schema explicitly and set ...      4334376.0

## 7.6.2.5. Answer the question

In [None]:
# Write a new query that has a single row for each user who answered at 
# least one question with a tag that includes the string "bigquery". 
# Your results should have two columns:

# user_id - contains the owner_user_id column from the posts_answers table
# number_of_answers - contains the number of answers the user has written 
# to "bigquery"-related questions

# Answer: same join and filter as the previous query, but the rows are
# grouped by the answer's author and counted, giving one row per user.
bigquery_experts_query = """
                         SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
                         FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
                         INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                             ON q.id = a.parent_Id
                         WHERE q.tags LIKE '%bigquery%'
                         GROUP BY a.owner_user_id
                         """

# Set up the query (cancel the query if it would use too much of 
# your quota). The limit is 10**10 bytes, i.e. 10 GB - not 1 GB.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
bigquery_experts_query_job = client.query(bigquery_experts_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
bigquery_experts_results = bigquery_experts_query_job.to_dataframe()

# Preview results
print(bigquery_experts_results.head())

#      user_id  number_of_answers
# 0  4502697.0                  1
# 1  2514521.0                  1
# 2  7041871.0                  5
# 3  2877278.0                260
# 4  4821960.0                  1

## 7.6.2.6. Building a more generally useful service

In [None]:
# How could you convert what you've done to a general function a website 
# could call on the backend to get experts on any topic?

def expert_finder(topic, client):
    '''
    Returns a DataFrame with the user IDs who have written Stack Overflow answers on a topic.

    Inputs:
        topic: A string with the topic of interest. It is matched as a
            substring of the question's tags column, so "bigquery" also
            matches a tag such as "bigquery-sql".
        client: A Client object that specifies the connection to the Stack Overflow dataset

    Outputs:
        results: A DataFrame with columns for user_id and number_of_answers,
            one row per user who answered a question whose tags contain
            the topic.
    '''
    # The topic is passed as a named query parameter (@topic) instead of
    # being pasted into the SQL text. A "{topic}" placeholder inside a plain
    # (non-f) string is never substituted, so it would be sent literally,
    # and string-building would let the caller's text alter the SQL.
    my_query = """
               SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
               FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
               INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                   ON q.id = a.parent_Id
               WHERE q.tags LIKE CONCAT('%', @topic, '%')
               GROUP BY a.owner_user_id
               """

    # Set up the query: bind @topic and cap the bytes billed at 10**10
    # (10 GB). A real service would have good error handling for queries
    # that scan too much data.
    safe_config = bigquery.QueryJobConfig(
        maximum_bytes_billed=10**10,
        query_parameters=[
            bigquery.ScalarQueryParameter("topic", "STRING", topic),
        ],
    )
    my_query_job = client.query(my_query, job_config=safe_config)

    # API request - run the query, and return a pandas DataFrame
    results = my_query_job.to_dataframe()

    return results