In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account

# We import a custom function called `client`, which returns an authorized BigQuery client object with the right credentials.
# Because `client` is a function (defined in bq_sa_auth.py) rather than a client instance, every use needs an extra pair of parentheses: client()

from bq_sa_auth import client

## [Joining Data](https://www.kaggle.com/code/dansbecker/joining-data/tutorial)

#### Combine data sources. Critical for almost all real-world data problems

---------------

### Keywords: JOIN or INNER JOIN
#### If the data you want is spread across different tables, you can use JOIN to fuse the relevant columns from different tables and combine them in a single table

#### **Example: How many files are covered by each type of software license?**

GitHub is the most popular place to collaborate on software projects. A GitHub repository (or repo) is a collection of files associated with a specific project.

Most repos on GitHub are shared under a specific legal license, which determines the legal restrictions on how they are used. For our example, we're going to look at how many different files have been released under each license.

We'll work with two tables in the database. The first table is the `licenses` table, which provides the name of each GitHub repo (in the `repo_name` column) and its corresponding license. Here's a view of the first five rows.

In [3]:
# Construct a reference to the "github_repos" dataset.
# Client.dataset() is deprecated in google-cloud-bigquery, so the DatasetReference is built
# directly; it is a purely local object, so no client (and no API call) is needed for this step.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "github_repos")

# API request - fetch the dataset
dataset = client().get_dataset(dataset_ref)

# Construct a reference to the "licenses" table
licenses_ref = dataset_ref.table("licenses")

# API request - fetch the table
licenses_table = client().get_table(licenses_ref)

# Preview the first five rows of the "licenses" table
client().list_rows(licenses_table, max_results=5).to_dataframe()

Unnamed: 0,repo_name,license
0,autarch/Dist-Zilla-Plugin-Test-TidyAll,artistic-2.0
1,thundergnat/Prime-Factor,artistic-2.0
2,kusha-b-k/Turabian_Engin_Fan,artistic-2.0
3,onlinepremiumoutlet/onlinepremiumoutlet.github.io,artistic-2.0
4,huangyuanlove/LiaoBa_Service,artistic-2.0


In [4]:
# Materialise the listing of every table in the github_repos dataset
tables = [*client().list_tables(dataset)]

# Emit one table id per line
for tab in tables:
    print(tab.table_id)

commits
contents
files
languages
licenses
sample_commits
sample_contents
sample_files
sample_repos


### The second table is the `sample_files` table, which provides, among other information, the GitHub repo that each file belongs to (in the `repo_name` column). The first several rows of this table are printed below.

In [6]:
# Point at the "sample_files" table inside the github_repos dataset
files_ref = dataset_ref.table("sample_files")

# Retrieve the table object itself (API request)
files_table = client().get_table(files_ref)

# Show the first five rows of "sample_files" as a DataFrame
client().list_rows(
    files_table,
    max_results=5,
).to_dataframe()

Unnamed: 0,repo_name,ref,path,mode,id,symlink_target
0,EOL/eol,refs/heads/master,generate/vendor/railties,40960,0338c33fb3fda57db9e812ac7de969317cad4959,/usr/share/rails-ruby1.8/railties
1,np/ling,refs/heads/master,tests/success/merger_seq_inferred.t/merger_seq...,40960,dd4bb3d5ecabe5044d3fa5a36e0a9bf7ca878209,../../../fixtures/all/merger_seq_inferred.ll
2,np/ling,refs/heads/master,fixtures/sequence/lettype.ll,40960,8fdf536def2633116d65b92b3b9257bcf06e3e45,../all/lettype.ll
3,np/ling,refs/heads/master,fixtures/failure/wrong_order_seq3.ll,40960,c2509ae1196c4bb79d7e60a3d679488ca4a753e9,../all/wrong_order_seq3.ll
4,np/ling,refs/heads/master,issues/sequence/keep.t,40960,5721de3488fb32745dfc11ec482e5dd0331fecaf,../keep.t


### Next, we write a query that uses information from both tables to determine how many files are released under each license.

In [7]:
# Cap the bytes this query may bill (10**10 bytes) so a mistake cannot become expensive
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10_000_000_000)

# Count files per license by matching each file to the license of its repo
first_join_query = """
                 SELECT l.license, COUNT(1) AS num_files
                 FROM `bigquery-public-data.github_repos.licenses` AS l 
                 INNER JOIN `bigquery-public-data.github_repos.sample_files` AS f
                    ON l.repo_name = f.repo_name
                 GROUP BY l.license
                 ORDER BY num_files DESC
                 """

# Submit the query under the billing cap, then pull the result into a DataFrame
query_job = client().query(first_join_query, job_config=safe_config)
git_licenses = query_job.to_dataframe()

# Display the most common licenses (at most 24 rows)
git_licenses.head(24)

Unnamed: 0,license,num_files
0,mit,20560894
1,gpl-2.0,16608922
2,apache-2.0,7201141
3,gpl-3.0,5107676
4,bsd-3-clause,3465437
5,agpl-3.0,1372100
6,lgpl-2.1,799664
7,bsd-2-clause,692357
8,lgpl-3.0,582277
9,mpl-2.0,457000


### **Exercises : JOIN DATA**

#### Introduction

[Stack Overflow](https://stackoverflow.com/) is a widely beloved question and answer site for technical questions. You'll probably use it yourself as you keep using SQL (or any programming language). 

Their data is publicly available. What cool things do you think it would be useful for?

Here's one idea:
You could set up a service that identifies the Stack Overflow users who have demonstrated expertise with a specific technology by answering related questions about it, so someone could hire those experts for in-depth help.

In this exercise, you'll write the SQL queries that might serve as the foundation for this type of service.

In [9]:
# Construct a reference to the "stackoverflow" dataset.
# Client.dataset() is deprecated in google-cloud-bigquery, so the DatasetReference is built
# directly; it is a purely local object, so no client (and no API call) is needed for this step.
sof_dataset_ref = bigquery.DatasetReference("bigquery-public-data", "stackoverflow")

# API request - fetch the dataset
sof_dataset = client().get_dataset(sof_dataset_ref)

### 1) Explore the data

Before writing queries or **JOIN** clauses, you'll want to see what tables are available. 

In [10]:
# Materialise the listing of every table in the stackoverflow dataset
sof_tables = [*client().list_tables(sof_dataset)]

# Emit one table id per line
for tab in sof_tables:
    print(tab.table_id)

badges
comments
post_history
post_links
posts_answers
posts_moderator_nomination
posts_orphaned_tag_wiki
posts_privilege_wiki
posts_questions
posts_tag_wiki
posts_tag_wiki_excerpt
posts_wiki_placeholder
stackoverflow_posts
tags
users
votes


### 2) Review relevant tables

If you are interested in people who answer questions on a given topic, the `posts_answers` table is a natural place to look. 

In [12]:
# Build a reference to the "posts_answers" table and fetch the table with an API request (get_table)
posts_answers_table = client().get_table(sof_dataset_ref.table("posts_answers"))

# Preview the first five rows of the "posts_answers" table
client().list_rows(posts_answers_table, max_results=5).to_dataframe()

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,18,,<p>For a table like this:</p>\n\n<pre><code>CR...,,,2,NaT,2008-08-01 05:12:44.193000+00:00,,2016-06-02 05:56:26.060000+00:00,2016-06-02 05:56:26.060000+00:00,Jeff Atwood,126039,phpguy,,17,2,59,,
1,165,,"<p>You can use a <a href=""http://sharpdevelop....",,,0,NaT,2008-08-01 18:04:25.023000+00:00,,2019-04-06 14:03:51.080000+00:00,2019-04-06 14:03:51.080000+00:00,,1721793,user2189331,,145,2,10,,
2,1028,,<p>The VB code looks something like this:</p>\...,,,0,NaT,2008-08-04 04:58:40.300000+00:00,,2013-02-07 13:22:14.680000+00:00,2013-02-07 13:22:14.680000+00:00,,395659,user2189331,,947,2,8,,
3,1073,,<p>My first choice would be a dedicated heap t...,,,0,NaT,2008-08-04 07:51:02.997000+00:00,,2015-09-01 17:32:32.120000+00:00,2015-09-01 17:32:32.120000+00:00,,45459,user2189331,,1069,2,29,,
4,1260,,<p>I found the answer. all you have to do is a...,,,0,NaT,2008-08-04 14:06:02.863000+00:00,,2016-12-20 08:38:48.867000+00:00,2016-12-20 08:38:48.867000+00:00,,1221571,Jin,,1229,2,1,,


### `parent_id` identifies the question that the corresponding post answers. Let's have a look at the `posts_questions` table.

In [13]:
# Build a reference to the "posts_questions" table and fetch the table with an API request (get_table)
posts_q_table = client().get_table(sof_dataset_ref.table("posts_questions"))

# Preview the first five rows of the "posts_questions" table
client().list_rows(posts_q_table, max_results=5).to_dataframe()

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,320268,Html.ActionLink doesn’t render # properly,<p>When using Html.ActionLink passing a string...,,0,0,NaT,2008-11-26 10:42:37.477000+00:00,0,2009-02-06 20:13:54.370000+00:00,NaT,,,Paulo,,,1,0,asp.net-mvc,390
1,324003,Primitive recursion,<p>how will i define the function 'simplify' ...,,0,0,NaT,2008-11-27 15:12:37.497000+00:00,0,2012-09-25 19:54:40.597000+00:00,2012-09-25 19:54:40.597000+00:00,Marcin,1288.0,,41000.0,,1,0,haskell|lambda|functional-programming|lambda-c...,497
2,390605,While vs. Do While,<p>I've seen both the blocks of code in use se...,390608.0,0,0,NaT,2008-12-24 01:49:54.230000+00:00,2,2008-12-24 03:08:55.897000+00:00,NaT,,,Unkwntech,115.0,,1,0,language-agnostic|loops,11262
3,413246,Protect ASP.NET Source code,<p>Im currently doing some research in how to ...,,0,0,NaT,2009-01-05 14:23:51.040000+00:00,0,2009-03-24 21:30:22.370000+00:00,2009-01-05 14:42:28.257000+00:00,Tom Anderson,13502.0,Velnias,,,1,0,asp.net|deployment|obfuscation,4823
4,454921,"Difference between ""int[] myArray"" and ""int my...",<blockquote>\n <p><strong>Possible Duplicate:...,454928.0,0,0,NaT,2009-01-18 10:22:52.177000+00:00,0,2009-01-18 10:30:50.930000+00:00,2017-05-23 11:49:26.567000+00:00,,-1.0,Evan Fosmark,49701.0,,1,0,java|arrays,798


#### Are there any fields that identify what topic or technology each question is about? If so, how could you find the IDs of users who answered questions about a specific topic?

#### Answer: The `posts_questions` table has a `tags` column, which identifies the specific topics/technologies each question is about. The `posts_answers` table has a `parent_id` column that identifies the question each answer corresponds to. It also has an `owner_user_id` column that holds the ID of the user who wrote the answer. 

### 3) Selecting the right questions

A lot of this data is text. 

We'll explore one last technique in this course which you can apply to this text.

A **WHERE** clause can limit your results to rows with certain text using the **LIKE** feature. For example, to select just the third row of the `pets` table from the tutorial, we could use the query in the picture below.

![](https://storage.googleapis.com/kaggle-media/learn/images/RccsXBr.png) 

You can also use `%` as a "wildcard" for any number of characters. So you can also get the third row with:

```
query = """
        SELECT * 
        FROM `bigquery-public-data.pet_records.pets` 
        WHERE Name LIKE '%ipl%'
        """
```

Try this yourself. Write a query that selects the `id`, `title` and `owner_user_id` columns from the `posts_questions` table. 
- Restrict the results to rows that contain the word "bigquery" in the `tags` column. 
- Include rows where there is other text in addition to the word "bigquery" (e.g., if a row has a tag "bigquery-sql", your results should include that too).

In [14]:
# Cap the bytes this query may bill (10**10 bytes)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10_000_000_000)

# Questions whose tags contain "bigquery" anywhere (so "bigquery-sql" etc. match too)
like_query = """
                 SELECT id, title, owner_user_id
                 FROM `bigquery-public-data.stackoverflow.posts_questions`  
                 WHERE tags LIKE "%bigquery%"
                 """

# Submit the query under the billing cap, then pull the result into a DataFrame
query_job = client().query(like_query, job_config=safe_config)
bquery_questions = query_job.to_dataframe()

# Show the first ten matching questions
bquery_questions.head(10)

Unnamed: 0,id,title,owner_user_id
0,64345717,Loop by array and union looped result in BigQuery,13304769
1,64610766,BigQuery Transfer jobs from S3 stuck pending o...,14549617
2,64383871,How to get sum of values in days intervals usi...,12472644
3,64251948,BigQuery get row above empty column,4572124
4,64323398,SQL: Remove part of string that is in another ...,6089137
5,64342164,Using schema update option in beam.io.writetob...,14376281
6,64535947,How to INSERT data in Nested Table from Nested...,14138192
7,64196655,Bigquery - How to use a previously created col...,14389733
8,64276972,"Query table in Google BigQuery has error ""Acce...",14417836
9,64266729,How update a table in Big Query and store the ...,14382768


### 4) Your first join
Now that you have a query to select questions on any given topic (in this case, you chose "bigquery"), you can find the answers to those questions with a **JOIN**.  

Write a query that returns the `id`, `body` and `owner_user_id` columns from the `posts_answers` table for answers to "bigquery"-related questions. 
- You should have one row in your results for each answer to a question that has "bigquery" in the tags.  
- Remember you can get the tags for a question from the `tags` column in the `posts_questions` table.

Here's a reminder of what a **JOIN** looked like in the tutorial:
```
query = """
        SELECT p.Name AS Pet_Name, o.Name AS Owner_Name
        FROM `bigquery-public-data.pet_records.pets` as p
        INNER JOIN `bigquery-public-data.pet_records.owners` as o 
            ON p.ID = o.Pet_ID
        """
```

It may be useful to scroll up and review the first several rows of the `posts_answers` and `posts_questions` tables.  

In [15]:
# Cap the bytes this query may bill (27*10**10 bytes)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=270_000_000_000)

## Each answer's parent_id holds the id of the question it answers, so the two tables
## are joined on q.id = a.parent_id and filtered on the question's tags
QAjoin_query = """
                 SELECT a.id, a.body, a.owner_user_id
                 
                 FROM `bigquery-public-data.stackoverflow.posts_answers` AS a
                 INNER JOIN `bigquery-public-data.stackoverflow.posts_questions` AS q 
                    ON q.id = a.parent_id
                 WHERE q.tags LIKE "%bigquery%"
                 """

# Submit the query under the billing cap, then pull the result into a DataFrame
query_job = client().query(QAjoin_query, job_config=safe_config)
bquery_answers = query_job.to_dataframe()

# Show the first ten answers to "bigquery"-tagged questions
bquery_answers.head(10)

Unnamed: 0,id,body,owner_user_id
0,23712400,<p>The code seems to be updated since it does ...,1654764
1,23772718,<p>This issue has been acknowledge by Google a...,1311879
2,23783473,<pre><code>def refresh_bq(self):\n credenti...,3599101
3,23956074,"<pre><code>select max_bag as BAG,\n ...",1768423
4,24034704,"<p>In the example you are linking to, the URL ...",3035921
5,24130027,"<p>When you say <code>{""projectId"" =&gt; ""proj...",1392458
6,24255399,<p>We should use other code instead of getReso...,3737020
7,23719147,<p>This error pops up if the <code>WHERE</code...,49485
8,23852673,"<p>I'd rather avoid Joins, as there are more e...",2881671
9,24087466,<p>You could also use an ETL tool that support...,1401986


### 5) Answer the question
You have the merge you need. But you want a list of users who have answered many questions... which requires more work beyond your previous result.

Write a new query that has a single row for each user who answered at least one question with a tag that includes the string "bigquery". Your results should have two columns:
- `user_id` - contains the `owner_user_id` column from the `posts_answers` table
- `number_of_answers` - contains the number of answers the user has written to "bigquery"-related questions

In [17]:
# Cap the bytes this query may bill (27*10**10 bytes)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=270_000_000_000)

# One row per answering user, with the count of their answers to "bigquery"-tagged
# questions, most prolific users first
freqBQ_query = """
                 SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
                 
                 FROM `bigquery-public-data.stackoverflow.posts_answers` AS a
                 INNER JOIN `bigquery-public-data.stackoverflow.posts_questions` AS q 
                    ON q.id = a.parent_id
                 WHERE q.tags LIKE "%bigquery%"
                 GROUP BY user_id
                 ORDER BY number_of_answers DESC
                 """

# Submit the query under the billing cap, then pull the result into a DataFrame
query_job = client().query(freqBQ_query, job_config=safe_config)
bquery_frequsers = query_job.to_dataframe()

# Show the twenty most frequent answerers
bquery_frequsers.head(20)

Unnamed: 0,user_id,number_of_answers
0,5221944,5203
1,1144035,1634
2,132438,898
3,6253347,737
4,1366527,620
5,243782,613
6,13473525,380
7,4490873,275
8,11206202,264
9,2877278,261


### 6) Building a more generally useful service

How could you convert what you've done to a general function a website could call on the backend to get experts on any topic?  

In [18]:
# A function can take the topic as an argument and return the result of the query,
# so a website backend could ask for the experts on any topic.
#
# The topic is handed to BigQuery as a named query parameter (@topic) instead of being
# formatted into the SQL text, so quotes or SQL fragments in the topic cannot alter the query.

def find_expert(topic, cl, max_bytes_billed=27 * 10**10):

    '''
    Rank Stack Overflow users by the number of answers they wrote to questions on a topic.

      Inputs:
              - topic: plain-text topic; a question matches when its `tags` value contains this text
                       (LIKE wildcards `%` and `_` inside the topic are still treated as wildcards)
              - cl: bigquery client object used to run the query against the Stack Overflow dataset
              - max_bytes_billed: upper limit on the bytes the query may bill (default 27*10**10)
      Output:
              - A data frame with user_id and number_of_answers, ordered by number_of_answers descending
    '''

    # @topic is bound through query_parameters below; CONCAT adds the surrounding
    # wildcards on the SQL side so the match is still "tags contains topic".
    sof_query = """
                   SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
                   FROM `bigquery-public-data.stackoverflow.posts_answers` AS a
                   INNER JOIN `bigquery-public-data.stackoverflow.posts_questions` AS q 
                   ON q.id = a.parent_id
                   WHERE q.tags LIKE CONCAT('%', @topic, '%')
                   GROUP BY user_id
                   ORDER BY number_of_answers DESC
                """
    safe_config = bigquery.QueryJobConfig(
        maximum_bytes_billed=max_bytes_billed,
        query_parameters=[bigquery.ScalarQueryParameter("topic", "STRING", topic)],
    )

    my_query_job = cl.query(sof_query, job_config=safe_config)

    return my_query_job.to_dataframe()

In [19]:
# Top five users by number of answers to questions whose tags contain "tensorflow"
find_expert(topic='tensorflow', cl=client()).head()

Unnamed: 0,user_id,number_of_answers
0,3574081,1063
1,9657861,807
2,1782792,730
3,2097240,596
4,10798917,566
