In [2]:
from google.cloud import bigquery
from google.oauth2 import service_account

# We will import a customized function called client which actually returns an authorized bigquery client object with right credentials
# this will cost us an extra pair of () each time we call the client object which is now called by the function client we define in bq_sa_auth.py 

from bq_sa_auth import client

### Keywords: COUNT() function and GROUP BY, HAVING 

#### Refer to the [tutorial](https://www.kaggle.com/code/dansbecker/group-by-having-count) for more details

#### Examples: We will work with HackerNews data set. 

In [3]:
# Construct a reference to the "hacker_news" dataset
dataset_ref = client().dataset("hacker_news", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client().get_dataset(dataset_ref)

# Construct a reference to the "full" table
table_ref = dataset_ref.table("full")

# API request - fetch the table
table = client().get_table(table_ref)

# Preview the first five lines of the "comments" table
client().list_rows(table, max_results=5).to_dataframe()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,"If the crocodile looked him up on Google, we b...",,raxxorrax,,1633421535,2021-10-05 08:12:15+00:00,comment,28756662,28750122,,,
1,,,What exactly are you looking for? I think Pyto...,,abiro,,1569141387,2019-09-22 08:36:27+00:00,comment,21040311,21040141,,,
2,,,"Ironically, this very project might help out w...",,mjevans,,1505769703,2017-09-18 21:21:43+00:00,comment,15279716,15276626,,,
3,,,As you start to gain some experience it can be...,,every_other,,1538575027,2018-10-03 13:57:07+00:00,comment,18130207,18128477,,,
4,,,"That’s what I was referring to, yes. I heard o...",,manmal,,1615664155,2021-03-13 19:35:55+00:00,comment,26449260,26449237,,,


In [4]:
# Lets find which comments gets most replies, id is a unique identifier of the comments and parent indicates the the main thread the comment was replied to

# We added an alies for the Count(ID) column using "as ..."

# If you are unsure about what to put inside COUNT(), you can replace it to COUNT(1) to simply count the number of rows in each group. 

query_popular = """
            SELECT parent, COUNT(1) as Num_Replies
            FROM `bigquery-public-data.hacker_news.full`
            GROUP BY parent
            HAVING COUNT(1) > 20
        """

ONE_GB = 1000*1000*1000

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed = ONE_GB)

query_job = client().query(query_popular, job_config=safe_config)

popular_threads = query_job.to_dataframe()

popular_threads.head()

Unnamed: 0,parent,Num_Replies
0,6186730,96
1,33416498,64
2,24209025,71
3,4130035,54
4,8546617,84


# Exercises

### 1) Prolific commenters

Hacker News would like to send awards to everyone who has written more than 10,000 posts. Write a query that returns all authors with more than 10,000 posts as well as their post counts. Call the column with post counts `NumPosts`.

In [5]:
# Need to add backsticks on the by column because SQL does not differenctiate between caps or non-caps syntax 

query_authors = """
            SELECT `by`, COUNT(1) as NumPosts
            FROM `bigquery-public-data.hacker_news.full`
            GROUP BY `by`
            HAVING COUNT(1) > 10000
        """

ONE_GB = 1000*1000*1000

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed = ONE_GB)

query_job = client().query(query_authors, job_config=safe_config)

pro_authors = query_job.to_dataframe()

pro_authors.head()

Unnamed: 0,by,NumPosts
0,DanBC,23496
1,mc32,10910
2,hinkley,10011
3,,883263
4,agumonkey,16756


### 2) Deleted comments

How many comments have been deleted? (If a comment was deleted, the `deleted` column in the comments table will have the value `True`.)

In [7]:
query_deleted = """
            SELECT COUNT(1) as NumPosts_deleted
            FROM `bigquery-public-data.hacker_news.full`
            WHERE deleted = True
        """

ONE_GB = 1000*1000*1000

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed = ONE_GB)

query_job = client().query(query_deleted, job_config=safe_config)

deleted_posts = query_job.to_dataframe()

deleted_posts.head()

Unnamed: 0,NumPosts_deleted
0,859401
