# 7. INTRODUCTION TO SQL

# 7.3. GROUP BY, HAVING & COUNT

# 7.3.1. COURS

In [None]:
from google.cloud import bigquery

# Create a "Client" object; every API request below goes through it
client = bigquery.Client()

# Construct a reference to the "hacker_news" dataset in the
# "bigquery-public-data" project. DatasetReference is used because
# Client.dataset() is deprecated in the google-cloud-bigquery library.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the "comments" table
table_ref = dataset_ref.table("comments")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five lines of the "comments" table
client.list_rows(table, max_results=5).to_dataframe()

## 7.3.1.1. COUNT()

In [2]:
# COUNT() counts how many values there are (here, entries in the ID column).
# It is an example of an aggregate function, which takes many values
# and returns one.
# (Other examples of aggregate functions include SUM(), AVG(), MIN(),
# and MAX().)

# The table path is wrapped in backticks (single quotes would make it a
# string literal), and the project id is spelled "bigquery-public-data".
query = """
        SELECT COUNT(ID)
        FROM `bigquery-public-data.pet_records.pets`
        """

 ## 7.3.1.2. GROUP BY

In [3]:
# GROUP BY takes the name of one or more columns, and treats all rows with
# the same value in that column as a single group when you apply aggregate
# functions like COUNT().

# The table path is wrapped in backticks (single quotes would make it a
# string literal), and the project id is spelled "bigquery-public-data".
query = """
        SELECT Animal, COUNT(ID)
        FROM `bigquery-public-data.pet_records.pets`
        GROUP BY Animal
        """

 ## 7.3.1.3. GROUP BY ... HAVING

In [None]:
# HAVING is used in combination with GROUP BY to ignore groups that don't
# meet certain criteria. Here only the Animal groups containing more than
# one ID are kept.

# The table path is wrapped in backticks (single quotes would make it a
# string literal), and the project id is spelled "bigquery-public-data".
query = """
        SELECT Animal, COUNT(ID)
        FROM `bigquery-public-data.pet_records.pets`
        GROUP BY Animal
        HAVING COUNT(ID) > 1
        """

In [None]:
# Query to select comments that received more than 10 replies.
# Rows are grouped by "parent", so COUNT(id) is the number of comments that
# share the same parent; HAVING keeps only the groups whose count exceeds 10.
# The count column is not given an alias in this version of the query.
query_popular = """
                SELECT parent, COUNT(id)
                FROM `bigquery-public-data.hacker_news.comments`
                GROUP BY parent
                HAVING COUNT(id) > 10
                """

In [4]:
# Run the prepared query and load its results into a pandas DataFrame.

# Cap the bytes billed at 10 GB (10**10 bytes) so the query is cancelled
# if it would use too much of your quota.
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 10**10

# Submit the query with that cap applied
query_job = client.query(query_popular, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
popular_comments = query_job.to_dataframe()

# Show the first five rows of the DataFrame
popular_comments.head()

# Output (the unaliased count column is named f0_):
#     parent  f0_
# 0  7536283   45
# 1  4053076  242
# 2  2530963   59
# 3  1934367   70
# 4  8204007   43

NameError: name 'bigquery' is not defined

## 7.3.1.4. Aliasing and other improvements

In [None]:
# Same query as before, rewritten for readability: the count column gets
# the alias NumPosts, and COUNT(1) is used in place of COUNT(id).
query_improved = """
                 SELECT parent, COUNT(1) AS NumPosts
                 FROM `bigquery-public-data.hacker_news.comments`
                 GROUP BY parent
                 HAVING COUNT(1) > 10
                 """

# Cap the bytes billed at 10**10 (10 GB) before submitting the query
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 10**10
query_job = client.query(query_improved, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
improved_df = query_job.to_dataframe()

# Show the first five rows of the DataFrame
improved_df.head()

# Output:
#     parent  NumPosts
# 0  6427895        46
# 1   202918        44
# 2  8120079       148
# 3  9016949        38
# 4  7075537        51

# 7.3.2. EXERCICES

In [None]:
from google.cloud import bigquery

# Create a "Client" object; every API request below goes through it
client = bigquery.Client()

# Construct a reference to the "hacker_news" dataset in the
# "bigquery-public-data" project. DatasetReference is used because
# Client.dataset() is deprecated in the google-cloud-bigquery library.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the "comments" table
table_ref = dataset_ref.table("comments")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five lines of the "comments" table
client.list_rows(table, max_results=5).to_dataframe()

In [None]:
# 	id	by	author	time	time_ts	text	parent	deleted	dead	ranking
# 0	2701393	5l	5l	1309184881	2011-06-27 14:28:01+00:00	And the glazier who fixed all the broken windo...	2701243	None	None	0
# 1	5811403	99	99	1370234048	2013-06-03 04:34:08+00:00	Does canada have the equivalent of H1B/Green c...	5804452	None	None	0
# 2	21623	AF	AF	1178992400	2007-05-12 17:53:20+00:00	Speaking of Rails, there are other options in ...	21611	None	None	0
# 3	10159727	EA	EA	1441206574	2015-09-02 15:09:34+00:00	Humans and large livestock (and maybe even pet...	10159396	None	None	0
# 4	2988424	Iv	Iv	1315853580	2011-09-12 18:53:00+00:00	I must say I reacted in the same way when I re...	2988179	None	None	0

## 7.3.2.1. Prolific commenters

In [None]:
# Hacker News would like to send awards to everyone who has written more 
# than 10,000 posts. 
# Write a query that returns all authors with more than 10,000 posts as 
# well as their post counts. Call the column with post counts NumPosts.

In [None]:
# Query to select prolific commenters and post counts: rows are grouped by
# author, and only authors with more than 10,000 rows are kept. The count
# column is exposed as NumPosts.
prolific_commenters_query = """
                            SELECT author, COUNT(1) AS NumPosts
                            FROM `bigquery-public-data.hacker_news.comments`
                            GROUP BY author
                            HAVING COUNT(1) > 10000
                            """

# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 10 GB, i.e. 10**10 bytes)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(prolific_commenters_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
prolific_commenters = query_job.to_dataframe()

# View top few rows of results
print(prolific_commenters.head())

In [None]:
#          author  NumPosts
# 0           eru     10448
# 1       rbanffy     10557
# 2  dragonwriter     10723
# 3          None    227736
# 4         DanBC     12902

In [None]:
# Query to determine how many posts were deleted: counts the rows of the
# comments table whose "deleted" column is True.
deleted_posts_query = """
                      SELECT COUNT(1) AS num_deleted_posts
                      FROM `bigquery-public-data.hacker_news.comments`
                      WHERE deleted = True
                      """

# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 10 GB), consistent with the other
# queries in this notebook
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(deleted_posts_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
deleted_posts = query_job.to_dataframe()

# View results
print(deleted_posts)

# Answer recorded by hand from the printed result
num_deleted_posts = 227736

## 7.3.2.2. Deleted comments

In [None]:
# How many comments have been deleted? 
# (If a comment was deleted, the deleted column in the comments table 
# will have the value True.)

In [None]:
# Query to determine how many posts were deleted: counts the rows of the
# comments table whose "deleted" column is True.
deleted_posts_query = """
                      SELECT COUNT(1) AS num_deleted_posts
                      FROM `bigquery-public-data.hacker_news.comments`
                      WHERE deleted = True
                      """

# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 10 GB), consistent with the other
# queries in this notebook
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(deleted_posts_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
deleted_posts = query_job.to_dataframe()

# View results
print(deleted_posts)

# Answer recorded by hand from the printed result shown below
num_deleted_posts = 227736

# Output:
#    num_deleted_posts
# 0             227736