In [1]:
from google.cloud import bigquery

https://cloud.google.com/docs/authentication/getting-started

The credentials file `SQL Kaggle-4668a0b2f445.json` provides access to cloud resources. Point the environment variable below at its path:

export GOOGLE_APPLICATION_CREDENTIALS="[PATH]"


In [2]:
import os

In [3]:
# Fallback key file for this machine. It is only used when
# GOOGLE_APPLICATION_CREDENTIALS has not already been provided, e.g. through
# `export GOOGLE_APPLICATION_CREDENTIALS=...` in the shell that started Jupyter.
DEFAULT_CREDENTIALS_PATH = "/Users/pavannaik/Desktop/SQL/sql-kaggle-244915-fceb1d224fa9.json"

# Do not clobber a value supplied by the environment: overwriting it would
# silently force every user of this notebook onto a machine-specific path.
if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = DEFAULT_CREDENTIALS_PATH

In [4]:
# Create the BigQuery client. No credentials are passed explicitly; the
# GOOGLE_APPLICATION_CREDENTIALS variable set earlier is expected to supply them.
client = bigquery.Client()

#### Construct a reference to the "hacker_news" dataset

In [5]:
# Build the dataset reference with the DatasetReference constructor
# (project first, then dataset id). Client.dataset() is deprecated in
# google-cloud-bigquery and returns this same kind of object.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

#### API request - fetch the dataset

In [6]:
# API request: fetch the dataset that dataset_ref points to.
dataset = client.get_dataset(dataset_ref)

#### List all the tables in the "hacker_news" dataset

In [7]:
# List the tables in the dataset; list() materializes the result of
# list_tables() so it can be reused by later cells.
tables = list(client.list_tables(dataset))

In [8]:
# Print the ID of every table in the dataset.
# The loop variable is deliberately not called `table`: that name is bound to
# the fetched "full" table elsewhere in this notebook, and re-running this cell
# would otherwise overwrite it with a table-list item.
for table_item in tables:
    print(table_item.table_id)

comments
full
full_201510
stories


#### Construct a reference to the "full" table

In [9]:
# Reference to the "full" table inside the dataset referenced by dataset_ref.
table_ref = dataset_ref.table("full")

#### API request - fetch the table

In [10]:
# Fetch the table that table_ref points to; later cells read its schema and rows.
table = client.get_table(table_ref)

In [11]:
# Show the schema of the "full" table in the "hacker_news" dataset
# (one SchemaField per column: name, type, mode, description).
table.schema

[SchemaField('by', 'STRING', 'NULLABLE', "The username of the item's author.", ()),
 SchemaField('score', 'INTEGER', 'NULLABLE', 'Story score', ()),
 SchemaField('time', 'INTEGER', 'NULLABLE', 'Unix time', ()),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', 'Timestamp for the unix time', ()),
 SchemaField('title', 'STRING', 'NULLABLE', 'Story title', ()),
 SchemaField('type', 'STRING', 'NULLABLE', 'Type of details (comment, comment_ranking, poll, story, job, pollopt)', ()),
 SchemaField('url', 'STRING', 'NULLABLE', 'Story url', ()),
 SchemaField('text', 'STRING', 'NULLABLE', 'Story or comment text', ()),
 SchemaField('parent', 'INTEGER', 'NULLABLE', 'Parent comment ID', ()),
 SchemaField('deleted', 'BOOLEAN', 'NULLABLE', 'Is deleted?', ()),
 SchemaField('dead', 'BOOLEAN', 'NULLABLE', 'Is dead?', ()),
 SchemaField('descendants', 'INTEGER', 'NULLABLE', 'Number of story or poll descendants', ()),
 SchemaField('id', 'INTEGER', 'NULLABLE', "The item's unique id.", ()),
 SchemaField('ran

In [12]:
# Preview the first five rows of the "full" table as a pandas DataFrame
client.list_rows(table, max_results=5).to_dataframe()

Unnamed: 0,by,score,time,timestamp,title,type,url,text,parent,deleted,dead,descendants,id,ranking
0,ars,,1368215024,2013-05-10 19:43:44+00:00,,comment,,&#62; GIMP doesn't give you a realtime boundar...,5686268.0,,,,5688276,
1,JDGM,,1366452883,2013-04-20 10:14:43+00:00,,comment,,"Indeed, I really enjoyed that bit. I hope Rain...",5580933.0,,,,5580948,
2,sandworm101,,1481672798,2016-12-13 23:46:38+00:00,,comment,,&gt;&gt; communication with other human driver...,13170930.0,,,,13172442,
3,ComputerGuru,2.0,1363612589,2013-03-18 13:16:29+00:00,Tell PG: Browsers with old cookies not logging in,story,,"Since last night (~1am CST, perhaps) on browse...",,,,1.0,5393903,
4,selmnoo,,1385556547,2013-11-27 12:49:07+00:00,,comment,,I think what OP is trying to get at is that pr...,6807484.0,,,,6807747,


In [13]:
# Preview the first 10 rows, restricted to the first schema field
# (table.schema[:1], i.e. the `by` column)
client.list_rows(table, selected_fields=table.schema[:1], max_results=10).to_dataframe()

Unnamed: 0,by
0,ars
1,JDGM
2,sandworm101
3,ComputerGuru
4,selmnoo
5,notemine
6,whacker
7,mmahemoff
8,btbuildem
9,lsdafjklsd


In [14]:
# Parents that received more than 10 comments, with the comment count.
# COUNT(id) is not aliased here, so the result column gets an auto-generated
# name (it shows up as `f0_` in the output).
query_popular = """
                SELECT parent, COUNT(id)
                FROM `bigquery-public-data.hacker_news.comments`
                GROUP BY parent
                HAVING COUNT(id) > 10
                """

In [15]:
# Job configuration that limits a query to at most 10**10 bytes billed.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)

In [16]:
# Submit the popular-comments query with the billing cap applied.
query_job = client.query(query_popular, job_config=safe_config)

In [17]:
# Collect the query results into a pandas DataFrame.
popular_comments = query_job.to_dataframe()

In [46]:
# Peek at the first rows; the unaliased count column appears as `f0_`.
popular_comments.head()

Unnamed: 0,parent,f0_
0,7703585,45
1,915945,41
2,9744471,51
3,4631362,43
4,5154415,39


In [42]:
# Same aggregation as query_popular, but counting rows with COUNT(1) and
# giving the count column a readable name (NumPosts) through an alias.
query_improved = """
                 SELECT parent, COUNT(1) AS NumPosts
                 FROM `bigquery-public-data.hacker_news.comments`
                 GROUP BY parent
                 HAVING COUNT(1) > 10
                 """

In [43]:
# Recreate the 10**10-byte billing cap so this cell does not depend on the
# earlier config cell having been run.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
# Submit the aliased query under that cap.
query_job1 = client.query(query_improved, job_config=safe_config)

In [44]:
# Collect the results of the aliased query into a pandas DataFrame.
improved_df = query_job1.to_dataframe()

In [45]:
# Peek at the first rows; the count column now carries the NumPosts alias.
improved_df.head()

Unnamed: 0,parent,NumPosts
0,3873271,50
1,5308611,63
2,5176140,63
3,9269660,51
4,9439286,69


In [49]:
# Query to select prolific commenters (authors with more than 10,000 comments)
# and their post counts
prolific_commenters_query = """
                            SELECT author, COUNT(1) AS NumPosts
                            FROM `bigquery-public-data.hacker_news.comments`
                            GROUP BY author
                            HAVING COUNT(1) > 10000"""


# Note: this cell uses a 10**9-byte billing cap, lower than the 10**10 used
# for the earlier queries.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**9)
# Note: this rebinds `query_job`, which earlier held the popular-comments job.
query_job = client.query(prolific_commenters_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
prolific_commenters = query_job.to_dataframe()

# View top few rows of results.
# NOTE(review): comments with a NULL author are grouped together and appear as
# a `None` row in the output; add `WHERE author IS NOT NULL` to the query if
# only real users are wanted — confirm intent.
print(prolific_commenters.head())

         author  NumPosts
0  dragonwriter     10723
1          None    227736
2           eru     10448
3       rbanffy     10557
4         DanBC     12902


In [51]:
#How many comments have been deleted? (If a comment was deleted, the `deleted` column in the 
#comments table will have the value `True`.)

# Count the comments whose `deleted` flag is TRUE; the single result column is
# named NumDeletedPosts.
deleted_query = """
                SELECT COUNT(1) AS NumDeletedPosts
                FROM `bigquery-public-data.hacker_news.comments`
                WHERE deleted = TRUE
                """

# Limit this query to at most 10**9 bytes billed.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed = 10**9)
query_job_del = client.query(deleted_query, job_config = safe_config)

# Run the query and collect the count into a pandas DataFrame.
num_deleted_posts = query_job_del.to_dataframe()

print(num_deleted_posts.head())

   NumDeletedPosts
0           227736
