In [6]:
from google.cloud import bigquery
from google.oauth2 import service_account

# `client` is a custom helper function defined in bq_sa_auth.py; calling it returns an authorized BigQuery client object with the right credentials.
# Because `client` is a function rather than a client object, we must write client() (with parentheses) every time we need the client.

from bq_sa_auth import client

## **Getting Started With SQL and BigQuery**
#### *Learn the workflow for handling big datasets with BigQuery and SQL*
------------------------------------------------


### The client object plays a central role in retrieving information from BigQuery public datasets.

In [8]:
# Construct a reference to the "hacker_news" dataset in the public-data project.
# DatasetReference is built directly because Client.dataset() is deprecated;
# a reference is only an identifier, so no client is needed to create it.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

# API request - fetch the dataset using an authorized client
dataset = client().get_dataset(dataset_ref)

#### Every dataset is just a collection of tables. You can think of a dataset as a spreadsheet file containing multiple tables, all composed of rows and columns.

#### We use the `list_tables()` method to list the tables in the dataset.

In [9]:
# List all the tables in the "hacker_news" dataset
tables = list(client().list_tables(dataset))

# Print the name of every table in the dataset (the count is whatever the
# dataset currently holds, so it is not hard-coded here). The loop variable is
# called `table_item` so it does not take over the name `table`, which this
# notebook uses for the fetched table object.
for table_item in tables:
    print(table_item.table_id)

full


#### The structure of a table is called its schema. We need to understand a table's schema to effectively pull out the data we want.

#### In this example, we'll investigate the `full` table that we listed above by reading its `schema` attribute.

In [6]:

# Point at the "full" table inside the dataset referenced by dataset_ref
table_ref = bigquery.TableReference(dataset_ref, "full")

# API request - retrieve the table itself
table = client().get_table(table_ref)

# Show the schema: the column descriptions of the "full" table in "hacker_news"
table.schema

[SchemaField('title', 'STRING', 'NULLABLE', None, 'Story title', (), None),
 SchemaField('url', 'STRING', 'NULLABLE', None, 'Story url', (), None),
 SchemaField('text', 'STRING', 'NULLABLE', None, 'Story or comment text', (), None),
 SchemaField('dead', 'BOOLEAN', 'NULLABLE', None, 'Is dead?', (), None),
 SchemaField('by', 'STRING', 'NULLABLE', None, "The username of the item's author.", (), None),
 SchemaField('score', 'INTEGER', 'NULLABLE', None, 'Story score', (), None),
 SchemaField('time', 'INTEGER', 'NULLABLE', None, 'Unix time', (), None),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', None, 'Timestamp for the unix time', (), None),
 SchemaField('type', 'STRING', 'NULLABLE', None, 'type of details (comment comment_ranking poll story job pollopt)', (), None),
 SchemaField('id', 'INTEGER', 'NULLABLE', None, "The item's unique id.", (), None),
 SchemaField('parent', 'INTEGER', 'NULLABLE', None, 'Parent comment ID', (), None),
 SchemaField('descendants', 'INTEGER', 'NULLABLE', N

#### Each SchemaField tells us about a specific column (which we also refer to as a field). The main pieces of information, in the order they appear, are:

- The name of the column
- The field type (or datatype) in the column
- The mode of the column ('NULLABLE' means that a column allows NULL values, and is the default)
- A description of the data in that column

#### We can use the `list_rows()` method to check just the first five rows of the `full` table to make sure this is right. (Sometimes databases have outdated descriptions, so it's good to check.) This returns a BigQuery RowIterator object that can quickly be converted to a pandas DataFrame with the `to_dataframe()` method.

In [7]:
# Request at most five rows of the "full" table; list_rows() hands back a row
# iterator rather than the data itself
first_rows = client().list_rows(table, max_results=5)

# Turn the iterator into a pandas DataFrame; as the last expression of the
# cell, it is what the notebook displays
first_rows.to_dataframe()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,"If the crocodile looked him up on Google, we b...",,raxxorrax,,1633421535,2021-10-05 08:12:15+00:00,comment,28756662,28750122,,,
1,,,What exactly are you looking for? I think Pyto...,,abiro,,1569141387,2019-09-22 08:36:27+00:00,comment,21040311,21040141,,,
2,,,"Ironically, this very project might help out w...",,mjevans,,1505769703,2017-09-18 21:21:43+00:00,comment,15279716,15276626,,,
3,,,As you start to gain some experience it can be...,,every_other,,1538575027,2018-10-03 13:57:07+00:00,comment,18130207,18128477,,,
4,,,"That’s what I was referring to, yes. I heard o...",,manmal,,1615664155,2021-03-13 19:35:55+00:00,comment,26449260,26449237,,,


#### The `list_rows()` method will also let us look at just the information in a specific column. If we want to see the first five entries in the `by` column, for example, we can do that!

In [8]:
# Preview the first five entries in the "by" column of the "full" table.
# table.schema is a list of SchemaFields; we pick the field by its name instead
# of by a positional slice such as table.schema[4:5], so the preview keeps
# working even if the column order of the table changes. To include more
# columns, extend the condition (e.g. `field.name in {"by", "score"}`).
by_fields = [field for field in table.schema if field.name == "by"]

# Fail loudly if the column is missing rather than sending an empty selection
assert by_fields, 'column "by" not found in the schema of the "full" table'

client().list_rows(table, selected_fields=by_fields, max_results=5).to_dataframe()

Unnamed: 0,by
0,raxxorrax
1,abiro
2,mjevans
3,every_other
4,manmal
