In [1]:
from time import perf_counter, time

import yaml
from google.cloud import bigquery


In [3]:
# Read local settings from credentials.yml; it must provide "credentials_path",
# which is handed to the BigQuery client as a service-account JSON file.
# The `with` block closes the file handle once parsing is done.
# NOTE(review): yaml.Loader can construct arbitrary Python objects. That is fine
# for a trusted local file, but consider yaml.safe_load if the file only holds
# plain mappings and scalars.
with open("credentials.yml") as config_file:
    config = yaml.load(config_file, yaml.Loader)
client = bigquery.Client.from_service_account_json(config["credentials_path"])


In [2]:
def show_amount_of_data_scanned(client, query):
    """Print how much data ``query`` would process, in GB, without running it.

    Args:
        client: BigQuery client used to submit the dry-run job.
        query: SQL text whose scan size should be estimated.
    """
    # A dry-run job reports the bytes it would process but executes nothing.
    estimate_job = client.query(
        query,
        job_config=bigquery.QueryJobConfig(dry_run=True),
    )
    # Bytes -> decimal gigabytes, rounded to three places for display.
    gigabytes = round(estimate_job.total_bytes_processed / 10**9, 3)
    print("Data processed: {} GB".format(gigabytes))


def show_time_to_run(client, query):
    """Run ``query`` with the result cache disabled and print the elapsed seconds.

    Args:
        client: BigQuery client used to run the query.
        query: SQL text to execute.
    """
    # Bypass cached results so each call measures a real execution.
    time_config = bigquery.QueryJobConfig(use_query_cache=False)
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way wall-clock time() can.
    start = perf_counter()
    # Only the waiting matters here; the returned rows are intentionally discarded.
    client.query(query, job_config=time_config).result()
    elapsed = perf_counter() - start
    print("Time to run: {} seconds".format(round(elapsed, 3)))


In [4]:
# Two queries over the same table: every column versus only two of them.
star_query = "SELECT * FROM `bigquery-public-data.github_repos.contents`"
basic_query = "SELECT size, binary FROM `bigquery-public-data.github_repos.contents`"

# Report the estimated scan size of each, SELECT * first.
show_amount_of_data_scanned(client, star_query)
show_amount_of_data_scanned(client, basic_query)


Data processed: 2682.118 GB
Data processed: 2.531 GB


In [6]:
# Reference to the public GitHub dataset that both tables below live in.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "github_repos")

# commits: build the table reference, then fetch the table through the client.
commits_ref = dataset_ref.table("commits")
commits_table = client.get_table(commits_ref)

# files: same two steps.
files_ref = dataset_ref.table("files")
files_table = client.get_table(files_ref)


In [8]:
# Preview the commits table: request at most five rows and show them as a DataFrame.
commits_preview_rows = client.list_rows(commits_table, max_results=5)
commits_preview_rows.to_dataframe()


Unnamed: 0,commit,tree,parent,author,committer,subject,message,trailer,difference,difference_truncated,repo_name,encoding
0,aa358905a1b12c6fa43b6e877e907fc9d36ff0b9,df3f8bf61bf1cb0dff3a86ebe18671792c2d4f27,[ea230a45a0e97e4d95b5f7fae9ce7ef659b60291],"{'name': 'conda-forge-coordinator', 'email': '...","{'name': 'conda-forge-coordinator', 'email': '...",Updated the qcengine feedstock.,Updated the qcengine feedstock.,[],[],True,[conda-forge/feedstocks],
1,5a6b6d6d29489f8587b247313804d70c45d0981f,ff89accb7e283ca88027d790cdb8bacf373895ab,"[4ee369feb64ee97d71da732da0e78477efbadd76, 6aa...","{'name': 'Rob Allen', 'email': '7e09c9d3e96378...","{'name': 'Rob Allen', 'email': '7e09c9d3e96378...",Merge remote-tracking branch 'weierophinney/ho...,Merge remote-tracking branch 'weierophinney/ho...,[],[],,"[MadCat34/zend-escaper, Maks3w/zend-escaper, z...",
2,6b6ac3b8ab7363b2238a36259c6adb7ba5d31482,915acc1689313e3e233d6f80338acb1629163df9,[f10bea38c15c335eb71469bc0668688d2a8cd9cd],"{'name': 'Zhihui Zhang', 'email': '9693ba60376...","{'name': 'Zhihui Zhang', 'email': '9693ba60376...",provide hook to override ZFS lockfile too,provide hook to override ZFS lockfile too\n\ng...,"[{'key': 'git-svn-id', 'value': 'f2acecaac6fbd...",[],,"[pscedu/slash2-stable, pscedu/pfl]",
3,e26e1f63938b983cea8d6b3229252a20c22ebbda,a86ea389f72e12b8905631dae68a1fd0cb5a859a,[e263e5fcd2ced2e28bacaf63778b88561accd7a2],"{'name': 'conda-forge-coordinator', 'email': '...","{'name': 'conda-forge-coordinator', 'email': '...",Updated the mailchecker feedstock.,Updated the mailchecker feedstock.,[],[],True,[conda-forge/feedstocks],
4,2a896010ccf1c86c24b2fac5cd6b9ed693d5cde6,ba238795c2befc08b1276e60c4402afd9014c417,[721364dbb5d1515f6deed9b132e34091822b0be1],"{'name': 'armaneshaghi', 'email': 'f6cc8d3a4fb...","{'name': 'armaneshaghi', 'email': 'f6cc8d3a4fb...",2014-03-06T04:30,2014-03-06T04:30\n,[],[],,[armaneshaghi/profileLife],


In [9]:
# Preview the files table: request at most five rows and show them as a DataFrame.
files_preview_rows = client.list_rows(files_table, max_results=5)
files_preview_rows.to_dataframe()

Unnamed: 0,repo_name,ref,path,mode,id,symlink_target
0,enzbang/diouzhtu,refs/heads/master,gwiad_wiki_service/scripts/do-install.sh,33261,49365044eed28769152726537f00a93a68988b07,
1,TheMrNomis/Latex-Flavored-Markdown-PHP,refs/heads/master,LFM.php,33261,ef8cb78feed7f21115462b2e230c453ab1b9565a,
2,TheMrNomis/Latex-Flavored-Markdown-PHP,refs/heads/master,PHP/LatexFlavoredMarkdown.php,33261,d989ce59652f57efaad0f73987977dcf088c0041,
3,xurigan/uexJPush,refs/heads/master,EUExJPush/EUExJPush/EUExJPush.h,33261,85268b90caa19efa2b84337279fe9e3bdc963803,
4,xurigan/uexJPush,refs/heads/master,EUExJPush/uexJPush/plugin.xml,33261,e1623bb9d8dc7db605fa4ceb6423662ad43c91b8,


In [None]:
# For a handful of GitHub repositories, count distinct committers and distinct files.
# This version joins commits (one row per unnested repo name) directly against
# files and only aggregates after the join.

slow_query = """
    SELECT COUNT(DISTINCT c.committer.name) AS num_committers,
           COUNT(DISTINCT f.id) AS num_files,
           repo_name
    FROM `bigquery-public-data.github_repos.commits` AS c,
    UNNEST(c.repo_name) AS repo_name
    INNER JOIN `bigquery-public-data.github_repos.files` AS f
    ON f.repo_name = repo_name
    WHERE f.repo_name IN ('tensorflow/tensorflow', 'facebook/react', 'Microsoft/vscode')
    GROUP BY repo_name
    ORDER BY repo_name       
"""

# Guard rail: cap the bytes this job may bill at 300 GB (300 * 10**9 bytes).
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=300 * 10**9)
slow_job = client.query(slow_query, job_config=safe_config)
query_results = slow_job.to_dataframe()
query_results


In [21]:
# Report scan size first, then wall time, for the join-then-aggregate query.
for report_metric in (show_amount_of_data_scanned, show_time_to_run):
    report_metric(client, slow_query)

Data processed: 251.854 GB
Time to run: 9.549 seconds


In [None]:
# For a handful of GitHub repositories, count distinct committers and distinct files.
# This version aggregates each table in its own CTE (one row per repository)
# and only then joins the two small results.
# NOTE(review): the IN lists here include 'torvalds/linux', which slow_query does
# not filter on, so the two queries are not a like-for-like comparison. Confirm
# whether that is intended.

efficient_query = """
    WITH commits AS (
        SELECT COUNT(DISTINCT committer.name) AS num_committers,
               repo_name
        FROM `bigquery-public-data.github_repos.commits`,
        UNNEST(repo_name) AS repo_name
        WHERE repo_name IN ('tensorflow/tensorflow', 'facebook/react', 'Microsoft/vscode', 'torvalds/linux')
        GROUP BY repo_name
    ),
    files AS (
        SELECT COUNT(DISTINCT id) AS num_files,
               repo_name
        FROM `bigquery-public-data.github_repos.files`
        WHERE repo_name IN ('tensorflow/tensorflow', 'facebook/react', 'Microsoft/vscode', 'torvalds/linux')
        GROUP BY repo_name
    )
    SELECT commits.num_committers, files.num_files, files.repo_name
    FROM commits
    INNER JOIN files
    ON commits.repo_name = files.repo_name
    ORDER BY files.repo_name
"""

# Guard rail: cap the bytes this job may bill at 300 GB (300 * 10**9 bytes).
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=300 * 10**9)
efficient_job = client.query(efficient_query, job_config=safe_config)
query_results = efficient_job.to_dataframe()
query_results

In [None]:
# Report scan size first, then wall time, for the aggregate-then-join query.
for report_metric in (show_amount_of_data_scanned, show_time_to_run):
    report_metric(client, efficient_query)