In [2]:
import yaml
from google.cloud import bigquery


In [None]:
# Read the local settings file. It must define "credentials_path", which is
# handed below to the BigQuery client as the service-account JSON key file.
with open("credentials.yml") as config_file:
    # safe_load only builds plain YAML types (no arbitrary Python objects),
    # and the context manager closes the file handle deterministically.
    config = yaml.safe_load(config_file)

client = bigquery.Client.from_service_account_json(config["credentials_path"])


In [3]:
# Handles for the public GitHub dataset and the two tables explored below.
dataset_ref = bigquery.DatasetReference(
    project="bigquery-public-data",
    dataset_id="github_repos",
)
licenses_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id="licenses")
sample_files_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id="sample_files")

# Resolve each reference to a table object through the client.
licenses_table = client.get_table(table=licenses_ref)
sample_files_table = client.get_table(table=sample_files_ref)


In [4]:
# Peek at the first five rows of the licenses table as a DataFrame.
client.list_rows(
    table=licenses_table,
    max_results=5,
).to_dataframe()


Unnamed: 0,repo_name,license
0,autarch/Dist-Zilla-Plugin-Test-TidyAll,artistic-2.0
1,thundergnat/Prime-Factor,artistic-2.0
2,kusha-b-k/Turabian_Engin_Fan,artistic-2.0
3,onlinepremiumoutlet/onlinepremiumoutlet.github.io,artistic-2.0
4,huangyuanlove/LiaoBa_Service,artistic-2.0


In [5]:
# Peek at the first five rows of the sample_files table as a DataFrame.
client.list_rows(
    table=sample_files_table,
    max_results=5,
).to_dataframe()


Unnamed: 0,repo_name,ref,path,mode,id,symlink_target
0,EOL/eol,refs/heads/master,generate/vendor/railties,40960,0338c33fb3fda57db9e812ac7de969317cad4959,/usr/share/rails-ruby1.8/railties
1,np/ling,refs/heads/master,tests/success/merger_seq_inferred.t/merger_seq...,40960,dd4bb3d5ecabe5044d3fa5a36e0a9bf7ca878209,../../../fixtures/all/merger_seq_inferred.ll
2,np/ling,refs/heads/master,fixtures/sequence/lettype.ll,40960,8fdf536def2633116d65b92b3b9257bcf06e3e45,../all/lettype.ll
3,np/ling,refs/heads/master,fixtures/failure/wrong_order_seq3.ll,40960,c2509ae1196c4bb79d7e60a3d679488ca4a753e9,../all/wrong_order_seq3.ll
4,np/ling,refs/heads/master,issues/sequence/keep.t,40960,5721de3488fb32745dfc11ec482e5dd0331fecaf,../keep.t


In [6]:
# Count the rows of `sample_files` per license by joining each file to its
# repository's license on `repo_name`; largest counts come first.
query = """
    SELECT l.license, COUNT(1) as number_of_files
    FROM `bigquery-public-data.github_repos.licenses` AS l
    INNER JOIN `bigquery-public-data.github_repos.sample_files` AS sf
    ON l.repo_name = sf.repo_name
    GROUP BY l.license
    ORDER BY number_of_files DESC
"""

# Cap the bytes billed for this job at 10 GB (10**10 bytes).
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 10_000_000_000
query_job = client.query(query=query, job_config=safe_config)
query_results = query_job.to_dataframe()
query_results

Unnamed: 0,license,number_of_files
0,mit,20560894
1,gpl-2.0,16608922
2,apache-2.0,7201141
3,gpl-3.0,5107676
4,bsd-3-clause,3465437
5,agpl-3.0,1372100
6,lgpl-2.1,799664
7,bsd-2-clause,692357
8,lgpl-3.0,582277
9,mpl-2.0,457000


In [7]:
# Handles for the public Stack Overflow dataset: answers and questions tables.
# Note: this rebinds `dataset_ref` to the stackoverflow dataset.
dataset_ref = bigquery.DatasetReference(
    project="bigquery-public-data",
    dataset_id="stackoverflow",
)
posts_answers_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id="posts_answers")
posts_questions_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id="posts_questions")

# Resolve each reference to a table object through the client.
posts_answers_table = client.get_table(table=posts_answers_ref)
posts_questions_table = client.get_table(table=posts_questions_ref)


In [9]:
# Peek at the first five rows of the posts_answers table as a DataFrame.
client.list_rows(
    table=posts_answers_table,
    max_results=5,
).to_dataframe()


Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,18,,<p>For a table like this:</p>\n\n<pre><code>CR...,,,2,NaT,2008-08-01 05:12:44.193000+00:00,,2016-06-02 05:56:26.060000+00:00,2016-06-02 05:56:26.060000+00:00,Jeff Atwood,126039,phpguy,,17,2,59,,
1,165,,"<p>You can use a <a href=""http://sharpdevelop....",,,0,NaT,2008-08-01 18:04:25.023000+00:00,,2019-04-06 14:03:51.080000+00:00,2019-04-06 14:03:51.080000+00:00,,1721793,user2189331,,145,2,10,,
2,1028,,<p>The VB code looks something like this:</p>\...,,,0,NaT,2008-08-04 04:58:40.300000+00:00,,2013-02-07 13:22:14.680000+00:00,2013-02-07 13:22:14.680000+00:00,,395659,user2189331,,947,2,8,,
3,1073,,<p>My first choice would be a dedicated heap t...,,,0,NaT,2008-08-04 07:51:02.997000+00:00,,2015-09-01 17:32:32.120000+00:00,2015-09-01 17:32:32.120000+00:00,,45459,user2189331,,1069,2,29,,
4,1260,,<p>I found the answer. all you have to do is a...,,,0,NaT,2008-08-04 14:06:02.863000+00:00,,2016-12-20 08:38:48.867000+00:00,2016-12-20 08:38:48.867000+00:00,,1221571,Jin,,1229,2,1,,


In [11]:
# Display the column definitions of posts_answers (a list of SchemaField
# entries: column name, type, mode, ...), e.g. to find the join key.
posts_answers_table.schema

[SchemaField('id', 'INTEGER', 'NULLABLE', None, None, (), None),
 SchemaField('title', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('body', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('accepted_answer_id', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('answer_count', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('comment_count', 'INTEGER', 'NULLABLE', None, None, (), None),
 SchemaField('community_owned_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('creation_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('favorite_count', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('last_activity_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('last_edit_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('last_editor_display_name', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('last_editor_user_id', 'INTEGER', 'NULLABLE', None, None, (), None),
 Sc

In [10]:
# Peek at the first five rows of the posts_questions table as a DataFrame.
client.list_rows(
    table=posts_questions_table,
    max_results=5,
).to_dataframe()


Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,320268,Html.ActionLink doesn’t render # properly,<p>When using Html.ActionLink passing a string...,,0,0,NaT,2008-11-26 10:42:37.477000+00:00,0,2009-02-06 20:13:54.370000+00:00,NaT,,,Paulo,,,1,0,asp.net-mvc,390
1,324003,Primitive recursion,<p>how will i define the function 'simplify' ...,,0,0,NaT,2008-11-27 15:12:37.497000+00:00,0,2012-09-25 19:54:40.597000+00:00,2012-09-25 19:54:40.597000+00:00,Marcin,1288.0,,41000.0,,1,0,haskell|lambda|functional-programming|lambda-c...,497
2,390605,While vs. Do While,<p>I've seen both the blocks of code in use se...,390608.0,0,0,NaT,2008-12-24 01:49:54.230000+00:00,2,2008-12-24 03:08:55.897000+00:00,NaT,,,Unkwntech,115.0,,1,0,language-agnostic|loops,11262
3,413246,Protect ASP.NET Source code,<p>Im currently doing some research in how to ...,,0,0,NaT,2009-01-05 14:23:51.040000+00:00,0,2009-03-24 21:30:22.370000+00:00,2009-01-05 14:42:28.257000+00:00,Tom Anderson,13502.0,Velnias,,,1,0,asp.net|deployment|obfuscation,4823
4,454921,"Difference between ""int[] myArray"" and ""int my...",<blockquote>\n <p><strong>Possible Duplicate:...,454928.0,0,0,NaT,2009-01-18 10:22:52.177000+00:00,0,2009-01-18 10:30:50.930000+00:00,2017-05-23 11:49:26.567000+00:00,,-1.0,Evan Fosmark,49701.0,,1,0,java|arrays,798


In [12]:
# Display the column definitions of posts_questions (a list of SchemaField
# entries: column name, type, mode, ...).
posts_questions_table.schema

[SchemaField('id', 'INTEGER', 'NULLABLE', None, None, (), None),
 SchemaField('title', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('body', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('accepted_answer_id', 'INTEGER', 'NULLABLE', None, None, (), None),
 SchemaField('answer_count', 'INTEGER', 'NULLABLE', None, None, (), None),
 SchemaField('comment_count', 'INTEGER', 'NULLABLE', None, None, (), None),
 SchemaField('community_owned_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('creation_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('favorite_count', 'INTEGER', 'NULLABLE', None, None, (), None),
 SchemaField('last_activity_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('last_edit_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('last_editor_display_name', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('last_editor_user_id', 'INTEGER', 'NULLABLE', None, None, (), None),


In [18]:
# Files per license in the GitHub sample (licenses joined to sample_files on
# repo_name), ordered from the most to the least common license.
query = """
    SELECT l.license, COUNT(1) as number_of_files
    FROM `bigquery-public-data.github_repos.licenses` AS l
    INNER JOIN `bigquery-public-data.github_repos.sample_files` AS sf
    ON l.repo_name = sf.repo_name
    GROUP BY l.license
    ORDER BY number_of_files DESC
"""

# 10 GB ceiling on the bytes billed for this job.
safe_config = bigquery.QueryJobConfig(
    maximum_bytes_billed=10 * 10**9,
)
query_job = client.query(
    query,
    job_config=safe_config,
)
query_results = query_job.to_dataframe()
query_results

Unnamed: 0,license,number_of_files
0,mit,20560894
1,gpl-2.0,16608922
2,apache-2.0,7201141
3,gpl-3.0,5107676
4,bsd-3-clause,3465437
5,agpl-3.0,1372100
6,lgpl-2.1,799664
7,bsd-2-clause,692357
8,lgpl-3.0,582277
9,mpl-2.0,457000


In [19]:
# Return the `id`, `body` and `owner_user_id` columns from the `posts_answers`
# table for answers to "bigquery"-related questions. Answers are matched to
# their question through `parent_id`; a question qualifies when its `tags`
# string contains "bigquery".
query = """
    SELECT a.id, a.body, a.owner_user_id
    FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
    INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
    ON q.id = a.parent_id
    WHERE q.tags LIKE '%bigquery%'
"""

# The cap is 270 GB (same value as the former `27 * 10**10`, which was
# mislabelled "10 GB").
# NOTE(review): this is 27x the 10 GB cap used by the other queries in this
# notebook -- confirm 270 GB is intended (27 GB may have been meant).
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=270 * 10**9)  # 270 GB
query_job = client.query(query, job_config=safe_config)
query_results = query_job.to_dataframe()
query_results


Unnamed: 0,id,body,owner_user_id
0,59374047,<p>There is a fix fully deployed for the previ...,11889760
1,59378418,<p>Not a stupid question... this happens more ...,10989606
2,59385674,"<p>Are you looking for : </p>\n\n<p><a href=""h...",8402583
3,59385786,<p>You can generate script ..</p>\n\n<ol>\n<li...,2274648
4,59388357,<p>In regards to using SSIS to load the data f...,9146820
...,...,...,...
27802,30244113,<p>The underlying bug here has been fixed as o...,1366527
27803,30197788,"<p>There isn't a simple way to do this, but yo...",1366527
27804,30172227,<p><strong>Note</strong>: Everything in this p...,1366527
27805,30019879,<p>The only fields available in the <code>TABL...,1366527


In [22]:
# One row per user who answered at least one question whose tags include the
# string "bigquery". The result has two columns:
# - `user_id` - the `owner_user_id` column from the `posts_answers` table
# - `number_of_answers` - how many answers that user wrote to
#   "bigquery"-related questions
# Rows are ordered by `number_of_answers`, highest first.
query = """
    SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
    FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
    INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
    ON q.id = a.parent_id
    WHERE q.tags LIKE '%bigquery%'
    GROUP BY a.owner_user_id
    HAVING number_of_answers >= 1
    ORDER BY number_of_answers DESC
"""

# The cap is 270 GB (same value as the former `27 * 10**10`, which was
# mislabelled "10 GB").
# NOTE(review): this is 27x the 10 GB cap used by the other queries in this
# notebook -- confirm 270 GB is intended.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=270 * 10**9)  # 270 GB
query_job = client.query(query, job_config=safe_config)
query_results = query_job.to_dataframe()
query_results


Unnamed: 0,user_id,number_of_answers
0,5221944,5203
1,1144035,1634
2,132438,898
3,6253347,737
4,1366527,620
...,...,...
6366,2643353,1
6367,459863,1
6368,12919986,1
6369,500776,1


In [23]:
def expert_finder(client, topic, maximum_bytes_billed=10**10):
    """Rank Stack Overflow users by their answers to questions tagged ``topic``.

    Joins ``posts_questions`` to ``posts_answers`` on ``q.id = a.parent_id``,
    keeps the questions whose ``tags`` string contains ``topic``, and counts
    the answers per ``owner_user_id``.

    Args:
        client: BigQuery client used to run the query; it must expose
            ``query(sql, job_config=...)``.
        topic: Substring to look for in the question ``tags`` column. It is
            sent as the ``@topic`` query parameter rather than spliced into
            the SQL text, so quotes in it cannot alter the statement. ``%``
            and ``_`` inside it still act as LIKE wildcards.
        maximum_bytes_billed: Billing cap placed on the job configuration.
            Defaults to 10**10 bytes (10 GB).

    Returns:
        The value of ``query_job.to_dataframe()``: columns ``user_id`` and
        ``number_of_answers``, ordered by ``number_of_answers`` descending.
    """
    # Plain string on purpose (no f-string): the topic travels as a bound
    # parameter and is wrapped in '%' wildcards on the SQL side.
    query = """
        SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
        FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
        INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
        ON q.id = a.parent_id
        WHERE q.tags LIKE CONCAT('%', @topic, '%')
        GROUP BY a.owner_user_id
        HAVING number_of_answers >= 1
        ORDER BY number_of_answers DESC
    """

    safe_config = bigquery.QueryJobConfig(
        query_parameters=[bigquery.ScalarQueryParameter("topic", "STRING", topic)],
        maximum_bytes_billed=maximum_bytes_billed,
    )
    query_job = client.query(query, job_config=safe_config)

    return query_job.to_dataframe()


In [24]:
# Rank users by the number of answers they wrote to "bigquery"-tagged questions.
expert_finder(client=client, topic="bigquery")


Unnamed: 0,user_id,number_of_answers
0,5221944,5203
1,1144035,1634
2,132438,898
3,6253347,737
4,1366527,620
...,...,...
6366,805870,1
6367,927477,1
6368,1678652,1
6369,562508,1
