**How to Query the Libraries.io Data (BigQuery Dataset)**

In [1]:
import bq_helper
from bq_helper import BigQueryHelper
# Helper bound to the public Libraries.io BigQuery dataset.
# Intro to the bq_helper package:
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
library = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="libraries_io",
)

In [2]:
# Reuse the helper created in the first cell rather than constructing a second,
# identical client for the same project/dataset ("bigquery-public-data",
# "libraries_io"). Both names now refer to one helper object.
bq_assistant = library

# Show the names of the tables available in the dataset.
bq_assistant.list_tables()

['dependencies',
 'projects',
 'projects_with_repository_fields',
 'repositories',
 'repository_dependencies',
 'tags',
 'versions']

In [3]:
# Preview the "repositories" table, asking the helper for 20 rows.
# NOTE(review): the output recorded below shows only rows 0-9 and elides the
# middle columns with "...", so the saved rendering looks truncated.
bq_assistant.head("repositories", num_rows=20)

Unnamed: 0,id,host_type,name_with_owner,description,fork,created_timestamp,updated_timestamp,last_pushed_timestamp,homepage_url,size,...,security_audit_filename,status,last_synced_timestamp,sourcerank,display_name,scm_type,pull_requests_enabled,logo_url,keywords,an
0,5408655,GitHub,BIG-notor/Exercice,,False,2016-02-26 13:51:14+00:00,2017-04-04 03:25:44+00:00,2016-02-26 14:05:19+00:00,,1,...,,,NaT,3,GitHub,,,,,
1,5547415,GitHub,occidens/org-async,Generalized asynchronous processing for Org Mode,False,2016-01-26 04:39:45+00:00,2017-06-01 17:00:55+00:00,2016-01-26 05:01:09+00:00,,15,...,,,NaT,3,GitHub,,,,,
2,6132536,GitHub,emacsmirror/bfbuilder,A brainfuck development environment with inter...,True,2015-11-04 11:49:23+00:00,2015-11-04 11:49:32+00:00,2015-10-07 06:16:15+00:00,http://hins11.yu-yake.com,188,...,,,NaT,0,GitHub,,,,,
3,7486452,GitHub,dublebuble/lua-web-server,Automatically exported from code.google.com/p/...,False,2016-03-31 04:30:49+00:00,2016-03-31 04:31:34+00:00,2016-03-31 04:33:22+00:00,,64,...,,,NaT,0,GitHub,,,,,
4,9112903,GitHub,dahu/Land-of-newLISP,Selected snippets from Land of Lisp rewritten ...,False,2012-05-17 09:36:50+00:00,2017-01-23 00:16:20+00:00,2012-08-29 10:26:25+00:00,,101,...,,,2016-05-28 09:31:04+00:00,3,GitHub,,,,,
5,5469997,GitHub,jetho/scms,A Continuation Passing Interpreter for Scheme,False,2012-03-31 18:15:48+00:00,2017-06-12 12:32:54+00:00,2013-03-06 20:28:45+00:00,,260,...,,,NaT,2,GitHub,,,,,
6,8007056,GitHub,evaneschneider/cooling_routines,,False,2016-01-11 19:24:19+00:00,2016-04-05 07:42:58+00:00,2016-01-12 18:02:06+00:00,,6415,...,,,NaT,0,GitHub,,,,,
7,716745,GitHub,clubhouse/liberator,Liberator is a Clojure library for building RE...,True,2015-01-14 23:52:36+00:00,2017-03-24 03:46:57+00:00,2015-01-15 00:00:48+00:00,http://clojure-liberator.github.io/liberator,1162,...,,,2016-04-15 08:04:26+00:00,4,GitHub,,,,,
8,10723374,GitHub,johnros/chords,Respondent driven sample population size estim...,False,2016-04-20 10:24:23+00:00,2017-03-10 08:17:59+00:00,2016-06-21 19:34:39+00:00,,2844,...,,,2016-06-04 23:12:08+00:00,2,GitHub,,,,,
9,6371379,GitHub,rakudojs/nqp,Not Quite Perl,True,2013-11-01 16:20:09+00:00,2017-04-07 09:58:51+00:00,2013-11-01 00:45:39+00:00,,75631,...,,,NaT,4,GitHub,,,,,


In [4]:
# Inspect the schema of the "repositories" table. The recorded output is a list
# of SchemaField entries, each showing a column name, type, mode and description.
bq_assistant.table_schema("repositories")

[SchemaField('id', 'INTEGER', 'NULLABLE', 'he unique primary key of the repository in the Libraries.io database.', ()),
 SchemaField('host_type', 'STRING', 'NULLABLE', 'Which website the repository is hosted on, either GitHub, GitLab or Bitbucket.', ()),
 SchemaField('name_with_owner', 'STRING', 'NULLABLE', 'The repository name and owner seperated by a slash, also maps to the url slug on the given repository host e.g. librariesio/libraries.io.', ()),
 SchemaField('description', 'STRING', 'NULLABLE', 'Description of repository.', ()),
 SchemaField('fork', 'BOOLEAN', 'NULLABLE', 'Is the repository a fork of another.', ()),
 SchemaField('created_timestamp', 'TIMESTAMP', 'NULLABLE', 'Timestamp of when the repository was created on the host.', ()),
 SchemaField('updated_timestamp', 'TIMESTAMP', 'NULLABLE', 'Timestamp of when the repository was last saved by Libraries.io.', ()),
 SchemaField('last_pushed_timestamp', 'TIMESTAMP', 'NULLABLE', 'Timestamp of when the repository was last pushed t

How many repositories does each host have, and what are their average size and average number of stars?




In [5]:
# Per hosting site (host_type): number of repositories, mean `size` and mean
# `stars_count` (both rounded to 2 decimals), ordered by repository count,
# largest first, capped at 1000 rows.
query1 = """
SELECT
  host_type,
  COUNT(*) repositories,
  ROUND(AVG(size),2) avg_size,
  ROUND(AVG(stars_count),2) avg_stars
FROM
  `bigquery-public-data.libraries_io.repositories`
GROUP BY
  host_type
ORDER BY
  repositories DESC
LIMIT
  1000;
        """
# No max_gb_scanned is passed here (unlike the later queries), so the helper's
# default scan cap applies -- presumably 1 GB; TODO confirm against bq_helper.
response1 = library.query_to_pandas_safe(query1)
# Display up to the first 10 rows of the result.
response1.head(10)

Unnamed: 0,host_type,repositories,avg_size,avg_stars
0,GitHub,23112891,6267.36,3.16
1,Bitbucket,218487,15275.19,0.0
2,GitLab,199807,5762.56,0.1


What are the top dependencies per platform?



In [6]:
# Per dependency_platform: total number of rows in the dependencies table and
# the 3 most frequent dependency_name values (APPROX_TOP_COUNT returns
# approximate counts), ordered by dependency count, largest first.
# NOTE(review): the recorded output lists 'NPM' and 'Npm' as separate
# platforms, so dependency_platform casing is not normalized by this query --
# confirm whether those rows should be merged.
query2 = """
SELECT
  dependency_platform,
  COUNT(*) dependencies,
  APPROX_TOP_COUNT(dependency_name, 3) top_dependencies
FROM
  `bigquery-public-data.libraries_io.dependencies`
GROUP BY
  dependency_platform
ORDER BY
  dependencies DESC;
        """
# max_gb_scanned=10 raises the scan cap for this query (judging by the
# argument name, to 10 GB -- TODO confirm against bq_helper).
response2 = library.query_to_pandas_safe(query2, max_gb_scanned=10)
# Display up to the first 20 rows of the result.
response2.head(20)

Unnamed: 0,dependency_platform,dependencies,top_dependencies
0,NPM,35985662,"[{'value': 'mocha', 'count': 894580}, {'value'..."
1,Rubygems,3985122,"[{'value': 'rake', 'count': 322632}, {'value':..."
2,Packagist,3135799,"[{'value': 'phpunit/phpunit', 'count': 247529}..."
3,Maven,2782529,"[{'value': 'junit:junit', 'count': 125745}, {'..."
4,CPAN,2421659,"[{'value': 'Test-More', 'count': 139974}, {'va..."
5,NuGet,2087217,"[{'value': 'Newtonsoft.Json', 'count': 114688}..."
6,Cargo,210203,"[{'value': 'libc', 'count': 9354}, {'value': '..."
7,CRAN,206240,"[{'value': 'R', 'count': 37970}, {'value': 'st..."
8,Npm,163567,"[{'value': 'atom-space-pen-views', 'count': 11..."
9,Pypi,105649,"[{'value': 'requests', 'count': 7040}, {'value..."


What are the top unmaintained or deprecated projects?



In [7]:
# Projects whose status is 'Deprecated' or 'Unmaintained', returning name,
# repository_sourcerank, language and status, ordered by repository_sourcerank
# (highest first) and limited to 20 rows.
query3 = """
SELECT
  name,
  repository_sourcerank,
  LANGUAGE,
  status
FROM
  `bigquery-public-data.libraries_io.projects_with_repository_fields`
WHERE
  status IN ('Deprecated',
    'Unmaintained')
ORDER BY
  repository_sourcerank DESC
LIMIT
  20;
        """
# max_gb_scanned=10 raises the scan cap for this query (judging by the
# argument name, to 10 GB -- TODO confirm against bq_helper).
response3 = library.query_to_pandas_safe(query3, max_gb_scanned=10)
# The query already limits the result to 20 rows, so head(20) shows all of it.
response3.head(20)

Unnamed: 0,name,repository_sourcerank,LANGUAGE,status
0,react-linked-input,27,JavaScript,Unmaintained
1,spree,20,Ruby,Deprecated
2,external-react-hot-loader,19,JavaScript,Deprecated
3,jade,18,JavaScript,Deprecated
4,pouchdb-mapreduce,18,JavaScript,Deprecated
5,unittest,18,Dart,Deprecated
6,pouchdb-abstract-mapreduce,18,JavaScript,Deprecated
7,wiredep,17,JavaScript,Unmaintained
8,scss-lint,16,Ruby,Deprecated
9,edp-jscs,16,JavaScript,Deprecated
