In [None]:
#import libraries
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

There were a few APIs at the time of this notebook.
Directly at pypi
- https://wiki.python.org/moin/PyPIJSON
- In the format https://pypi.python.org/pypi/<package_name>/json

There is also https://pypistats.org/
 - They ask that any large amount of API access go through Google Big Query

Google Big Query
- https://cloud.google.com/docs/authentication/getting-started
- https://bigquery.cloud.google.com/table/the-psf:pypi.downloads


In [2]:
# May need to install these
#!pip install jk_pypiorgapi
#!pip install pyarrow

## Authenticate and Construct Google Big Query Object

In [2]:
from google.cloud import bigquery
from google.oauth2 import service_account

# TODO(developer): Set key_path to the path to the service account key
#                  file.
key_path = "msdscapstone-db80de77b383.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id,)

## Pypi Distribution Metadata Statistics and Data Download Queries

In [None]:
# Find schema for thsi table here: https://docs.google.com/document/d/1mtZD2iw0TmzfNepCABdMCkhk-jwLgdgFP9awSkccwaw/edit#

In [3]:
#This query gives number of rows in the distribution meta data table
query = """
    SELECT Count(*) 
    FROM `the-psf.pypi.distribution_metadata`
    WHERE license is NOT NULL
"""
query_job = client.query(query)  # Make an API request.
results = query_job.result()  # Waits for job to complete.
#iterate through results object to fetch rows.
for row in results:
    row = str(row)[5:].split(',')[0]
    print(f"Row count with licenses = {row}")

Row count with licenses = 4223089


In [55]:
#This query gives number of rows in the distribution meta data table which have licenses and have the home_page in github
query = """
    SELECT count(*)
    FROM `the-psf.pypi.distribution_metadata`
    WHERE (lower(license)!='none' or lower(license)!='no license' or license is NOT NULL) and
    version NOT LIKE '%%dev%%' and home_page LIKE '%%github%%    
"""
query_job = client.query(query)  # Make an API request.
print("The query data:")
results = query_job.result()  # Waits for job to complete.
#iterate through results object to fetch rows.
for row in results:
    row = str(row)[5:].split(',')[0]
    print(f"rows with license and with home page in github = {row}")

The query data:
rows with license and with home page in github = 2861425


In [None]:
#For understanding more on handling Array String data types in bigquery: 
# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions#array_length
#https://cloud.google.com/spanner/docs/array_functions

In [3]:
#This query gives number of rows in the distribution meta data table which have licenses and have the home_page in github and also have dependencies
query = """
    SELECT name, version, author, author_email, license, home_page, ARRAY_TO_STRING(requires_dist,',') as dependency
    FROM `the-psf.pypi.distribution_metadata`
    WHERE (lower(license) <> 'none' or lower(license) <> 'no license' or license is NOT NULL) and
    version NOT LIKE '%%dev%%' and home_page LIKE '%%github%%' and ARRAY_LENGTH(requires_dist)<>0    
"""
query_job = client.query(query)  # Make an API request.
results = query_job.result()  # Waits for job to complete.


In [14]:
%%time
dataframe_meta_with_license_dependecies = results.to_dataframe(
    # Optionally, explicitly request to use the BigQuery Storage API. As of
    # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
    # API is used by default.
    #create_bqstorage_client=True,
)

In [15]:
dataframe_meta_with_license_dependecies

Unnamed: 0,name,version,author,author_email,license,home_page,dependency
0,vmush,0.0.1,Volund,,???,https://github.com/volundmush/vmush,"pymush,django (>=3.2)"
1,pymush,0.0.1,Volund,,???,https://github.com/volundmush/pymush,"athanor,mudstring,rich,passlib,argon2-cffi,pyt..."
2,ephemeris,0.10.7,Galaxy Project and Community,jmchilton@gmail.com,AFL,https://github.com/galaxyproject/ephemeris,"six (>=1.9.0),PyYAML,bioblend (>=0.10.0),Jinja..."
3,deephyper,0.2.5,"Prasanna Balaprakash <pbalapra@anl.gov>, Romai...",,ANL,https://github.com/deephyper/deephyper,"tensorflow (>=2.0.0),tensorflow-probability,nu..."
4,kcli,99.0.202105302125,Karim Boumedhel,karimboumedhel@gmail.com,ASL,http://github.com/karmab/kcli,"argcomplete,netaddr,PyYAML,prettytable,jinja2,..."
...,...,...,...,...,...,...,...
1257990,fastbook,0.0.15,Jeremy Howard and Sylvain Gugger,info@fast.ai,Apache Software License 2.0,https://github.com/fastai/fastbook,"pip,packaging,fastai (>=2.1),ipywidgets,nbdev ..."
1257991,fastai,2.1.8,"Jeremy Howard, Sylvain Gugger, and contributors",info@fast.ai,Apache Software License 2.0,https://github.com/fastai/fastai/tree/master/,"pip,packaging,fastcore (>=1.3.8),torchvision (..."
1257992,kinstabot,0.117.3,"Daniil Okhlopkov, Evgeny Kemerov","danokhlopkov@gmail.com, eskemerov@gmail.com",Apache Software License 2.0,https://github.com/instagrambot/instabot,"certifi (>=2019.11.28),chardet (>=3.0.4),futur..."
1257993,kinstabot,0.117.2,"Daniil Okhlopkov, Evgeny Kemerov","danokhlopkov@gmail.com, eskemerov@gmail.com",Apache Software License 2.0,https://github.com/instagrambot/instabot,"certifi (>=2019.11.28),chardet (>=3.0.4),futur..."


In [16]:
#some stats
print('All=',dataframe_meta_with_license_dependecies.shape)
print('with licenses=',dataframe_meta_with_license_dependecies[dataframe_meta_with_license_dependecies.license.isna()==False].shape)
print('with authors=',dataframe_meta_with_license_dependecies[dataframe_meta_with_license_dependecies.author.isna()==False].shape)
print('with author_email=',dataframe_meta_with_license_dependecies[dataframe_meta_with_license_dependecies.author_email.isna()==False].shape)

All= (1257995, 7)
with licenses= (1257995, 7)
with authors= (1212457, 7)
with author_email= (1077321, 7)


In [17]:
#changing all text to lower for easy querying
df_pypi_meta_with_license = dataframe_meta_with_license_dependecies.apply(lambda x: x.astype(str).str.lower())

In [18]:
#look at sample data
df_pypi_meta_with_license.sample(10)

Unnamed: 0,name,version,author,author_email,license,home_page,dependency
427879,pymartini,0.3.0,kyle barron,kylebarron2@gmail.com,mit,https://github.com/kylebarron/pymartini,"numpy,pytest ; extra == 'test',pytest-benchmar..."
512348,beets-vtalbumartist,1.0.0,rafael parente,rafaelp.dev@gmail.com,mit,https://github.com/rafaelp-dev/beets-vtalbumar...,beets
767051,e-data,0.1.7,vmg,vmayorg@outlook.es,gplv3,https://github.com/uvejota/python-edata,"bs4,pandas,pyjwt,python-dateutil,requests"
828160,credsafe,0.0.24,f̣ộx̣ệ6,foxe6@protonmail.com,agpl-3.0,https://github.com/foxe6-temp/credsafe,"easyrsa (>=0.0.1),omnitools (>=0.0.1)"
1051487,sphinx-click,1.4.1,stephen finucane,stephen@that.guru,mit license,https://github.com/stephenfin/sphinx-click,"pbr (>=2.0),sphinx (<2.0,>=1.5)"
142731,pyibex,1.8.0b1,benoit desrochers,ben.ensta@gmail.com,lgplv3+,https://github.com/benensta/pyibex,vibes
305672,ipwb,0.2021.5.20.1444,mat kelly,me@matkelly.com,mit,https://github.com/oduwsdl/ipwb,"warcio (>=1.5.3),ipfshttpclient (>=0.6.0),flas..."
957991,aws-cdk.aws-amazonmq,1.64.0,amazon web services,none,apache-2.0,https://github.com/aws/aws-cdk,"aws-cdk.core (==1.64.0),constructs (<4.0.0,>=3..."
903812,aws-cdk.aws-codedeploy,1.106.0,amazon web services,none,apache-2.0,https://github.com/aws/aws-cdk,"aws-cdk.aws-autoscaling (==1.106.0),aws-cdk.aw..."
436570,proplot,0.2.1,luke davis,lukelbd@gmail.com,mit,https://lukelbd.github.io/proplot,"matplotlib,lxml"


In [99]:
#save to csv
df_pypi_meta_with_license.to_csv('pypi_meta_with_license_dependecies.csv')

## Explode the requires_dist field into sperate rows

In [34]:
#save data in a temp dataframe
temp_df = df_pypi_meta_with_license.sample(10)

In [41]:
%%time
#check results
df = temp_df.join(temp_df['dependency'].str.split(',', expand=True))

Unnamed: 0,name,version,author,author_email,license,home_page,dependency,0,1,2,...,20,21,22,23,24,25,26,27,28,29
427919,shipane-sdk,1.4.3,sinall,gaoruinan@163.com,mit,https://github.com/sinall/shipane-python-sdk,"apscheduler,bs4,cssselect,html5lib,lxml,pandas...",apscheduler,bs4,cssselect,...,,,,,,,,,,
788385,pfifo,1.1.5,kotone itaya,kotone@sfc.keio.ac.jp,apache,https://github.com/ktnyt/pfifo,pybind11 (>=2.2),pybind11 (>=2.2),,,...,,,,,,,,,,
664789,genice2-cage,2.1,masakazu matsumoto,vitroid@gmail.com,mit,https://github.com/vitroid/genice-cage/,"cycless,attrdict,networkx,numpy,yaplotlib (>=0...",cycless,attrdict,networkx,...,,,,,,,,,,
570444,csr2transmart,0.0.18,gijs kant,gijs@thehyve.nl,mit,https://github.com/thehyve/python_csr2transmart,"click (<8.0,>=7.0),transmart-loader (<1.4.0,>=...",click (<8.0,>=7.0),transmart-loader (<1.4.0,...,,,,,,,,,,
483554,cdrouter,0.4.27,qa cafe,support@qacafe.com,mit,https://github.com/qacafe/cdrouter.py,"future,marshmallow,requests,requests-toolbelt",future,marshmallow,requests,...,,,,,,,,,,
544792,azure-cli-vm,2.2.13,microsoft corporation,azpycli@microsoft.com,mit,https://github.com/azure/azure-cli,"azure-mgmt-msi (==0.2.0),azure-mgmt-authorizat...",azure-mgmt-msi (==0.2.0),azure-mgmt-authorization (==0.50.0),azure-mgmt-compute (==4.4.0),...,,,,,,,,,,
345381,markdown-aafigure,0.1.3,manuel barkhau,mbarkhau@gmail.com,mit,https://github.com/mbarkhau/markdown-aafigure,"markdown,aafigure",markdown,aafigure,,...,,,,,,,,,,
298538,aiomisc,14.1.0,dmitry orlov,me@mosquito.su,mit,https://github.com/mosquito/aiomisc,"colorlog,async-generator ; python_version < ""3...",colorlog,"async-generator ; python_version < ""3.7""",aiohttp ; extra == 'aiohttp',...,pytest-cov (~=2.5.1) ; extra == 'develop',pytest-freezegun (~=0.4.2) ; extra == 'develop',sphinx (>=3.5.1) ; extra == 'develop',sphinx-autobuild ; extra == 'develop',sphinx-intl ; extra == 'develop',timeout-decorator ; extra == 'develop',tox (>=2.4) ; extra == 'develop',raven-aiohttp ; extra == 'raven',uvloop (<1,>=0.14) ; extra == 'uvloop'
1074305,mypy-boto3-fis,1.19.5,vlad emelianov,vlad.emelianov.nz@gmail.com,mit license,https://github.com/vemel/mypy_boto3_builder,"typing-extensions ; python_version < ""3.9""","typing-extensions ; python_version < ""3.9""",,,...,,,,,,,,,,
29673,fsds,0.2.19,james irving,james.irving.phd@gmail.com,gnu general public license v3,https://github.com/jirvingphd/fsds,"numpy (>=1.18),missingno,pandas (>1.0.0),seabo...",numpy (>=1.18),missingno,pandas (>1.0.0),...,,,,,,,,,,


In [None]:
#save to csv
df.to_csv('pypi_name_version_author_license_homepg.csv')

# Sample query to pull Downloads data

In [None]:
#https://medium.com/google-cloud/how-to-work-with-array-and-structs-in-bigquery-9c0a2ea584a6

In [21]:
%%time
#This query pulls data from downloads table

query_downloads = """
    SELECT  country_code, STRUCT(file.project as name), STRUCT(file.version as version)
    FROM `bigquery-public-data.pypi.file_downloads`
    WHERE
    DATE(timestamp)
    BETWEEN DATE_TRUNC(DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY), DAY)
    AND CURRENT_DATE()
    --LIMIT 5 --use for checking sample data before running whole query
"""
query_job_dl = client.query(query_downloads)  # Make an API request.
results_downloads = query_job_dl.result()  # Waits for job to complete.

CPU times: user 11.6 ms, sys: 1.97 ms, total: 13.6 ms
Wall time: 16.9 s


In [42]:
%%time
#This query pulls data from downloads table

query_downloads = """
    SELECT  country_code, ANY_VALUE(STRUCT(file.project)) as name, ANY_VALUE(STRUCT(file.version)) as version,
    Count(STRUCT(file.project)) as num_downloads 
    FROM `bigquery-public-data.pypi.file_downloads`
    WHERE
    DATE(timestamp)
    BETWEEN DATE_TRUNC(DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY), 30DAY)
    AND CURRENT_DATE()
    GROUP BY country_code, file.project ,file.version
    ORDER BY Count(STRUCT(file.project)) DESC
    --LIMIT 5 --use for checking sample data before running whole query
"""
query_job_dl = client.query(query_downloads)  # Make an API request.
results_downloads = query_job_dl.result()  # Waits for job to complete.

CPU times: user 6.94 ms, sys: 4.92 ms, total: 11.9 ms
Wall time: 24.1 s


In [43]:
%%time
dataframe_downloads = results_downloads.to_dataframe(
    # Optionally, explicitly request to use the BigQuery Storage API. As of
    # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
    # API is used by default.
    #create_bqstorage_client=True,
)

CPU times: user 4min 2s, sys: 3.58 s, total: 4min 6s
Wall time: 13min 30s


In [44]:
%%time
dataframe_downloads#.shape

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


Unnamed: 0,country_code,name,version,num_downloads
0,CA,{'project': 'six'},{'version': '1.16.0'},9625437
1,CA,{'project': 'python-dateutil'},{'version': '2.8.2'},9609601
2,CA,{'project': 'urllib3'},{'version': '1.26.7'},9602770
3,CA,{'project': 'jmespath'},{'version': '0.10.0'},9587288
4,CA,{'project': 's3transfer'},{'version': '0.5.0'},9554432
...,...,...,...,...
12405430,NL,{'project': 'mosaik-demo-semver'},{'version': '0.1.0rc20200908125020'},1
12405431,KG,{'project': 'moto'},{'version': '2.2.11'},1
12405432,NL,{'project': 'odoo10-addon-connector-prestashop'},{'version': '10.0.1.1.0.99.dev2'},1
12405433,NL,{'project': 'odoo14-addon-fieldservice-equipme...,{'version': '14.0.1.1.0'},1


In [45]:
dataframe_downloads.to_csv('pypi_downloads_DAY_1028201.csv')