In [7]:
import os
import requests
from bs4 import BeautifulSoup

There were a few APIs at the time of this notebook.
Directly at pypi
- https://wiki.python.org/moin/PyPIJSON
- In the format https://pypi.python.org/pypi/<package_name>/json

There is also https://pypistats.org/
 - They ask that any large amount of API access go through Google Big Query

Google Big Query
- https://cloud.google.com/docs/authentication/getting-started
- https://bigquery.cloud.google.com/table/the-psf:pypi.downloads


In [None]:
# May need to install these
#!pip install jk_pypiorgapi
#!pip install pyarrow

## Authenticate and Construct Google Big Query Object

In [3]:
from google.cloud import bigquery
from google.oauth2 import service_account

# TODO(developer): Set key_path to the path to the service account key
#                  file.
key_path = "msdscapstone-db80de77b383.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id,)

# Construct a BigQuery client object.
#client = bigquery.Client()

query = """
    SELECT COUNT(*) AS num_downloads
    FROM `bigquery-public-data.pypi.file_downloads`
    WHERE file.project = 'pytest'
    -- Only query the last 30 days of history
    AND DATE(timestamp)
    BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
    AND CURRENT_DATE()
"""
query_job = client.query(query)  # Make an API request.

print("The query data:")
results = query_job.result()  # Waits for job to complete.
for row in results:
    print("{} downloads".format(row.num_downloads))

The query data:
30963935 downloads


In [None]:
from google.cloud import bigquery
import pyarrow

bqclient = bigquery.Client(credentials=credentials, project=credentials.project_id)
''' 
The below query gets name, version, author,author_email,license,package_type, filename for pypi packages
'''
# Download a table.
table = bigquery.TableReference.from_string(
    "the-psf.pypi.distribution_metadata"
)
rows = bqclient.list_rows(
    table,
    selected_fields=[
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("version", "STRING"),
        bigquery.SchemaField("author", "STRING"),
        bigquery.SchemaField("author_email", "STRING"),
        bigquery.SchemaField("license", "STRING"),
        #bigquery.SchemaField("requires", "STRING"),
        #bigquery.SchemaField("requires_dist", "STRING"),
        bigquery.SchemaField("packagetype", "STRING"),
        bigquery.SchemaField("filename", "STRING")
    ],
)
dataframe = rows.to_dataframe(
    # Optionally, explicitly request to use the BigQuery Storage API. As of
    # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
    # API is used by default.
    #create_bqstorage_client=True,
)

In [47]:
#some stats
print('All=',dataframe.shape)
print('with licenses=',dataframe[dataframe.license.isna()==False].shape)
print('with authors=',dataframe[dataframe.author.isna()==False].shape)
print('with author_email=',dataframe[dataframe.author_email.isna()==False].shape)

All= (5506499, 7)
with licenses= (4121423, 7)
with authors= (5258826, 7)
with author_email= (4866969, 7)


In [21]:
#changing all text to lower for easy querying
df_pypi_meta = dataframe.apply(lambda x: x.astype(str).str.lower())

In [41]:
#data cleaning, last 65 rows do not seem to parse properly..dropping them
df_pypi_meta = df_pypi_meta.loc[0:5506433]


In [49]:
#save the file
df_pypi_meta.to_csv('pypi_author_license.csv')

In [48]:
df_pypi_meta[df_pypi_meta.name=='pandas']

Unnamed: 0,name,version,author,author_email,license,packagetype,filename
13017,pandas,0.24.0,none,none,bsd,pandas-0.24.0-cp35-cp35m-manylinux1_x86_64.whl,bdist_wheel
13075,pandas,0.25.0rc0,none,none,bsd,pandas-0.25.0rc0-cp37-cp37m-manylinux1_x86_64.whl,bdist_wheel
13103,pandas,1.0.0rc0,none,none,bsd,pandas-1.0.0rc0-cp38-cp38-win_amd64.whl,bdist_wheel
13104,pandas,1.0.0rc0,none,none,bsd,pandas-1.0.0rc0-cp38-cp38-win_amd64.whl,bdist_wheel
27049,pandas,0.19.1,the pydata development team,pydata@googlegroups.com,bsd,pandas-0.19.1-cp35-cp35m-manylinux1_i686.whl,bdist_wheel
...,...,...,...,...,...,...,...
5457617,pandas,1.3.3,the pandas development team,pandas-dev@python.org,bsd-3-clause,pandas-1.3.3-cp310-cp310-manylinux_2_17_x86_64...,bdist_wheel
5457618,pandas,1.3.3,the pandas development team,pandas-dev@python.org,bsd-3-clause,pandas-1.3.3-cp37-cp37m-macosx_10_9_x86_64.whl,bdist_wheel
5457619,pandas,1.3.3,the pandas development team,pandas-dev@python.org,bsd-3-clause,pandas-1.3.3-cp37-cp37m-manylinux_2_17_aarch64...,bdist_wheel
5457620,pandas,1.3.3,the pandas development team,pandas-dev@python.org,bsd-3-clause,pandas-1.3.3-cp39-cp39-manylinux_2_5_i686.many...,bdist_wheel


## Pull contributors from github

In [18]:
#https://towardsdatascience.com/all-the-things-you-can-do-with-github-api-and-python-f01790fca131   

<github.MainClass.Github at 0x18ac011bc70>

In [49]:
#goto developer settings on your profile settings and generate this
token = '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'

from github import Github

# First create a Github instance:

# using an access token
g = Github(token)

# Github Enterprise with custom hostname
repo = g.get_repo("pandas-dev/pandas")
contrib = repo.get_contributors()
contrib



<github.PaginatedList.PaginatedList at 0x18ad53b4040>

In [50]:
contrib_list = []
for con in contrib:
	contrib_list.append(con)

In [52]:
contrib_list

[NamedUser(login="jreback"),
 NamedUser(login="jbrockmendel"),
 NamedUser(login="wesm"),
 NamedUser(login="jorisvandenbossche"),
 NamedUser(login="TomAugspurger"),
 NamedUser(login="cpcloud"),
 NamedUser(login="simonjayhawkins"),
 NamedUser(login="gfyoung"),
 NamedUser(login="mroeschke"),
 NamedUser(login="sinhrks"),
 NamedUser(login="adamklein"),
 NamedUser(login="topper-123"),
 NamedUser(login="jtratner"),
 NamedUser(login="changhiskhan"),
 NamedUser(login="WillAyd"),
 NamedUser(login="phofl"),
 NamedUser(login="ShaharNaveh"),
 NamedUser(login="datapythonista"),
 NamedUser(login="jschendel"),
 NamedUser(login="hayd"),
 NamedUser(login="dsaxton"),
 NamedUser(login="rhshadrach"),
 NamedUser(login="attack68"),
 NamedUser(login="MarcoGorelli"),
 NamedUser(login="fangchenli"),
 NamedUser(login="mzeitlin11"),
 NamedUser(login="orbitfold"),
 NamedUser(login="alimcmaster1"),
 NamedUser(login="toobaz"),
 NamedUser(login="jseabold"),
 NamedUser(login="charlesdong1991"),
 NamedUser(login="chris