In [1]:
#import libraries
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

There were a few APIs at the time of this notebook.
Directly at pypi
- https://wiki.python.org/moin/PyPIJSON
- In the format https://pypi.python.org/pypi/<package_name>/json

There is also https://pypistats.org/
 - They ask that any large amount of API access go through Google Big Query

Google Big Query
- https://cloud.google.com/docs/authentication/getting-started
- https://bigquery.cloud.google.com/table/the-psf:pypi.downloads


In [2]:
# May need to install these
#!pip install jk_pypiorgapi
#!pip install pyarrow

## Authenticate and Construct Google Big Query Object

In [2]:
from google.cloud import bigquery
from google.oauth2 import service_account

# TODO(developer): Set key_path to the path to the service account key
#                  file.
key_path = "msdscapstone-db80de77b383.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id,)

## Pypi Distribution Metadata Statistics and Data Download Queries

In [None]:
# Find schema for thsi table here: https://docs.google.com/document/d/1mtZD2iw0TmzfNepCABdMCkhk-jwLgdgFP9awSkccwaw/edit#

In [11]:
#This query gives number of rows in the distribution meta data table
query = """
    SELECT Count(*) 
    FROM `the-psf.pypi.distribution_metadata`
    WHERE license is NOT NULL
"""
query_job = client.query(query)  # Make an API request.
results = query_job.result()  # Waits for job to complete.
#iterate through results object to fetch rows.
for row in results:
    row = str(row)[5:].split(',')[0]
    print(f"Row count with licenses = {row}")

Row count with licenses = 4204374


In [55]:
#This query gives number of rows in the distribution meta data table which have licenses and have the home_page in github
query = """
    SELECT count(*)
    FROM `the-psf.pypi.distribution_metadata`
    WHERE (lower(license)!='none' or lower(license)!='no license' or license is NOT NULL) and
    version NOT LIKE '%%dev%%' and home_page LIKE '%%github%%    
"""
query_job = client.query(query)  # Make an API request.
print("The query data:")
results = query_job.result()  # Waits for job to complete.
#iterate through results object to fetch rows.
for row in results:
    row = str(row)[5:].split(',')[0]
    print(f"rows with license and with home page in github = {row}")

The query data:
rows with license and with home page in github = 2861425


In [None]:
#For understanding more on handling Array String data types in bigquery: 
# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions#array_length

In [120]:
#This query gives number of rows in the distribution meta data table which have licenses and have the home_page in github and also have dependencies
query = """
    SELECT name, version, author, author_email, license, home_page, requires_dist
    FROM `the-psf.pypi.distribution_metadata`
    WHERE (lower(license) <> 'none' or lower(license) <> 'no license' or license is NOT NULL) and
    version NOT LIKE '%%dev%%' and home_page LIKE '%%github%%' and ARRAY_LENGTH(requires_dist)<>0    
"""
query_job = client.query(query)  # Make an API request.
results = query_job.result()  # Waits for job to complete.


In [121]:
#pulling data differently as pyarrow doesn't support array of strings datatype. 
dataframe_meta_with_license_dependecies = pd.DataFrame()

In [None]:
# store row data in tabular format. Use below two lines to get field names from result set
#format_string = "{!s:<16} " * len(results.schema)
#field_names_meta = [field.name for field in results.schema]

In [None]:
#iterate through results and append to dataframe. This maynto be optimal way to fetch results as appending data row by row is taking forever
for row in results:
    dict_rows={'name':row['name'],'version':row['version'],'author':row['author'],\
           'author_email':row['author_email'],'license':row['license'],\
               'home_page':row['home_page'],\
                   'requires_dist':row['requires_dist'] }
    dataframe_meta_with_license_dependecies = dataframe_meta_with_license_dependecies.append((pd.DataFrame().from_dict(dict_rows,orient='index')).T)

In [123]:
dataframe_meta_with_license_dependecies

Unnamed: 0,name,version,author,author_email,license,home_page,requires_dist
0,promo,1.3.3,Tim Mahrt,timmahrt@gmail.com,LICENSE,https://github.com/timmahrt/ProMo,[praatio (~=4.1)]
0,pyacoustics,1.0.7,Tim Mahrt,timmahrt@gmail.com,LICENSE,https://github.com/timmahrt/pyAcoustics,[praatio (~=4.1)]
0,wecrap,1.0.0,hongjinpyo,hojp7874@gmail.com,hongjinpyo,https://github.com/hojp7874/wecrap,"[bs4 (==0.0.1), requests (==2.26.0)]"
0,scigee,0.0.5,Songyan Zhu,soonyenju@foxmail.com,MIT Licence,https://github.com/soonyenju/scigee,"[pygal, geetools]"
0,grub,0.0.12,Thor Whalen,,apache-2.0,https://github.com/thorwhalen/grub,"[py2store, scikit-learn]"
...,...,...,...,...,...,...,...
0,ChefsHatGym,0.1.944,Pablo Barros,pablovin@gmail.com,MIT,https://github.com/pablovin/ChefsHatGYM,"[cloudpickle (==1.6.0), cycler (==0.10.0), fut..."
0,poke-env,0.4.11,Haris Sahovic,contact@sahovic.fr,MIT,https://github.com/hsahovic/poke-env,"[aiologger, gym, numpy, orjson, requests, tabu..."
0,givemedata,0.3.0,Alexander Tatchin,alexander.tatchin@gmail.com,MIT,https://github.com/sancau/givemedata/,"[pandas (>=0.24.2), SQLAlchemy (>=1.3.3), pyya..."
0,borealis-fireworks,0.8.0,Jerry Morrison,j.erry.morrison@gmail.com,MIT,https://github.com/CovertLab/borealis,"[google-cloud-logging (>=2.0.0), google-cloud-..."


In [None]:
#some stats
print('All=',dataframe_meta_with_license_dependecies.shape)
print('with licenses=',dataframe_meta_with_license_dependecies[dataframe_meta_with_license_dependecies.license.isna()==False].shape)
print('with authors=',dataframe_meta_with_license_dependecies[dataframe_meta_with_license_dependecies.author.isna()==False].shape)
print('with author_email=',dataframe_meta_with_license_dependecies[dataframe_meta_with_license_dependecies.author_email.isna()==False].shape)

In [None]:
#sample query to filter by package name
'''
query = """
    SELECT name, version, author, author_email, license, home_page, requires_dist
    FROM `the-psf.pypi.distribution_metadata`
    WHERE (lower(license) <> 'none' or lower(license) <> 'no license' or license is NOT NULL) and
    version NOT LIKE '%%dev%%' and home_page LIKE '%%github%%' and ARRAY_LENGTH(requires_dist)<>0
    and name LIKE 'pandas%%'
    --GROUP BY name,version, license
    --ORDER BY name DESC
    --LIMIT 15
    
"""
query_job = client.query(query)  # Make an API request.
results = query_job.result()  # Waits for job to complete.
dataframe_meta_pandas = pd.DataFrame()
# store row data in tabular format.
for row in results:
    dict_rows={'name':row['name'],'version':row['version'],'author':row['author'],\
           'author_email':row['author_email'],'license':row['license'],\
               'home_page':row['home_page'],'requires':row['requires'],\
                   'requires_dist':row['requires_dist'] }
    dataframe_meta_pandas = dataframe_meta_pandas.append((pd.DataFrame().from_dict(dict_rows,orient='index')).T)
'''

In [97]:
dataframe_meta_with_license_dependecies

Unnamed: 0,name,version,author,author_email,license,home_page,requires,requires_dist
0,pandas-datacube,0.0.2,Tristan Saminadayar,tristan.saminadayar@gmail.com,MIT,https://github.com/tristansaminadayar/pandas-d...,[],"[pandas, sparqlwrapper]"
0,pandas-datacube,0.0.1,Tristan Saminadayar,tristan.saminadayar@gmail.com,MIT,https://github.com/tristansaminadayar/pandas-d...,[],"[pandas, sparqlwrapper]"
0,pandas-genomics,0.8.0,John McGuigan,jrm5100@psu.edu,BSD-3-Clause,https://github.com/HallLab/pandas-genomics/,[],"[numpy (>=1.21,<2.0), pandas (>=1.2,<2.0), cyv..."
0,pandas-genomics,0.8.0,John McGuigan,jrm5100@psu.edu,BSD-3-Clause,https://github.com/HallLab/pandas-genomics/,[],"[numpy (>=1.21,<2.0), pandas (>=1.2,<2.0), cyv..."
0,pandas-x,0.0.4,Aaron Schroeder,aaron@trailzealot.com,MIT,https://github.com/aaron-schroeder/pandas-x,[],"[numpy, pandas]"
...,...,...,...,...,...,...,...,...
0,pandas-stubs,1.2.0.21,Zbigniew Królikowski,zkrolikowski@virtuslab.com,MIT,https://github.com/VirtusLab/pandas-stubs,[],[typing-extensions (>=3.7.4.3) ; python_versio...
0,pandas-pcaxis,0.1,Juha Yrjölä,juha.yrjola@iki.fi,BSD,https://github.com/juyrjola/pandas-pcaxis,[],"[pandas, requests, lark-parser]"
0,pandas-pcaxis,0.2,Juha Yrjölä,juha.yrjola@iki.fi,BSD,https://github.com/juyrjola/pandas-pcaxis,[],"[lark-parser, pandas, requests]"
0,pandas-estat,0.2.7,Shota Imaki,shota.imaki.0801@gmail.com,BSD-3-Clause,https://github.com/simaki/pandas-estat,[],"[pandas (>=1.1.4,<2.0.0), requests (>=2.23.0,<..."


In [124]:
#changing all text to lower for easy querying
df_pypi_meta_with_license = dataframe_meta_with_license_dependecies.apply(lambda x: x.astype(str).str.lower())

In [126]:
#look at sample data
df_pypi_meta_with_license.sample(10)

Unnamed: 0,name,version,author,author_email,license,home_page,requires_dist
0,pyprodigy,0.0.0,leonardo testi,ltesti120a@gmail.com,lgplv3,https://github.com/ltesti/pyprodigy,"['numpy (>=1.9)', 'matplotlib']"
0,ocd,1.1.0,md. jahidul hamid,jahidulhamid@yahoo.com,bsd,https://github.com/neurobin/python-ocd,['packaging']
0,git-punchcard,1.0.0,thomas gläßle,thomas@coldfix.de,unlicense,https://github.com/coldfix/git-punchcard,"['numpy', 'matplotlib', 'docopt', 'pytz']"
0,python-casacore,3.2.0,gijs molenaar,gijs@pythonic.nl,gpl,https://github.com/casacore/python-casacore,"['numpy', 'argparse', 'future', 'six']"
0,vexbot,0.3.6,ben hoff,beohoff@gmail.com,gpl3,https://github.com/benhoff/vexbot,"[""google-api-python-client; extra == 'youtube'..."
0,fyle-db-connector,0.5.0,shwetabh kumar,shwetabh.kumar@fyle.in,mit,https://github.com/fylein/fyle-db-connector,"['typing (==3.7.4.1)', 'pandas (==0.25.2)', 'l..."
0,memformer,0.0.3,phil wang,lucidrains@gmail.com,mit,https://github.com/lucidrains/memformer,"['torch (>=1.6)', 'einops (>=0.3)']"
0,discovery-transition-ds,3.2.77,gigas64,gigas64@opengrass.net,bsd,http://github.com/gigas64/discovery-transition-ds,"['aistac-foundation (>=2.12)', 'pandas (>=1.1)..."
0,pyparam,0.2.3,pwwang,pwwang@pwwang.com,mit,https://github.com/pwwang/pyparam,"['colorama', 'completions', 'python-simpleconf']"
0,deduper,0.0.1rc0,andrei sura,bmi-developers@ad.ufl.edu,mit,https://github.com/ufbmi/onefl-deduper,"['invoke (==0.14.0)', 'mock (==2.0.0)', 'panda..."


In [99]:
#save to csv
df_pypi_meta_with_license.to_csv('pypi_meta_with_license_dependecies.csv')

## Explode the requires_dist field into sperate rows

In [127]:
#save data in a temp dataframe
temp_df = df_pypi_meta_with_license

In [128]:
#make sure the dtaa type is a list datatype for explode function to work
from ast import literal_eval
temp_df['requires_dist'] = temp_df['requires_dist'].apply(literal_eval)

In [129]:
#explode the column
df = temp_df.explode('requires_dist')

In [130]:
#check results
df

Unnamed: 0,name,version,author,author_email,license,home_page,requires_dist
0,promo,1.3.3,tim mahrt,timmahrt@gmail.com,license,https://github.com/timmahrt/promo,praatio (~=4.1)
0,pyacoustics,1.0.7,tim mahrt,timmahrt@gmail.com,license,https://github.com/timmahrt/pyacoustics,praatio (~=4.1)
0,wecrap,1.0.0,hongjinpyo,hojp7874@gmail.com,hongjinpyo,https://github.com/hojp7874/wecrap,bs4 (==0.0.1)
0,wecrap,1.0.0,hongjinpyo,hojp7874@gmail.com,hongjinpyo,https://github.com/hojp7874/wecrap,requests (==2.26.0)
0,scigee,0.0.5,songyan zhu,soonyenju@foxmail.com,mit licence,https://github.com/soonyenju/scigee,pygal
...,...,...,...,...,...,...,...
0,borealis-fireworks,0.9.0,jerry morrison,j.erry.morrison@gmail.com,mit,https://github.com/covertlab/borealis,google-cloud-storage (>=1.28.0)
0,borealis-fireworks,0.9.0,jerry morrison,j.erry.morrison@gmail.com,mit,https://github.com/covertlab/borealis,docker (>=4.1.0)
0,borealis-fireworks,0.9.0,jerry morrison,j.erry.morrison@gmail.com,mit,https://github.com/covertlab/borealis,fireworks (>=1.9.5)
0,borealis-fireworks,0.9.0,jerry morrison,j.erry.morrison@gmail.com,mit,https://github.com/covertlab/borealis,requests (>=2.22.0)


In [88]:
#use this code only if exploding reults in dict field with two keys. Creates two seprate columns for the keys
df = pd.concat([df, df["requires_dist"].apply(pd.Series)], axis=1)
df = df.drop(columns=[0])
df.rename(columns={"v": "dependency"})
df

Unnamed: 0,name,version,author,author_email,license,home_page,requires,requires_dist,0
0,dbs3-pycurl,3.17.0,none,none,"apache license, version 2.0",https://github.com/dmwm/dbs,[],pycurl (==7.19.3),pycurl (==7.19.3)
0,dbs3-client,3.17.0,none,none,"apache license, version 2.0",https://github.com/neilstid/pycurlclient,[],pycurl (==7.43.0.6),pycurl (==7.43.0.6)
0,dbs3-client,3.17.0,none,none,"apache license, version 2.0",https://github.com/neilstid/pycurlclient,[],dbs3-pycurl,dbs3-pycurl
0,cctxpsa,0.0.3,zoeyyy,yzou10@uoguelph.ca,mit,https://github.com/zoeyyyzou/cctx-pcap-safe-an...,[],dpkt,dpkt
0,cctxpsa,0.0.3,zoeyyy,yzou10@uoguelph.ca,mit,https://github.com/zoeyyyzou/cctx-pcap-safe-an...,[],progress,progress
...,...,...,...,...,...,...,...,...,...
0,pandas-datapackage-reader,0.17.0,robert gieseke,robert.gieseke@pik-potsdam.de,bsd,https://github.com/rgieseke/pandas-datapackage...,[],geopandas ; extra == 'tests',geopandas ; extra == 'tests'
0,pandas-datapackage-reader,0.18.0,robert gieseke,robert.gieseke@pik-potsdam.de,bsd,https://github.com/rgieseke/pandas-datapackage...,[],pandas (>=0.24.0),pandas (>=0.24.0)
0,pandas-datapackage-reader,0.18.0,robert gieseke,robert.gieseke@pik-potsdam.de,bsd,https://github.com/rgieseke/pandas-datapackage...,[],requests,requests
0,pandas-datapackage-reader,0.18.0,robert gieseke,robert.gieseke@pik-potsdam.de,bsd,https://github.com/rgieseke/pandas-datapackage...,[],pytest (>=4.1) ; extra == 'tests',pytest (>=4.1) ; extra == 'tests'


In [None]:
#save to csv
df.to_csv('pypi_name_version_author_license_homepg.csv')

# Sample query to pull Downloads data

In [None]:
#This query pulls data from downloads table

query = """
    SELECT  country_code,file
    FROM `bigquery-public-data.pypi.file_downloads`
    WHERE
    DATE(timestamp)
    BETWEEN DATE_TRUNC(DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR), YEAR)
    AND CURRENT_DATE()
   --LIMIT 5 --use for checking sample data before running whole query
"""
query_job = client.query(query)  # Make an API request.
results = query_job.result()  # Waits for job to complete.

In [None]:
dataframe_downloads = pd.DataFrame()
# store row data in tabular format.
for row in results:
    dict_rows={'country_code':row['country_code'],'file':row['file']}
dataframe_downloads = dataframe_downloads.append((pd.DataFrame().from_dict(dict_rows,orient='index')).T)

In [None]:
dataframe_downloads.shape

## Pull contributors from github using API - ignore beyond this
* We may not have to do this becaue Gizem's team already has the data an we can query it.

In [None]:
df_pypi_meta = pd.read_csv('pypi_name_version_author_license.csv')
df_pypi_meta.drop(columns='Unnamed: 0')
df_pypi_meta[df_pypi_meta.name=='statistics']

In [None]:
#https://towardsdatascience.com/all-the-things-you-can-do-with-github-api-and-python-f01790fca131   

In [18]:
#goto developer settings on your profile settings and generate this
token = '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'

from github import Github

# First create a Github instance:

# using an access token
g = Github(token)

# Github Enterprise with custom hostname
repo = g.get_repo("pandas-dev/pandas")
#https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html
#https://github.com/PyGithub/PyGithub/blob/master/github/Repository.py#L1989
contrib = repo.get_contributors()#get_forks()#repo.get_contributors()

<github.PaginatedList.PaginatedList at 0x1648e0419d0>

In [5]:
python_pkg = ['pandas-dev/pandas', 'numpy/numpy', 'scikit-learn/scikit-learn', 'scipy/scipy']

In [51]:
add_contributors = pd.DataFrame()

parsed_list = []
for pkg in python_pkg:
    repo = g.get_repo(pkg)
    print(repo)
    contrib = repo.get_contributors()
    contrib_list = []
    for con in contrib:
        contrib_list.append(con)
    parsed_list = []
    for i in contrib_list:
        i = str(i).split('"')[1]
        i = i[:-2]
        parsed_list.append(i)
        print(parsed_list)
        
    print(pd.DataFrame({'repo':pkg,'contributors_list':[parsed_list]}))
    add_contributors = add_contributors.append(pd.DataFrame({'repo':pkg,'contributors_list':[parsed_list]}))

In [52]:
add_contributors['name'] = ['pandas','numpy','scikit-learn','scipy']
pandas_scikit_learn_scipy_numpy = df_pypi_meta[df_pypi_meta.name.isin(['pandas', 'numpy', 'scikit-learn','scipy'])]
pandas_scikit_learn_scipy_numpy = pandas_scikit_learn_scipy_numpy.drop(columns=['Unnamed: 0'])
pandas_scikit_learn_scipy_numpy = pandas_scikit_learn_scipy_numpy.merge(add_contributors)
pandas_scikit_learn_scipy_numpy.to_csv('pandas_scikit_learn_scipy_numpy.csv')

In [None]:
'''
new_df  = pd.DataFrame()
for i in range(0,3000):
    g = requests.get(f'{contrib}?page={i}&per_page=100')
    if(str(g)[11:14]=='200'):
        print(i,g)
        g_json = json.loads(g.text)
        g_df = pd.json_normalize(g_json)
        #g_df['id'] = i
        new_df = new_df.append(g_df)
new_df
'''