## This notebook is used to select top 1000 most downloaded packages

In [2]:
import os
import json
import pandas as pd

In [4]:
# To make the results reproducible, we download the top downloaded package list into disk. The data was collected on 2022-07-01  
popular_packages_path = os.path.abspath("../dataset/metadata/top-pypi-packages-30-days.min.json")

In [5]:
# Read the list of top most downloaded packages and the associated downloads 
with open(popular_packages_path) as data_file:    
    data = json.load(data_file)  

In [7]:
# Loading top PyPI downloaded packages to dataframe for data analysis
df = pd.DataFrame(data["rows"])
print(f"Number of packages: {df.shape[0]}")
df.head()

Number of packages: 5000


Unnamed: 0,download_count,project
0,392553410,boto3
1,218476702,botocore
2,206676797,urllib3
3,205439506,setuptools
4,194347797,requests


In [8]:
# Get top 1000 most downloaded packages as we are interested in them
df_top_1000 = df.head(1000)
print(df_top_1000.head())

   download_count     project
0       392553410       boto3
1       218476702    botocore
2       206676797     urllib3
3       205439506  setuptools
4       194347797    requests


In [9]:
# Saving the list of top pypi packages by downloads to storage
result_file = os.path.abspath("../dataset/metadata/top-1000-downloaded-packages.txt")
with open(result_file, 'w') as f:
    for item in df_top_1000["project"].to_list():
        f.write("%s\n" % item)

In [10]:
# Get total number of downloads of the top 1000 PyPI packages
num_downloads_top_1000_downloaded = df_top_1000["download_count"].sum()
print(f'Total number of downloads of top 1000 downloaded packages: {num_downloads_top_1000_downloaded:,}')

Total number of downloads of top 1000 downloaded packages: 13,313,283,418


In [15]:
# Get the list of top PyPI downloaded package names
top_1000_downloaded_packages = df_top_1000["project"].to_list()
print(f"Number of top PyPI downloaded packages: {len(top_1000_downloaded_packages)}")

Number of top PyPI downloaded packages: 1000


In [16]:
# Load the list of top 1000 packages by the number of dependents
top_1000_packages_by_dependents_file = os.path.abspath("../dataset/metadata/top-pypi-packages-dependents.txt")
with open(top_1000_packages_by_dependents_file) as f:
    top_1000_dependent_packages = f.read().splitlines()
print(f"Number of top dependent packages: {len(top_1000_dependent_packages)}")

Number of top dependent packages: 1000


In [17]:
# Get the package names of the top 5000 pypi downloaded packages
top_5000_downloaded_packages = df["project"].to_list()

In [18]:
# There are some packages that are in the list of top dependent packages but not in the list of top downloaded packages
top_dependent_packages_not_in_top_5000_downloaded = set(top_1000_dependent_packages) - set(top_5000_downloaded_packages)
print(f"Number of top pypi dependent packages but not in top downloaded packages: {len(top_dependent_packages_not_in_top_downloaded)}")

Number of top pypi dependent packages but not in top downloaded packages: 169


In [24]:
# How about comparing the list of top 1000 dependent and downloaded packages
top_dependent_packages_not_in_top_downloaded = set(top_1000_dependent_packages) - set(top_1000_downloaded_packages)
print(f"Number of top dependent packages, but not in top downloaded: {len(top_dependent_packages_not_in_top_downloaded)}")

Number of top dependent packages, but not in top downloaded: 498


In [25]:
# Saving the list of top pypi packages by downloads to storage
result_file = os.path.abspath("../dataset/metadata/top-dependent-packages-not-in-top-downloaded.txt")
with open(result_file, 'w') as f:
    for item in top_dependent_packages_not_in_top_downloaded:
        f.write("%s\n" % item)

In [20]:
# How about comparing the list of top 1000 dependent and downloaded packages
top_downloaded_packages_not_in_top_dependents = set(top_1000_downloaded_packages) - set(top_1000_dependent_packages) 
print(f"Number of top downloaded packages, but not in top dependents: {len(top_downloaded_packages_not_in_top_dependents)}")

Number of top downloaded packages, but not in top dependents: 498


In [21]:
common_packages_downloaded_dependents = set(top_1000_downloaded_packages) & set(top_1000_dependent_packages)
print(f"Number of commom packages between top downloaded and dependents: {len(common_packages_downloaded_dependents)}")

Number of commom packages between top downloaded and dependents: 502


In [22]:
top_downloaded_packages_not_in_top_dependents

{'accelerate',
 'adal',
 'agate',
 'aggdraw',
 'aioboto3',
 'aioitertools',
 'aiosignal',
 'ansible-core',
 'apache-airflow-providers-amazon',
 'apache-airflow-providers-cncf-kubernetes',
 'apache-airflow-providers-http',
 'apache-airflow-providers-mysql',
 'apache-airflow-providers-postgres',
 'apache-beam',
 'applicationinsights',
 'appnope',
 'apscheduler',
 'argon2-cffi-bindings',
 'asciimatics',
 'asttokens',
 'async-generator',
 'async-lru',
 'authlib',
 'automat',
 'autopage',
 'avro',
 'avro-gen',
 'avro-python3',
 'aws-sam-translator',
 'aws-xray-sdk',
 'awswrangler',
 'azure-batch',
 'azure-cli-nspkg',
 'azure-cli-telemetry',
 'azure-cosmos',
 'azure-cosmosdb-nspkg',
 'azure-cosmosdb-table',
 'azure-data-tables',
 'azure-datalake-store',
 'azure-devops',
 'azure-eventgrid',
 'azure-eventhub',
 'azure-graphrbac',
 'azure-keyvault',
 'azure-keyvault-certificates',
 'azure-keyvault-keys',
 'azure-keyvault-secrets',
 'azure-kusto-data',
 'azure-kusto-ingest',
 'azure-loganalytics

In [23]:
top_dependent_packages_not_in_top_downloaded

{'APScheduler',
 'Adafruit-Blinka',
 'Authlib',
 'Babel',
 'Bottleneck',
 'Brotli',
 'CacheControl',
 'CairoSVG',
 'Cartopy',
 'Cerberus',
 'CherryPy',
 'ConfigArgParse',
 'Cython',
 'DateTime',
 'Deprecated',
 'Django',
 'Faker',
 'Fiona',
 'Flask',
 'Flask-Babel',
 'Flask-Caching',
 'Flask-Cors',
 'Flask-Login',
 'Flask-Mail',
 'Flask-Migrate',
 'Flask-RESTful',
 'Flask-SQLAlchemy',
 'Flask-SocketIO',
 'Flask-WTF',
 'GDAL',
 'GitPython',
 'JPype1',
 'Jinja2',
 'Keras-Preprocessing',
 'Kivy',
 'Logbook',
 'Mako',
 'Markdown',
 'MarkupSafe',
 'Paste',
 'Pillow',
 'Pint',
 'Products.CMFCore',
 'Products.CMFPlone',
 'Products.GenericSetup',
 'PuLP',
 'PyAudio',
 'PyAutoGUI',
 'PyGObject',
 'PyGithub',
 'PyHamcrest',
 'PyInquirer',
 'PyJWT',
 'PyMuPDF',
 'PyMySQL',
 'PyNaCl',
 'PyOpenGL',
 'PyPDF2',
 'PyQt5',
 'PyQtWebEngine',
 'PySide2',
 'PySide6',
 'PySimpleGUI',
 'PySocks',
 'PyVISA',
 'PyWavelets',
 'PyYAML',
 'Pygments',
 'QtPy',
 'RPi.GPIO',
 'Rx',
 'SPARQLWrapper',
 'SQLAlchemy',
