### This notebook is used for downloading the latest releases of the PyPI packages. To do so, we need:
- Extract the metadata of a package
- Compare the timestamps of the releases to get the most recent release
- Download the latest reelease with the URL provided in the meatadata

In [1]:
from urllib.request import urlopen, Request, urlretrieve
import json
from datetime import date, time, datetime
from dateutil import parser
import os
from urllib.error import HTTPError
from collections import Counter
from typing import Dict
import tarfile
import zipfile

In [2]:
def get_package_metadata(package_name: str) -> Dict[str, str]:
    """
    Get package metadata
    :return: dictionary data containing package info (E.g., homepage, codepage)
    """
    package_json_url = f"https://pypi.org/pypi/{package_name}/json"
    with urlopen(package_json_url) as response:
        data = response.read().decode()
        package_metadata = json.loads(data)
    return package_metadata  

In [3]:
def count_num_releases(package_name: str) -> int:
    """
    Return the number of releases of a package
    """
    metadata = get_package_metadata(package_name)
    for release in metadata["releases"].values():
        if release:
            return len(release)
        else:
            return 0

In [4]:
def get_latest_release_url(package_metadata: Dict[str, str]) -> str:
    """
    Get the URL of the latest release
    """
    maximum_time = datetime.min
    release_url = None
    for release in package_metadata["releases"].values():
        #print(release)
        if release:
            for i in release:
                upload_time = parser.parse(i["upload_time"])
                if upload_time > maximum_time:
                    maximum_time = upload_time
                    release_url = i["url"]
    return release_url

In [20]:
def download_latest_release(package_name: str, dest_dir: str) -> str:
    """
    Downloading the latest release of a package
    :return: the path to the latest release on the disk
    """
    metadata = get_package_metadata(package_name)
    latest_release_url = get_latest_release_url(metadata)
    latest_release_filename = latest_release_url.split("/")[-1]
    dest_filepath = os.path.join(dest_dir, latest_release_filename)
    if not os.path.exists(dest_filepath):
        urlretrieve(latest_release_url, dest_filepath)
    return dest_filepath

In [21]:
# Extracting all artifacts
def uncompressing_artifact(package_name, artifact_file, dst_dir):
    extracted_dir = os.path.join(dst_dir, package_name)
  
    if not os.path.exists(extracted_dir):
        os.mkdir(extracted_dir)
        
        try:
            if str(artifact_file).endswith(".tar.gz"):
                tfile = tarfile.open(artifact_file)
                tfile.extractall(extracted_dir)
                tfile.close()
            elif str(artifact_file).endswith(".whl"):
                with zipfile.ZipFile(artifact_file, 'r') as zip_ref:
                    zip_ref.extractall(extracted_dir)

        except Exception as e:
            print(filepath, e)

In [16]:
# Load the list of 1000 random packages
a_thousand_random_packages_filepath = os.path.abspath("../dataset/metadata/a-thousand-pypi-packages.txt")
with open(a_thousand_random_packages_filepath) as f:
    a_thousand_random_packages = f.read().splitlines()
print(f"Number of randomly PyPI packages: {len(set(a_thousand_random_packages))}")

Number of randomly PyPI packages: 1000


In [17]:
# Downloading the releases of the random packages
dest_dir = os.path.abspath("../dataset/random-packages/")
for package in a_thousand_random_packages:
    try:
        release_path = download_latest_release(package, dest_dir)
        uncompressing_artifacts(package, release_path, dest_dir)
        os.remove(release_path)
    except Exception as e:
        print(package, e)
        pass

memex 'NoneType' object has no attribute 'split'
tdebuilder 'NoneType' object has no attribute 'split'
tngems-sousou 'NoneType' object has no attribute 'split'
atlassian-analytics-utils HTTP Error 404: Not Found
towel-bootstrap 'NoneType' object has no attribute 'split'
caafindert HTTP Error 404: Not Found
resotocommon HTTP Error 404: Not Found
cloudkeeper-plugin-jira HTTP Error 404: Not Found
ngramtable HTTP Error 404: Not Found
process 'NoneType' object has no attribute 'split'
pyramid-secrets HTTP Error 404: Not Found
trajectory-distance-py3 HTTP Error 404: Not Found
pipc 'NoneType' object has no attribute 'split'
deferrable HTTP Error 404: Not Found
feri-urnik HTTP Error 404: Not Found
flask-cms-core 'NoneType' object has no attribute 'split'
mytestmodule 'NoneType' object has no attribute 'split'
attrhelpers-msalib HTTP Error 404: Not Found
noodleflow 'NoneType' object has no attribute 'split'
battleship-game 'NoneType' object has no attribute 'split'
neuron0 HTTP Error 404: Not F

### Random packages that are not available
memex 'NoneType' object has no attribute 'split'
tdebuilder 'NoneType' object has no attribute 'split'
tngems-sousou 'NoneType' object has no attribute 'split'
atlassian-analytics-utils HTTP Error 404: Not Found
towel-bootstrap 'NoneType' object has no attribute 'split'
caafindert HTTP Error 404: Not Found
resotocommon HTTP Error 404: Not Found
cloudkeeper-plugin-jira HTTP Error 404: Not Found
ngramtable HTTP Error 404: Not Found
process 'NoneType' object has no attribute 'split'
pyramid-secrets HTTP Error 404: Not Found
trajectory-distance-py3 HTTP Error 404: Not Found
pipc 'NoneType' object has no attribute 'split'
deferrable HTTP Error 404: Not Found
feri-urnik HTTP Error 404: Not Found
flask-cms-core 'NoneType' object has no attribute 'split'
mytestmodule 'NoneType' object has no attribute 'split'
attrhelpers-msalib HTTP Error 404: Not Found
noodleflow 'NoneType' object has no attribute 'split'
battleship-game 'NoneType' object has no attribute 'split'
neuron0 HTTP Error 404: Not Found
torchcast HTTP Error 404: Not Found
pysav HTTP Error 404: Not Found
source-page HTTP Error 404: Not Found
easy-eval HTTP Error 404: Not Found
django-neue-transmeta HTTP Error 404: Not Found
cambir HTTP Error 404: Not Found
forecasting HTTP Error 404: Not Found
mylinux 'NoneType' object has no attribute 'split'
bpmn 'NoneType' object has no attribute 'split'
spork 'NoneType' object has no attribute 'split'
temop HTTP Error 404: Not Found
lextenglibtest 'NoneType' object has no attribute 'split'
inotify-watcher HTTP Error 404: Not Found
covid19-info-stats HTTP Error 404: Not Found
aatree 'NoneType' object has no attribute 'split'
nesterxj 'NoneType' object has no attribute 'split'
alp-proj HTTP Error 404: Not Found
ml-prepr HTTP Error 404: Not Found
brambl-py HTTP Error 404: Not Found
raio HTTP Error 404: Not Found
fb-messeges HTTP Error 404: Not Found
dlgr 'NoneType' object has no attribute 'split'
oaktree 'NoneType' object has no attribute 'split'

In [23]:
# Load additional random packages
additional_thousand_random_packages_filepath = os.path.abspath("../dataset/metadata/additional-thousand-pypi-packages.txt")
with open(additional_thousand_random_packages_filepath) as f:
    additional_thousand_random_packages = f.read().splitlines()
print(f"Number of top additional random PyPI packages: {len(set(additional_thousand_random_packages))}")

Number of top additional random PyPI packages: 1000


In [24]:
# Downloading additional ramdomly releases because there are some of the original randomly selected packages 
# are not available
dest_dir = os.path.abspath("../dataset/random-packages/")

for package in additional_thousand_random_packages[:50]:
    try:
        release_path = download_latest_release(package, dest_dir)
        uncompressing_artifacts(package, release_path, dest_dir)
        os.remove(release_path)
    except Exception as e:
        print(package, e)
        pass



ia 'NoneType' object has no attribute 'split'


In [18]:
# Load the list of top 1000 packages by the number of dependents
top_1000_downloaded_packages_filepath = os.path.abspath("../dataset/metadata/top-pypi-packages-downloads.txt")
with open(top_1000_downloaded_packages_filepath) as f:
    top_1000_downloaded_packages = f.read().splitlines()
print(f"Number of top downloaded PyPI packages: {len(set(top_1000_downloaded_packages))}")

Number of top downloaded PyPI packages: 1000


In [22]:
# Downloading the releaes of the most downloaded packages
dest_dir = os.path.abspath("../dataset/popular-packages/")
for package in top_1000_downloaded_packages:
    try:
        release_path = download_latest_release(package, dest_dir)
        uncompressing_artifacts(package, release_path, dest_dir)
        os.remove(release_path)
    except Exception as e:
        print(package, e)
        pass

In [10]:
# Load the list of top 1000 packages by the number of dependents
top_dependent_packages_not_in_top_downloaded_filepath = os.path.abspath("../dataset/metadata/top-dependent-packages-not-in-top-downloaded.txt")
with open(top_dependent_packages_not_in_top_downloaded_filepath) as f:
    top_dependent_packages_not_in_top_downloaded = f.read().splitlines()
print(f"Number of top dependent packages, but not in top downloaded: {len(set(top_dependent_packages_not_in_top_downloaded))}")

Number of top dependent packages, but not in top downloaded: 498


In [11]:
# Downloading the releaes of the most downloaded packages
dest_dir = os.path.abspath("../dataset/popular-packages/")
for package in top_dependent_packages_not_in_top_downloaded:
    try:
        download_latest_release(package, dest_dir)
    except Exception as e:
        print(package, e)
        pass

time 'NoneType' object has no attribute 'split'
json HTTP Error 404: Not Found
odoo 'NoneType' object has no attribute 'split'


In [32]:
# Downloading additional release
num_required_releases = 1000
dest_dir = os.path.abspath("../dataset/random-packages/")
num_current_random_releases = len(os.listdir(dest_dir))
missing_releases = (num_required_releases - num_current_random_releases)
print(missing_releases)

8
