In [None]:
import os
import glob
import magic
from pathlib import Path
import shutil
import subprocess
import tarfile
import zipfile

In [None]:
# Extracting all artifacts
def uncompressing_artifact(package_name, artifact_file, dst_dir):
    """Extract a package artifact into ``<dst_dir>/<package_name>``.

    Nothing happens when the target directory already exists, so a package
    is unpacked at most once.

    Args:
        package_name: Name of the sub-directory created under ``dst_dir``.
        artifact_file: Path (``str`` or path-like) of the archive. Names
            ending in ``.tar.gz`` are opened as tar archives; names ending
            in ``.whl``, ``.zip`` or ``.egg`` are opened as zip archives.
        dst_dir: Parent directory for the extracted content; it is created
            (with any missing parents) when needed.

    Note:
        For any other file extension the target directory is still created
        but left empty.
    """
    extracted_dir = os.path.join(dst_dir, package_name)
    if os.path.exists(extracted_dir):
        return

    # makedirs (rather than mkdir) so a missing dst_dir does not abort the run.
    os.makedirs(extracted_dir)

    artifact_name = str(artifact_file)
    # NOTE(review): archives are extracted without member filtering; if they
    # come from an untrusted source, consider tarfile extraction filters.
    if artifact_name.endswith(".tar.gz"):
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(artifact_file) as tar_ref:
            tar_ref.extractall(extracted_dir)
    elif artifact_name.endswith((".whl", ".zip", ".egg")):
        with zipfile.ZipFile(artifact_file, 'r') as zip_ref:
            zip_ref.extractall(extracted_dir)

In [None]:
# Releases are fetched with OSSGadget's oss-download CLI. The relative path
# below (the Debug/net6.0 build output under ../scanners) is resolved against
# the working directory at the time this cell runs.
oss_download_path = os.path.abspath(
    "../scanners/OSSGadget/src/oss-download/bin/Debug/net6.0/oss-download"
)

In [None]:
# Placeholder value: replace it with the path of the text file that lists the
# package names before running the download.
packages_list_path = "LIST TO FILE CONTAINING PACKAGE NAMES"

# Root directory that receives the downloaded packages, resolved against the
# working directory at the time this cell runs.
packages_downloaded_dir = os.path.abspath(
    "../dataset/popular-packages"
)

In [None]:
def _parse_release_version(metadata_log):
    """Pull the release version out of oss-download's metadata log text.

    The text after the last ``@`` is split on ``-`` and the first fragment
    is taken as the version.
    NOTE(review): assumes the log looks like ``...<name>@<version>-...`` —
    confirm against the actual oss-download output.
    """
    fragments = metadata_log.split("@")[-1].split("-")
    if len(fragments) > 1:
        version = fragments[0]
    else:
        # In some cases the name is not in the expected format; drop a
        # trailing source-archive suffix instead.
        version = fragments[0].replace(".tar.gz", "")
    # Strip stray whitespace/newlines so they never end up in a directory name.
    return version.strip()


def _extract_zip_artifact(artifact, dst_dir):
    """Unpack a zip archive into ``dst_dir`` and delete the archive."""
    with zipfile.ZipFile(artifact, 'r') as zip_ref:
        zip_ref.extractall(dst_dir)
    # Remove only after the archive has been closed.
    os.remove(artifact)


def _extract_preferred_artifact(artifacts, dst_dir):
    """Unpack one artifact into ``dst_dir``, preferring source archives.

    Source archives (``.tar.gz`` / ``.zip``) come first because they carry
    ``setup.py``; only if none matches is a zip-typed artifact used as a
    fallback. The extracted archive is deleted. Nothing happens when
    ``artifacts`` is empty or no artifact matches.
    """
    # NOTE(review): archives are extracted without member filtering although
    # they are downloaded from a package index; consider tarfile extraction
    # filters / path checks if the content cannot be trusted.
    detected_types = {}
    for artifact in artifacts:
        artifact_type = magic.from_file(artifact).lower()
        detected_types[artifact] = artifact_type

        if artifact.endswith(".tar.gz") and artifact_type.startswith("gzip"):
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(artifact) as tar_ref:
                tar_ref.extractall(dst_dir)
            os.remove(artifact)
            return

        # In some cases oss-download stores zip content under a .tar.gz name.
        if artifact.endswith((".tar.gz", ".zip")) and artifact_type.startswith("zip"):
            _extract_zip_artifact(artifact, dst_dir)
            return

    # No source archive matched: fall back to any zip-typed artifact. The
    # scan starts from the end so the last artifact is still preferred.
    for artifact in reversed(artifacts):
        if detected_types[artifact].startswith("zip"):
            _extract_zip_artifact(artifact, dst_dir)
            return


def download_packages(packages_list_path, packages_dir):
    """Download and unpack a release of every PyPI package named in a list file.

    For each non-blank line of ``packages_list_path`` the function:

    1. runs ``oss-download --download-metadata-only`` and derives the release
       version from its stderr output;
    2. skips the package if ``<packages_dir>/<package>/<version>`` already
       exists;
    3. otherwise downloads the package into that directory, extracts one
       artifact (source archives preferred) and deletes every regular file
       left directly inside the directory.

    Args:
        packages_list_path: Text file with one package name per line.
        packages_dir: Root directory that receives the downloads.

    Relies on the module-level ``oss_download_path`` for the CLI location.
    """
    with open(packages_list_path) as file:
        for line in file:
            package = line.strip()
            # A blank line is skipped rather than ending the whole run.
            if not package:
                continue

            metadata = subprocess.run(
                [oss_download_path, "--download-metadata-only", f"pkg:pypi/{package}"],
                capture_output=True,
                text=True,
            )
            version = _parse_release_version(metadata.stderr)
            print(package, version)
            dst_dir = Path(packages_dir, package, version)

            # Only fresh downloads: an existing release directory is left alone.
            if dst_dir.exists():
                continue
            dst_dir.mkdir(parents=True, exist_ok=True)

            subprocess.run([oss_download_path, "--download-directory", dst_dir, f"pkg:pypi/{package}"])

            artifacts = glob.glob(f"{dst_dir}/*")

            # Prioritize source tarballs over distributions as they have setup.py files
            _extract_preferred_artifact(artifacts, dst_dir)

            # Clean up the downloaded directory to only keep the uncompressed directory.
            # NOTE(review): this also deletes regular files that an archive
            # extracted directly into dst_dir.
            for item in os.listdir(dst_dir):
                left_over_file = os.path.join(dst_dir, item)
                if os.path.isfile(left_over_file):
                    os.remove(left_over_file)
