In [None]:
# Reference (Red Hat blog post on querying container-image security data through the Pyxis API): https://www.redhat.com/en/blog/gathering-security-data-container-images-using-pyxis-api

In [None]:
import pandas as pd

# Load the spreadsheet of known-affected packages into a DataFrame.
# Any dataset with a 'package' column can be used here; this one was chosen
# because it lacked the image SHAs of its known_affected packages.
df = pd.read_excel('known_affected.xlsx')

# Distinct package names, in the order they first appear in the sheet.
# Blank cells are read as NaN; they are dropped so that a missing name is not
# later interpolated into a repository URL as the literal text "nan".
distinct_packages = df['package'].dropna().unique().tolist()

In [None]:
# Base URL that every Pyxis (Red Hat container catalog) request in this
# notebook is built from.
PYXIS_URL = "https://catalog.redhat.com/api/containers/v1"

In [None]:
import requests

# Seconds to wait on any single Pyxis call. requests applies no timeout by
# default, so without this one stalled connection would hang the whole crawl.
REQUEST_TIMEOUT = 30

# Parallel result lists: position i in each list describes one
# (package, CVE, image) row.
packages = []
cves = []
shas = []

# One session for the whole crawl so the HTTP connection is reused across calls.
with requests.Session() as session:
    for package in distinct_packages:
        # NOTE(review): only the first page (page_size=500) of images is
        # requested; a repository with more images would be truncated.
        # Confirm whether paging through the results is needed.
        endpoint = f"{PYXIS_URL}/repositories/registry/registry.access.redhat.com/repository/{package}/images?page_size=500&sort_by=creation_date[desc]"
        try:
            r = session.get(endpoint, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            images = r.json().get('data') or []
        except requests.exceptions.RequestException as e:
            # A missing or unreachable repository must not abort the
            # remaining packages.
            print(f"Request failed for package {package}: {e}")
            continue
        except ValueError as e:
            print(f"Invalid JSON response for package {package}: {e}")
            continue

        for image in images:
            # Resolve the id before the try block so the handlers below can
            # never report an unbound or stale id from a previous image.
            image_id = image.get("_id")
            if image_id is None:
                print(f"Missing key in response for package {package}: '_id'")
                continue

            vul_url = f"{PYXIS_URL}/images/id/{image_id}/vulnerabilities"
            try:
                response = session.get(vul_url, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()  # Raise an exception for HTTP errors
                vulnerabilities = response.json().get('data') or []
            except requests.exceptions.RequestException as e:
                print(f"Request failed for image {image_id}: {e}")
                continue
            except ValueError as e:
                print(f"Invalid JSON response for image {image_id}: {e}")
                continue

            # The image identifier is the same for every vulnerability of this
            # image, so look it up once. Defaults to 'N/A' when absent.
            sha = image.get('docker_image_id', 'N/A')
            for vulnerability in vulnerabilities:
                packages.append(package)
                cves.append(vulnerability.get('cve_id', 'N/A'))  # 'N/A' if 'cve_id' is missing
                shas.append(sha)

# Cell output: the three lists must always have the same length.
len(packages), len(cves), len(shas)

In [9]:
from pandas import DataFrame

# First export file: rows 0 through 799,999 of the collected results.
packages1, cves1, shas1 = (column[:800000] for column in (packages, cves, shas))

df = DataFrame({
    'CVE': cves1,
    'package': packages1,
    'tag': shas1,
})
df.to_excel('rpm_package_vulnerabilities_1.xlsx', sheet_name='sheet1', index=False)

In [10]:
from pandas import DataFrame

# Second export file: rows 800,000 through 1,599,999 of the collected results.
packages2, cves2, shas2 = (column[800000:1600000] for column in (packages, cves, shas))

df = DataFrame({
    'CVE': cves2,
    'package': packages2,
    'tag': shas2,
})
df.to_excel('rpm_package_vulnerabilities_2.xlsx', sheet_name='sheet1', index=False)

In [11]:
from pandas import DataFrame

# Third export file: rows 1,600,000 through 2,399,999 of the collected results.
# Slice ends are exclusive, so this chunk must start at 1600000 (where the
# previous chunk stopped); starting at 1600001 silently dropped one row.
packages3, cves3, shas3 = (column[1600000:2400000] for column in (packages, cves, shas))

df = DataFrame({
    'CVE': cves3,
    'package': packages3,
    'tag': shas3,
})
df.to_excel('rpm_package_vulnerabilities_3.xlsx', sheet_name='sheet1', index=False)

In [12]:
from pandas import DataFrame

# Fourth export file: rows 2,400,000 through 3,199,999 of the collected results.
# Slice ends are exclusive, so this chunk must start at 2400000 (where the
# previous chunk stopped); starting at 2400001 silently dropped one row.
packages4, cves4, shas4 = (column[2400000:3200000] for column in (packages, cves, shas))

df = DataFrame({
    'CVE': cves4,
    'package': packages4,
    'tag': shas4,
})
df.to_excel('rpm_package_vulnerabilities_4.xlsx', sheet_name='sheet1', index=False)

In [13]:
from pandas import DataFrame

# Fifth export file: everything from row 3,200,000 onward.
# Slice ends are exclusive, so this chunk must start at 3200000 (where the
# previous chunk stopped); starting at 3200001 silently dropped one row.
# NOTE(review): this slice is open-ended and goes to a single sheet; an .xlsx
# sheet holds at most 1,048,576 rows, so confirm the remainder fits.
packages5, cves5, shas5 = (column[3200000:] for column in (packages, cves, shas))

df = DataFrame({
    'CVE': cves5,
    'package': packages5,
    'tag': shas5,
})
df.to_excel('rpm_package_vulnerabilities_5.xlsx', sheet_name='sheet1', index=False)