## Data Fetching using NVD URL

In [None]:
import pandas as pd
import requests
import zipfile
import os
from pandas import json_normalize

In [None]:
def extract_cve_data(row):
    """Flatten one NVD ``CVE_Items`` entry into a flat record.

    Args:
        row: Mapping for a single CVE item. ``row['cve']`` must contain
            ``CVE_data_meta`` and ``description``; ``impact``,
            ``references`` and ``publishedDate`` are optional.

    Returns:
        dict with the keys ``id``, ``affectedProduct``, ``description``,
        ``cvssScore``, ``availabilityImpact``, ``confidentialityImpact``,
        ``integrityImpact``, ``accessComplexity``,
        ``authenticationRequired``, ``references`` and ``publishedDate``.
        Any value that is absent from ``row`` is ``None`` (``references``
        is an empty list).

    Raises:
        KeyError: if ``cve``, ``CVE_data_meta`` or ``description`` is
            missing from ``row``.
    """
    cve = row['cve']
    meta = cve['CVE_data_meta']

    # "affectedProduct" is derived from the assigner's e-mail address: the
    # first label of the domain part (e.g. "security@example.com" -> "example").
    assigner = meta.get('ASSIGNER')
    if assigner and '@' in assigner:
        affected_product = assigner.split('@')[-1].split('.')[0]
    else:
        affected_product = None

    cve_id = meta.get('ID')

    # Use the first English description, if there is one.
    description = next(
        (
            entry.get('value')
            for entry in cve['description'].get('description_data', [])
            if entry.get('lang') == 'en'
        ),
        None,
    )

    # Pick the first metric block that carries a base score. The keys look
    # like "baseMetricV3", and the nested block is named after the trailing
    # version digit ("cvssV3").
    cvss = {}
    for metric_name, metric in row.get('impact', {}).items():
        candidate = metric.get('cvssV' + metric_name[-1], {})
        if candidate.get('baseScore') is not None:
            cvss = candidate
            break

    # CVSS v2 and v3 name these two fields differently
    # (accessComplexity/authentication vs attackComplexity/privilegesRequired),
    # so fall back to the other version's name instead of returning None.
    access_complexity = cvss.get('accessComplexity', cvss.get('attackComplexity'))
    authentication_required = cvss.get('privilegesRequired', cvss.get('authentication'))

    references = [
        ref.get('url')
        for ref in cve.get('references', {}).get('reference_data', [])
    ]

    # Keep only the date part (YYYY-MM-DD). Default to None so that a row
    # without a publication date no longer raises UnboundLocalError.
    published = row.get('publishedDate')
    published_date = published[0:10] if published else None

    return {
        'id': cve_id,
        'affectedProduct': affected_product,
        'description': description,
        'cvssScore': cvss.get('baseScore'),
        'availabilityImpact': cvss.get('availabilityImpact'),
        'confidentialityImpact': cvss.get('confidentialityImpact'),
        'integrityImpact': cvss.get('integrityImpact'),
        'accessComplexity': access_complexity,
        'authenticationRequired': authentication_required,
        'references': references,
        'publishedDate': published_date,
    }


In [None]:
# Module-level accumulator: one flattened record (dict) per CVE item.
cve_data: list = []

In [None]:
# Function to download and extract CVE data for a specific year
def download_cve_data(year):
    """Download one yearly NVD JSON 1.1 feed and append its records to ``cve_data``.

    The ZIP archive is saved to the current directory and extracted there;
    the extracted JSON file is then parsed and every entry of ``CVE_Items``
    is flattened with ``extract_cve_data``.

    Args:
        year: Feed year, interpolated into the feed file name.

    Returns:
        pandas.DataFrame: the records extracted for ``year``, or an empty
        DataFrame when the download fails or the archive is not a valid ZIP.
        (Previously the success path returned ``None``.)
    """
    # Local import so this function does not depend on a later cell having run.
    import json

    CVE_FEED = f'nvdcve-1.1-{year}.json'
    CVE_URL = f'https://nvd.nist.gov/feeds/json/cve/1.1/{CVE_FEED}.zip'

    # Download the CVE feed. Without a timeout, requests can wait forever on
    # a stalled connection; network errors are reported like any other failure.
    try:
        response = requests.get(CVE_URL, timeout=60)
    except requests.RequestException as exc:
        print(f"Failed to download data for year {year}: {exc}")
        return pd.DataFrame()  # Return an empty DataFrame on failure

    # Check if the download was successful
    if response.status_code != 200:
        print(f"Failed to download data for year {year}. Status code: {response.status_code}")
        return pd.DataFrame()  # Return an empty DataFrame on failure

    # Save the ZIP file locally
    zip_file_path = f'{CVE_FEED}.zip'
    with open(zip_file_path, 'wb') as f:
        f.write(response.content)

    # Validate that the downloaded file is indeed a ZIP file
    try:
        # Unzip the file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall()  # Extract to current directory
    except zipfile.BadZipFile:
        print(f"Downloaded file for year {year} is not a valid ZIP file.")
        os.remove(zip_file_path)  # Clean up
        return pd.DataFrame()  # Return an empty DataFrame on failure

    # Load the JSON data with the json module: the feed is a nested document,
    # not a table, so there is no need to route it through a DataFrame.
    with open(CVE_FEED, encoding='utf-8') as feed_file:
        feed = json.load(feed_file)

    records = [extract_cve_data(item) for item in feed['CVE_Items']]
    cve_data.extend(records)
    return pd.DataFrame(records)

In [None]:
# The yearly feeds start at 2002; requesting 1999-2001 only produces 404s.
for year in range(2002, 2024):
    download_cve_data(year)

Failed to download data for year 1999. Status code: 404
Failed to download data for year 2000. Status code: 404
Failed to download data for year 2001. Status code: 404


In [None]:
import json

# Persist every collected record as pretty-printed JSON.
with open("initial_data.json", "w") as out_file:
    json.dump(cve_data, fp=out_file, indent=4)