In [17]:
import pandas as pd
import json
import os
import requests
import gzip
import shutil
import xml.etree.ElementTree as ET
import csv

In [16]:
def download_and_extract_cve_data(start_year=2002, end_year=2024, base_url="https://nvd.nist.gov/feeds/json/cve/1.1/", timeout=60):
    """Download the yearly CVE JSON feeds and decompress each one.

    For every year in ``[start_year, end_year]`` the archive
    ``{base_url}nvdcve-1.1-{year}.json.gz`` is saved to
    ``../data/CVEs/{year}/`` and then unpacked next to it as
    ``nvdcve-1.1-{year}.json``.

    Args:
        start_year: First year to fetch (inclusive).
        end_year: Last year to fetch (inclusive).
        base_url: URL prefix the per-year file name is appended to.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the cell forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Nothing is written for that year in this case, so an HTML error
            page is never mistaken for a gzip archive.
    """
    for year in range(start_year, end_year + 1):
        # Construct the download URL and local names for this year
        url = f"{base_url}nvdcve-1.1-{year}.json.gz"
        folder_name = f"../data/CVEs/{str(year)}"
        file_name = f"nvdcve-1.1-{year}.json.gz"
        output_json = f"nvdcve-1.1-{year}.json"

        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(folder_name, exist_ok=True)

        # Download the file
        print(f"Downloading {file_name}...")
        gz_path = os.path.join(folder_name, file_name)
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly before touching the disk if the feed is missing.
            response.raise_for_status()
            # Write chunk by chunk so the whole archive is never held in memory.
            with open(gz_path, 'wb') as gz_file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    gz_file.write(chunk)

        # Unzip the file
        print(f"Extracting {file_name}...")
        json_path = os.path.join(folder_name, output_json)
        with gzip.open(gz_path, 'rb') as f_in, open(json_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        print(f"Extracted {output_json} to folder {folder_name}.\n")

    print("Download and extraction completed.")

# Example usage: with the default arguments this fetches one feed per year
# for 2002-2024 and unpacks each into ../data/CVEs/<year>/.
download_and_extract_cve_data()

Downloading nvdcve-1.1-2002.json.gz...
Extracting nvdcve-1.1-2002.json.gz...
Extracted nvdcve-1.1-2002.json to folder ../data/CVEs/2002.

Downloading nvdcve-1.1-2003.json.gz...
Extracting nvdcve-1.1-2003.json.gz...
Extracted nvdcve-1.1-2003.json to folder ../data/CVEs/2003.

Downloading nvdcve-1.1-2004.json.gz...
Extracting nvdcve-1.1-2004.json.gz...
Extracted nvdcve-1.1-2004.json to folder ../data/CVEs/2004.

Downloading nvdcve-1.1-2005.json.gz...
Extracting nvdcve-1.1-2005.json.gz...
Extracted nvdcve-1.1-2005.json to folder ../data/CVEs/2005.

Downloading nvdcve-1.1-2006.json.gz...
Extracting nvdcve-1.1-2006.json.gz...
Extracted nvdcve-1.1-2006.json to folder ../data/CVEs/2006.

Downloading nvdcve-1.1-2007.json.gz...
Extracting nvdcve-1.1-2007.json.gz...
Extracted nvdcve-1.1-2007.json to folder ../data/CVEs/2007.

Downloading nvdcve-1.1-2008.json.gz...
Extracting nvdcve-1.1-2008.json.gz...
Extracted nvdcve-1.1-2008.json to folder ../data/CVEs/2008.

Downloading nvdcve-1.1-2009.json.g

In [30]:

def process_cpe_data(xml_file, output_csv):
    """Flatten a CPE dictionary XML file into a CSV with one row per cpe-item.

    Args:
        xml_file: Path to the CPE dictionary XML (elements in the
            ``http://cpe.mitre.org/dictionary/2.0`` namespace).
        output_csv: Path of the CSV to create (overwritten if it exists).

    The CSV columns are ``cpe_name, title, notes, references, deprecated,
    deprecation_date, release_date``. Multiple titles / notes / reference
    hrefs of one item are joined with ``', '``. ``release_date`` is the text
    of ``generator/timestamp`` and is ``'N/A'`` when either element is
    missing.
    """
    # Every element lookup uses the same namespace; spell it once.
    ns = "{http://cpe.mitre.org/dictionary/2.0}"

    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Extract the timestamp from the "generator" section. Both the generator
    # and its timestamp child are optional here, so a file without a
    # timestamp no longer raises AttributeError.
    release_date = "N/A"
    generator = root.find(f".//{ns}generator")
    if generator is not None:
        timestamp = generator.find(f"{ns}timestamp")
        if timestamp is not None:
            release_date = timestamp.text

    # Open the CSV file for writing
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['cpe_name', 'title', 'notes', 'references', 'deprecated', 'deprecation_date', 'release_date']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        # Write the header row
        writer.writeheader()

        # Iterate over each "cpe-item" element in the XML
        for cpe_item in root.findall(f".//{ns}cpe-item"):
            # There can be several titles (e.g. one per language); keep all non-empty ones
            title_texts = [title.text for title in cpe_item.findall(f".//{ns}title") if title.text]
            notes_texts = [note.text for note in cpe_item.findall(f".//{ns}notes") if note.text]
            # References are represented by their href attribute
            reference_texts = [ref.attrib.get('href', 'N/A') for ref in cpe_item.findall(f".//{ns}reference")]

            # Write the data to CSV
            writer.writerow({
                'cpe_name': cpe_item.attrib.get('name', 'N/A'),
                'title': ', '.join(title_texts),
                'notes': ', '.join(notes_texts),
                'references': ', '.join(reference_texts),
                # Items without the attribute are treated as not deprecated
                'deprecated': cpe_item.attrib.get('deprecated', 'false'),
                'deprecation_date': cpe_item.attrib.get('deprecation_date', 'N/A'),
                'release_date': release_date
            })

    print(f"Data successfully written to {output_csv}")

# Example usage: flatten the CPE dictionary XML into a CSV (one row per cpe-item).
process_cpe_data('../data/CPEs/official-cpe-dictionary_v2.3.xml', '../data/CPEs/cpe_data.csv')


Data successfully written to ../data/CPEs/cpe_data.csv


In [26]:
import os
import json
import csv

def _iter_cpe_matches(nodes):
    """Yield every ``cpe_match`` entry under ``nodes``, descending into ``children``."""
    for node in nodes:
        yield from node.get('cpe_match', [])
        # Configuration nodes can nest further nodes; their matches used to be skipped.
        yield from _iter_cpe_matches(node.get('children', []))


def process_cve_data_per_json(json_root_dir, output_dir):
    """Convert every ``*.json`` CVE feed under a directory tree to a CSV.

    Args:
        json_root_dir: Directory walked recursively for files ending in
            ``.json``.
        output_dir: Directory receiving one ``<json basename>.csv`` per input
            file (created if missing). Files with the same basename in
            different sub-folders overwrite each other.

    Each CSV has one row per (CVE, CPE match) pair with the columns
    ``CVE_ID, Timestamp, CPE_URI, Version_Start, Version_End, CWE,
    Impact_BaseScore, Impact_Severity, Published_Date, Last_Modified_Date``.
    CVE items without any CPE match produce no rows. Missing optional values
    are written as ``'N/A'`` (``'None'`` for an empty CWE list).

    NOTE(review): only ``versionStartIncluding`` / ``versionEndIncluding`` are
    exported; if the feed also carries exclusive bounds they are not
    represented in the CSV - confirm whether that is intended.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    fieldnames = ['CVE_ID', 'Timestamp', 'CPE_URI', 'Version_Start', 'Version_End', 'CWE', 'Impact_BaseScore', 'Impact_Severity', 'Published_Date', 'Last_Modified_Date']

    # Walk through the directory structure
    for root, dirs, files in os.walk(json_root_dir):
        for file in files:
            if not file.endswith(".json"):
                continue

            json_file_path = os.path.join(root, file)
            print(f"Processing {json_file_path}...")

            # Create a CSV filename based on the JSON file name
            csv_filename = f"{os.path.splitext(file)[0]}.csv"
            csv_file_path = os.path.join(output_dir, csv_filename)

            # Parse the JSON first so a broken input does not leave behind a
            # truncated CSV. Read as UTF-8 explicitly instead of relying on
            # the platform default encoding.
            with open(json_file_path, 'r', encoding='utf-8') as f:
                cve_data = json.load(f)

            with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

                # Write the header row
                writer.writeheader()

                # Extract CVE data from each "CVE_Items" entry
                for item in cve_data.get('CVE_Items', []):
                    cve_id = item['cve']['CVE_data_meta']['ID']
                    timestamp = cve_data['CVE_data_timestamp']

                    # Extract CWE(s) from the problemtype section
                    cwes = [
                        desc.get('value', 'N/A')
                        for problemtype_data in item['cve']['problemtype']['problemtype_data']
                        for desc in problemtype_data.get('description', [])
                    ]
                    cwe_list = ', '.join(cwes) if cwes else 'None'

                    # Impact and dates are per CVE, not per CPE: compute them once
                    impact_v3 = item.get('impact', {}).get('baseMetricV3', {})
                    cvss_v3 = impact_v3.get('cvssV3', {}) if impact_v3 else {}
                    base_score = cvss_v3.get('baseScore', 'N/A')
                    severity = cvss_v3.get('baseSeverity', 'N/A')
                    published_date = item.get('publishedDate', 'N/A')
                    last_modified_date = item.get('lastModifiedDate', 'N/A')

                    # Write each matched CPE (including nested ones) as a separate row
                    nodes = item.get('configurations', {}).get('nodes', [])
                    for cpe_match in _iter_cpe_matches(nodes):
                        writer.writerow({
                            'CVE_ID': cve_id,
                            'Timestamp': timestamp,
                            'CPE_URI': cpe_match.get('cpe23Uri', 'N/A'),
                            'Version_Start': cpe_match.get('versionStartIncluding', 'N/A'),
                            'Version_End': cpe_match.get('versionEndIncluding', 'N/A'),
                            'CWE': cwe_list,
                            'Impact_BaseScore': base_score,
                            'Impact_Severity': severity,
                            'Published_Date': published_date,
                            'Last_Modified_Date': last_modified_date
                        })

            print(f"Data for {json_file_path} written to {csv_file_path}")



In [2]:
def preview_csv(file_path):
    """Print the column index of a CSV and display its first three rows.

    Only the first ``chunk_size`` rows are parsed, so large files are not
    loaded in full. ``display`` is the IPython/Jupyter built-in, so this
    helper is meant to be called from a notebook.

    Args:
        file_path: Path of the CSV file to inspect.
    """
    chunk_size = 1000  # Upper bound on parsed rows; adjust based on memory availability

    # nrows reads the same leading rows as taking the first chunk of a
    # chunked reader, but leaves no open reader/file handle behind.
    df = pd.read_csv(file_path, nrows=chunk_size)

    # Display the column names and first three rows
    print("Columns:", df.columns)
    print("\nFirst 3 rows:")
    display(df.head(3))


In [10]:
import json

def preview_cwe_json(file_path):
    """Print a short preview of the first three entries of a CWE JSON file.

    The file is expected to contain a non-empty JSON list of objects. For
    each of the first three entries the ``CWE-ID``, ``Name`` and
    ``Description`` keys are printed, followed by the ``Scope`` / ``Impacts``
    of every item in ``Common_Consequences`` and the ``Nature`` / ``CWE_ID``
    of every item in ``Related_Weaknesses``. Missing keys print as ``N/A``.

    Args:
        file_path: Path of the JSON file to preview.

    Invalid JSON and non-list / empty payloads are reported with a printed
    message instead of an exception.
    """
    # Read as UTF-8 explicitly rather than with the platform default
    # encoding, which can fail on non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            data = json.load(file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return

    # Check if the data is a list of CWEs
    if not (isinstance(data, list) and len(data) > 0):
        print("The JSON data is not a list or is empty.")
        return

    # Preview the first three entries
    print("Previewing the first 3 CWE entries:")
    for entry in data[:3]:
        print(f"\nCWE-ID: {entry.get('CWE-ID', 'N/A')}")
        print(f"Name: {entry.get('Name', 'N/A')}")
        print(f"Description: {entry.get('Description', 'N/A')}")
        print("Common Consequences:")
        for consequence in entry.get('Common_Consequences', []):
            print(f"  Scope: {consequence.get('Scope', 'N/A')}")
            print(f"  Impacts: {', '.join(consequence.get('Impacts', []))}")
        print("Related Weaknesses:")
        for weakness in entry.get('Related_Weaknesses', []):
            print(f"  Nature: {weakness.get('Nature', 'N/A')}")
            print(f"  CWE_ID: {weakness.get('CWE_ID', 'N/A')}")


In [6]:
# Paths of the data files inspected in the cells below (relative to the notebook's working directory).
# cpe_data is the same path process_cpe_data is asked to write in its example call.
cpe_data = '../data/CPEs/cpe_data.csv'
cve_data = '../data/CVEs/cveData.json'
cwe_data = '../data/CWEs/cweData.json'

In [27]:
# Disabled call kept for reference; process_cve_data is not defined in the cells visible here.
# process_cve_data('../data/CVEs/', '../data/CVEs/cve_data_output.csv')
# Write one CSV per *.json file found under ../data/CVEs/ into ../data/CVEs/CSVs.
process_cve_data_per_json('../data/CVEs/', '../data/CVEs/CSVs')

Processing ../data/CVEs/nvdcve-1.1-modified.json...
Data for ../data/CVEs/nvdcve-1.1-modified.json written to ../data/CVEs/CSVs/nvdcve-1.1-modified.csv
Processing ../data/CVEs/nvdcve-1.1-recent.json...
Data for ../data/CVEs/nvdcve-1.1-recent.json written to ../data/CVEs/CSVs/nvdcve-1.1-recent.csv
Processing ../data/CVEs/2009/nvdcve-1.1-2009.json...
Data for ../data/CVEs/2009/nvdcve-1.1-2009.json written to ../data/CVEs/CSVs/nvdcve-1.1-2009.csv
Processing ../data/CVEs/2022/nvdcve-1.1-2022.json...
Data for ../data/CVEs/2022/nvdcve-1.1-2022.json written to ../data/CVEs/CSVs/nvdcve-1.1-2022.csv
Processing ../data/CVEs/2010/nvdcve-1.1-2010.json...
Data for ../data/CVEs/2010/nvdcve-1.1-2010.json written to ../data/CVEs/CSVs/nvdcve-1.1-2010.csv
Processing ../data/CVEs/2020/nvdcve-1.1-2020.json...
Data for ../data/CVEs/2020/nvdcve-1.1-2020.json written to ../data/CVEs/CSVs/nvdcve-1.1-2020.csv
Processing ../data/CVEs/2011/nvdcve-1.1-2011.json...
Data for ../data/CVEs/2011/nvdcve-1.1-2011.json w

In [28]:

# Load the CPE CSV and show its first three rows.
# NOTE(review): the saved output below has an 'Unnamed: 0' column and no 'release_date'
# column, which does not match the header process_cpe_data writes - the output may come
# from an older CSV; re-run to confirm.
df = pd.read_csv(cpe_data)
df.head(3)


Unnamed: 0,cpe_name,title,notes,references,deprecated,deprecation_date
0,cpe:/a:%240.99_kindle_books_project:%240.99_ki...,$0.99 Kindle Books project $0.99 Kindle Books ...,,https://play.google.com/store/apps/details?id=...,False,
1,cpe:/a:%40nubosoftware%2fnode-static_project:%...,@nubosoftware/node-static Project @nubosoftwar...,,https://www.npmjs.com/package/@nubosoftware/no...,False,
2,cpe:/a:%40thi.ng%2fegf_project:%40thi.ng%2fegf...,@thi.ng/egf Project @thi.ng/egf for Node.js,,https://github.com/thi-ng/umbrella/security/ad...,False,


In [29]:
# Number of rows in the CPE table held in `df`.
len(df)

1320456

In [12]:
# Print the first three entries of the CWE JSON file.
preview_cwe_json(cwe_data)

Previewing the first 3 CWE entries:

CWE-ID: 5
Name: J2EE Misconfiguration: Data Transmission Without Encryption
Description: Information sent over a network can be compromised while in transit. An attacker may be able to read or modify the contents if the data are sent in plaintext or are weakly encrypted.The product configuration should ensure that SSL or an encryption mechanism of equivalent strength and vetted reputation is used for all access-controlled pages.
Common Consequences:
  Scope: Confidentiality
  Impacts: Read Application Data
  Scope: Integrity
  Impacts: Modify Application Data
Related Weaknesses:
  Nature: ChildOf
  CWE_ID: 319

CWE-ID: 6
Name: J2EE Misconfiguration: Insufficient Session-ID Length
Description: The J2EE application is configured to use an insufficient session ID length.Session identifiers should be at least 128 bits long to prevent brute-force session guessing. A shorter session identifier leaves the application open to brute-force session guessing at

In [13]:
# NOTE(review): this passes the CVE file to the CWE previewer; the saved output shows it
# ends in the "not a list or is empty" branch - confirm whether a CVE-specific preview
# was intended here.
preview_cwe_json(cve_data)

The JSON data is not a list or is empty.
