In [1]:
import json
from pathlib import Path
import pandas as pd
from datetime import datetime


In [3]:
# Project root discovery: start at the current working directory and walk
# upward until a directory containing a "data" entry is found.
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").exists():
    # The parent of the filesystem root is the root itself, so without this
    # guard a missing "data" directory would make the loop spin forever.
    if PROJECT_ROOT.parent == PROJECT_ROOT:
        raise FileNotFoundError(
            f"No 'data' directory found in {Path.cwd()} or any of its parents"
        )
    PROJECT_ROOT = PROJECT_ROOT.parent

# Input (raw NVD JSON feeds) and output (processed tables) locations.
RAW_NVD_DIR = PROJECT_ROOT / "data" / "raw" / "nvd"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# Create the output directory up front so later writes cannot fail on it.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)


In [4]:
def get_english_description(descriptions):
    """Return the "value" of the first description entry whose "lang" is "en".

    Args:
        descriptions: Iterable of dict-like entries, or None / empty.

    Returns:
        The "value" of the first English entry (None if that entry has no
        "value" key), or None when there is no English entry at all.
    """
    english_entries = (
        entry for entry in (descriptions or []) if entry.get("lang") == "en"
    )
    first_english = next(english_entries, None)
    if first_english is None:
        return None
    return first_english.get("value")


In [5]:
def extract_cvss(cve):
    """Extract CVSS v3.x base metrics from a CVE record.

    CVSS v3.1 is preferred over v3.0. For the first version that has a usable
    entry, the first element of its metric list is used.

    Args:
        cve: Dict-like CVE record; its "metrics" entry maps metric keys
            ("cvssMetricV31", "cvssMetricV30") to lists of metric dicts.

    Returns:
        A 4-tuple ``(base_score, vector_string, base_severity, version)``
        where ``version`` is "V31" or "V30". All four items are None when
        neither version provides an entry with "cvssData".
    """
    # `or {}` also covers a record whose "metrics" is present but null.
    metrics = cve.get("metrics") or {}

    # Preference order: v3.1 first, then v3.0; otherwise the all-None result.
    for key in ("cvssMetricV31", "cvssMetricV30"):
        entries = metrics.get(key)
        if not entries:
            # Key missing, null, or an empty list: indexing [0] would raise,
            # so move on to the next version instead.
            continue

        cvss_data = entries[0].get("cvssData")
        if not cvss_data:
            # Entry without score data is unusable; try the next version.
            continue

        return (
            cvss_data.get("baseScore"),
            cvss_data.get("vectorString"),
            cvss_data.get("baseSeverity"),
            key.replace("cvssMetric", ""),
        )

    return None, None, None, None


In [6]:
def extract_cwe(cve):
    """Return the first English weakness description value of a CVE record.

    Walks ``cve["weaknesses"]`` in order and, within each weakness, its
    "description" list; the "value" of the first entry whose "lang" is "en"
    is returned.

    Returns:
        That entry's "value" (None if the entry lacks the key), or the string
        "UNKNOWN" when no English entry exists.
    """
    english_values = (
        desc.get("value")
        for weakness in cve.get("weaknesses", [])
        for desc in weakness.get("description", [])
        if desc.get("lang") == "en"
    )
    # The default is only used when the generator yields nothing at all.
    return next(english_values, "UNKNOWN")


In [7]:
def has_exploit_reference(cve):
    """Tell whether any reference of the CVE record carries an exploit tag.

    A reference counts when its "tags" collection contains "Exploit" or
    "exploit"; references without a "tags" key are treated as untagged.

    Returns:
        True if at least one reference is tagged as an exploit, else False.
    """
    tag_lists = (ref.get("tags", []) for ref in cve.get("references", []))
    return any("Exploit" in tags or "exploit" in tags for tags in tag_lists)


In [8]:
# Sanity check: load a single feed file and count its CVE records.
# min() picks the first path in sort order, so the sample does not depend on
# filesystem enumeration order; the default avoids a bare StopIteration when
# the directory holds no matching feed files.
sample_file = min(RAW_NVD_DIR.glob("nvdcve-2.0-*.json"), default=None)
if sample_file is None:
    raise FileNotFoundError(
        f"No files matching 'nvdcve-2.0-*.json' found in {RAW_NVD_DIR}"
    )

with open(sample_file, "r", encoding="utf-8") as f:
    sample_data = json.load(f)

len(sample_data["vulnerabilities"])


6770

In [9]:
# Flatten every CVE from every yearly feed file into one list of row dicts.
rows = []

for file in sorted(RAW_NVD_DIR.glob("nvdcve-2.0-*.json")):
    # The last "-"-separated token of the file stem is used as the feed year.
    year_token = file.stem.split("-")[-1]
    if not year_token.isdecimal():
        # The glob also matches non-year suffixes (e.g. a "-modified" or
        # "-recent" feed); int() would raise ValueError on those, so skip
        # them and say so instead of aborting the whole run.
        print(f"Skipping non-yearly feed file: {file.name}")
        continue
    year = int(year_token)

    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in data.get("vulnerabilities", []):
        cve = item.get("cve", {})

        cvss, vector, severity, cvss_ver = extract_cvss(cve)

        rows.append({
            # Identity
            "cve_id": cve.get("id"),

            # Time
            "published": cve.get("published"),
            "last_modified": cve.get("lastModified"),
            "year": year,

            # Descriptive
            "description": get_english_description(
                cve.get("descriptions", [])
            ),

            # Severity signals (NOT final decision)
            "cvss": cvss,
            "cvss_vector": vector,
            "cvss_severity": severity,
            "cvss_version": cvss_ver,

            # Taxonomy
            "cwe": extract_cwe(cve),

            # Exploit signal
            "has_exploit_ref": has_exploit_reference(cve),

            # Provenance (EXTENSIBILITY KEY)
            "source_dataset": "NVD"
        })


In [10]:
# Build the table from the collected row dicts and parse both timestamp
# columns; values that cannot be parsed become NaT (errors="coerce").
df = pd.DataFrame(rows).assign(
    published=lambda frame: pd.to_datetime(frame["published"], errors="coerce"),
    last_modified=lambda frame: pd.to_datetime(
        frame["last_modified"], errors="coerce"
    ),
)

# (row count, column count) of the assembled table
df.shape


(326099, 12)

In [12]:
# Persist the assembled table as Parquet in the processed-data directory;
# index=False keeps the DataFrame index out of the file.
OUTPUT_FILE = PROCESSED_DIR.joinpath("nvd_full.parquet")
df.to_parquet(OUTPUT_FILE, index=False)

# Display the destination path as the cell output.
OUTPUT_FILE


WindowsPath('d:/MRINAL/CYDL/data/processed/nvd_full.parquet')