In [11]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a "data" folder.
# The parent of the filesystem root is the root itself, so an unbounded
# `while` loop would spin forever when no ancestor has a "data" folder.
# Walking the finite list of ancestors and failing loudly avoids that hang.
_start_dir = Path.cwd()
for PROJECT_ROOT in (_start_dir, *_start_dir.parents):
    if (PROJECT_ROOT / "data").exists():
        break
else:
    raise FileNotFoundError(
        f"No 'data' directory found in {_start_dir} or any of its parents"
    )

# Load the processed NVD table and show its (rows, columns) shape.
df = pd.read_parquet(PROJECT_ROOT / "data" / "processed" / "nvd_full.parquet")
df.shape


(326099, 12)

In [12]:
# Parse the publication timestamps as timezone-aware UTC values.
df["published"] = pd.to_datetime(df["published"], utc=True)

# Timestamp.utcnow() is deprecated in recent pandas releases;
# Timestamp.now(tz="UTC") yields the same timezone-aware current time.
# NOTE(review): the ages depend on the wall-clock time of the run, so the
# saved features differ between runs — consider pinning a reference date.
now = pd.Timestamp.now(tz="UTC")

# Whole days elapsed since publication; clip so that entries dated after
# `now` get an age of 0 instead of a negative value.
df["vuln_age_days"] = (now - df["published"]).dt.days.clip(lower=0)

# log1p(0) == 0, so an age of zero days stays at 0 after the transform.
df["log_vuln_age"] = np.log1p(df["vuln_age_days"])


In [13]:
# One-hot encode the CVSS severity label: one indicator column per distinct
# non-null label, each named with the "cvss_sev_" prefix.
severity_labels = df["cvss_severity"]
cvss_features = pd.get_dummies(severity_labels, prefix="cvss_sev")


In [14]:
# Exact CWE ids for each coarse group. Tuple order is the precedence used
# when one entry lists several CWEs (XSS, then SQLi, then MEMORY, then AUTH).
_CWE_GROUPS = (
    ("XSS", frozenset({79})),
    ("SQLi", frozenset({89})),
    ("MEMORY", frozenset({119, 120})),
    ("AUTH", frozenset({284})),
)
_CWE_ID_PATTERN = re.compile(r"\d+")


def cwe_group(cwe):
    """Map a raw CWE entry to a coarse weakness group.

    Every run of digits in ``cwe`` is read as one CWE id and compared
    *exactly* against the ids of each group. Plain substring checks would
    misfile unrelated weaknesses, e.g. ``"89" in "CWE-189"`` or
    ``"284" in "CWE-1284"``.

    Assumes ids appear as decimal numbers such as ``"CWE-79"``, possibly
    several per entry — TODO confirm against the upstream column format.

    Parameters
    ----------
    cwe : object
        Raw value from the ``cwe`` column.

    Returns
    -------
    str
        ``"UNKNOWN"`` when ``cwe`` is not a string (e.g. NaN/None), one of
        ``"XSS"``, ``"SQLi"``, ``"MEMORY"``, ``"AUTH"`` when a matching id is
        present, otherwise ``"OTHER"``.
    """
    if not isinstance(cwe, str):
        return "UNKNOWN"

    # Compare as integers so "CWE-079" and "CWE-79" are treated alike.
    found_ids = {int(token) for token in _CWE_ID_PATTERN.findall(cwe)}
    for group_name, group_ids in _CWE_GROUPS:
        if found_ids & group_ids:
            return group_name
    return "OTHER"

# Bucket each row's CWE entry into its coarse group, then one-hot encode the
# buckets into indicator columns named with the "cwe_" prefix.
df["cwe_group"] = df["cwe"].apply(cwe_group)
cwe_features = pd.get_dummies(df["cwe_group"], prefix="cwe")


In [15]:
# Assemble the feature matrix column-wise: the log-scaled age first, then the
# severity indicators, then the CWE-group indicators.
feature_blocks = [df[["log_vuln_age"]], cvss_features, cwe_features]
X = pd.concat(feature_blocks, axis=1)


In [16]:
# Give features and labels the same fresh 0..n-1 index so rows line up by
# position once they are stored in separate files.
X = X.reset_index(drop=True)

# Identifier and target-related columns kept alongside the features.
label_columns = ["cve_id", "year", "cvss", "has_exploit_ref"]
labels_df = df[label_columns].reset_index(drop=True)


In [17]:
# Both artefacts are written to the processed-data folder of the project.
processed_dir = PROJECT_ROOT / "data" / "processed"
FEATURE_FILE = processed_dir / "features.parquet"
LABEL_FILE = processed_dir / "labels.parquet"

# Persist the feature matrix and the label table as Parquet files.
for frame, target in ((X, FEATURE_FILE), (labels_df, LABEL_FILE)):
    frame.to_parquet(target)

# Show where the files were written.
FEATURE_FILE, LABEL_FILE


(WindowsPath('d:/MRINAL/CYDL/data/processed/features.parquet'),
 WindowsPath('d:/MRINAL/CYDL/data/processed/labels.parquet'))

In [18]:
# Only a cell's last expression is rendered, so a bare `X.shape` in the
# middle of the cell is silently discarded; display it explicitly.
display(X.shape)

# Final feature column names, in matrix order.
X.columns.to_list()

['log_vuln_age',
 'cvss_sev_CRITICAL',
 'cvss_sev_HIGH',
 'cvss_sev_LOW',
 'cvss_sev_MEDIUM',
 'cvss_sev_NONE',
 'cwe_AUTH',
 'cwe_MEMORY',
 'cwe_OTHER',
 'cwe_SQLi',
 'cwe_XSS']