In [1]:
# Standard lib
from typing import List, Dict
import os
import sys
from pathlib import Path
from dataclasses import asdict
from datetime import datetime

# 3rd party
import pandas as pd
from pymongo import MongoClient, ASCENDING
from tqdm import tqdm
import multiprocess as mp
import matplotlib.pyplot as plt
import numpy as np

# Local
project_root = Path.cwd().parent
src_path = str(project_root / "src")
if src_path not in sys.path:
    sys.path.append(src_path)

from analysis.remediation import find_image_remediations

# Environment setup
import dotenv
dotenv.load_dotenv(".env", override=True) # Defines MONGO_URI

True

In [2]:
client = MongoClient(os.environ["MONGO_URI"])
db = client["gallery"]

In [9]:
def process_image(image: dict) -> pd.DataFrame:
    query = {
        "registry": image["registry"],
        "repository": image["repository"],
        "tag": image["tag"],
    }
    scans = db["cves"].find(query).sort([("scan_start", ASCENDING)])
    rems = find_image_remediations(scans,
                                   include_preexisting=False,
                                   include_residual=False)
    df = pd.DataFrame(map(asdict, rems))
    for col in ["registry", "repository", "tag"]:
        df[col] = image[col]
    df["labels"] = "".join(image["labels"])
    return df


def fetch_images() -> List[Dict]:
    return list(db["images"].find())


def expand_cve_column(df: pd.DataFrame):
    expanded_df = pd.json_normalize(df["cve"])
    return pd.concat([df.drop(columns=["cve"]), expanded_df], axis=1)


def remediation_time_column(df: pd.DataFrame):
    rtime = (df["remediated_at"] - df["first_seen_at"])
    return rtime.dt.total_seconds() / 3600


# Process images
images = fetch_images()

with mp.Pool(mp.cpu_count()) as pool:
    dfs = list(tqdm(pool.imap_unordered(process_image, images),
                    desc=f"Collecting remediations",
                    total=len(images)))
rems_df = pd.concat(dfs, axis=0, ignore_index=True)

# Expand CVE column
rems_df = expand_cve_column(rems_df)

# Create remediation time column
rems_df["rtime"] = remediation_time_column(rems_df)

Collecting remediations: 100%|██████████| 195/195 [09:26<00:00,  2.91s/it]


In [10]:
# Plot Remediations

def get_label_stats(label: str, df: pd.DataFrame) -> pd.DataFrame:
    label_df = df[df["labels"].str.contains(label)]
    rtime = label_df["rtime"]
    return pd.DataFrame({"label": label,
                         "num-rems": rtime.count(),
                         "rtime-ave": rtime.mean() / 24,
                         "rtime-std": rtime.std() / 24}, index=[0])


def get_stats(labels: List[str], df: pd.DataFrame) -> pd.DataFrame:
    dfs = [get_label_stats(l, df) for l in labels]
    return pd.concat(dfs, axis=0, ignore_index=True)


def get_series(label: str, df: pd.DataFrame) -> np.ndarray:
    label_df = df[df["labels"].str.contains(label)]
    return label_df["rtime"].to_numpy()


labels = ["cgr-public", "docker-official", "ubi"]

stats = get_stats(labels, rems_df)
print(stats)
# for l in labels:
#     x = get_series(l, rems_df) / 24
#     plt.hist(x, label=l)

# plt.legend()
# plt.xlabel("Remediation Time (Days)")
# plt.ylabel("Num CVEs")

             label  num-rems  rtime-ave  rtime-std
0       cgr-public        14   1.387154   1.311875
1  docker-official      2981   2.607753   1.709839
2              ubi      2170   2.587142   1.902223


In [42]:
filter = rems_df["labels"].str.contains("cgr-public")

discovered = rems_df[filter][["first_seen_at", "remediated_at"]].sort_values("first_seen_at", axis=0, ascending=True)
discovered["first_seen_at"] = discovered["first_seen_at"].dt.floor("h")
discovered["remediated_at"] = discovered["remediated_at"].dt.floor("h")
num_hours = (discovered["last_seen_at"].max() - discovered["first_seen_at"].min()).dt.total_seconds() / 3600
discovered["first_seen_idx"] = (discovered["first_seen_at"] - discovered["first_seen_at"].min()).dt.total_seconds() / 3600
discovered["remediated_at_idx"] = (discovered["remediated_at"] - discovered["first_seen_at"].min()).dt.total_seconds() / 3600
plotted = np.zeros((discovered["last_seen_at"].max() - discovered["first_seen_at"].min()))

for _, row in discovered.iterrows():
    i, j = int(row["first_seen_idx"]), int(row["remediated_at_idx"])
    y = max(plotted[i], plotted[j])
    plotted[i] = y + 1
    plotted[j] = y + 1
    plt.plot([i, j], [y, y])

IndexError: index 109 is out of bounds for axis 0 with size 14

In [None]:
images

In [None]:
def mean_rtime(rems_df: pd.DataFrame, label: str) -> float:
    label_df = rems_df[rems_df["labels"].str.contains(label)]
    return label_df["rtime"].mean(), label_df["rtime"].std()

labels = ["cgr-public"]
for l in labels:
    print(mean_rtime(rems_df, l))

In [None]:

rtime_df = rems_df.groupby(["registry", "repository", "tag"])["rtime"].mean().reset_index()

In [None]:
rtime_df.groupby(["registry"])["rtime"].mean().sort_values(ascending=True)

In [None]:
print("Docker.io Official")
docker_mask = rtime_df["registry"] == "docker.io"
ubuntu_mask = rtime_df["repository"].str.contains("ubuntu")
rtime_df[docker_mask & ~ubuntu_mask]["rtime"].mean()

In [None]:
print("Docker.io Ubuntu Chiselled")
rtime_df[rtime_df["repository"].str.contains("ubuntu")]["rtime"].mean()

In [None]:
rems_df

In [None]:
rems_df