In [None]:
import datetime
import pandas as pd
import pyod
import numpy as np
from numpy import percentile
import os
from random import randrange
import sys
import time
from time import time
from pyvis.network import Network



#### Viewer Mark Two: A Notebook For Working With ML Output
Part of the DUNE project (https://github.com/opendr-io/dune) and useful for hunting threats that are resistant to conventional detection. This notebook loads a dataframe that can be queried, sifted, aggregated, and sorted using the tools at the end. Now includes a graph viz for signal pattern analysis.

In [None]:
# Specify your data file, or use the example data.
# This path is passed to pd.read_csv when the raw frame is loaded.
csv_file_path = 'cloudtrail-large.csv'

In [None]:
# Load the CSV into a DataFrame, then report its dimensions so the column
# labels shown below can be used to pick field names.
raw = pd.read_csv(csv_file_path, encoding='utf-8')  # change the encoding if the file is not UTF-8
print(
    'raw data shape and column names:',
    raw.shape,
    '',
    'Choose your field names:',
    sep='\n',
)
raw.columns

In [None]:

from ipywidgets import (
    Checkbox,
    IntText,
    Dropdown,
    VBox,
    HBox,
    Output,
    Button,
    HTML,
    Layout,
)
from IPython.display import display

def make_groupby_ui(df):
    """
    Interactive UI to group/aggregate a DataFrame by selected fields and show counts.

    - Column discovery is dynamic: one checkbox is created per column of ``df``.
    - Group-by fields are selected via checkboxes.
    - Aggregation runs only when the button is pressed.
    - Results are sorted by count (ascending by default, so the rarest
      combinations come first) and cut to "Top N" rows; a Top N of 0 or less
      shows every group.

    The result table is rendered with ``display.max_rows`` pinned to ``None``.
    The button callback runs long after this function returns, so without the
    pin any later ``pd.set_option('display.max_rows', ...)`` in the notebook
    would truncate the table while the summary line still reports the full
    number of rows.

    Note: ``groupby`` is called with its default NaN handling, so rows with a
    missing value in any selected field are left out of the counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate. It is only read, never modified.

    Returns
    -------
    None
        The widget tree is displayed as a side effect.
    """

    all_cols = list(df.columns)

    # --- Column checkboxes (one per DataFrame column, all unticked) ---
    col_checkboxes = {
        col: Checkbox(
            value=False,
            description=col,
            indent=False,
            layout=Layout(width="280px"),
        )
        for col in all_cols
    }

    # Arrange checkboxes in rows of `cols_per_row`
    cols_per_row = 3
    items = list(col_checkboxes.values())
    rows = [
        HBox(items[i:i + cols_per_row])
        for i in range(0, len(items), cols_per_row)
    ]
    checkboxes_box = VBox(rows)

    # --- Controls ---
    sort_widget = Dropdown(
        options=["count ↓ (desc)", "count ↑ (asc)"],
        value="count ↑ (asc)",
        description="Sort",
        layout={"width": "220px"},
    )

    top_n_widget = IntText(
        value=50,
        description="Top N",
        layout={"width": "150px"},
    )

    run_button = Button(
        description="Run aggregation",
        button_style="primary",
        tooltip="Compute group-by counts",
    )

    out = Output()

    # --- Aggregation logic (runs on button click) ---
    def run_aggregation(_):
        with out:
            out.clear_output()

            group_by = [col for col, cb in col_checkboxes.items() if cb.value]

            if not group_by:
                display(HTML("<b>Please select at least one field to group by.</b>"))
                return

            # reset_index raises if the counter name already exists among the
            # group keys, so extend the name until it is unique.
            count_col = "count"
            while count_col in group_by:
                count_col += "_"

            agg = (
                df
                .groupby(group_by)
                .size()
                .reset_index(name=count_col)
            )

            ascending = (sort_widget.value == "count ↑ (asc)")
            agg = agg.sort_values(count_col, ascending=ascending)

            top_n = top_n_widget.value
            agg_display = agg.head(top_n) if top_n and top_n > 0 else agg

            # Pin the row limit for this render only; the global option may
            # have been changed by cells executed after the UI was built.
            with pd.option_context("display.max_rows", None):
                display(agg_display)
            print(
                f"\nShowing {len(agg_display)} of {len(agg)} group rows "
                f"(grouped by {', '.join(group_by)})."
            )

    run_button.on_click(run_aggregation)

    # --- Layout ---
    controls_box = VBox([
        HTML("<b>Select fields to group by:</b>"),
        checkboxes_box,
        HBox([sort_widget, top_n_widget, run_button]),
    ])

    ui = VBox([controls_box, out])
    display(ui)

# Example usage: lift pandas' row and column-width display limits so the
# aggregation table is shown in full, then build the group-by UI on the raw frame.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None
make_groupby_ui(raw)


In [None]:
import pandas as pd
from ipywidgets import Dropdown, VBox, HBox, Output, interactive_output

def make_filter_ui(df, filter_cols=None, max_unique=50, max_display_rows=100):
    """
    Build an interactive filter UI for a DataFrame.

    One dropdown is created per filter column; each lists '(All)' plus the
    column's distinct non-null values (as strings, sorted). The matching rows
    are re-rendered whenever a dropdown changes.

    - If filter_cols is None, automatically choose columns with
      1 < nunique <= max_unique.
    - max_unique controls how 'wide' a dropdown can be before we skip it.
    - max_display_rows caps how many matching rows are shown (default 100);
      expected to be a positive integer.

    The preview is rendered with ``display.max_rows`` pinned to ``None``.
    The dropdown callbacks run long after this function returns, so without
    the pin a later ``pd.set_option('display.max_rows', ...)`` in the notebook
    would truncate the preview while the summary line still reports the
    larger row count.

    Returns None; the widgets are displayed as a side effect. Prints a notice
    and returns early when there is no column to filter on.
    """
    # Auto-discover filterable columns if none provided
    if filter_cols is None:
        filter_cols = [
            col
            for col in df.columns
            # dropna=True so only real values are counted
            if 1 < df[col].nunique(dropna=True) <= max_unique
        ]

    if not filter_cols:
        print("No suitable columns found for dropdown filters.")
        return

    print("Using filter columns:", filter_cols)

    # Create one dropdown per filter column
    dropdowns = {}
    for col in filter_cols:
        values = df[col].dropna().astype(str).unique()
        options = ['(All)'] + sorted(values)

        dropdowns[col] = Dropdown(
            options=options,
            value='(All)',
            description=col,
            layout={'width': '400px'}
        )

    out = Output()

    def filter_df(**kwargs):
        """Show the rows matching every dropdown that is not set to '(All)'."""
        with out:
            out.clear_output()
            mask = pd.Series(True, index=df.index)

            for col, selected in kwargs.items():
                if selected != '(All)':
                    # Dropdown options are strings, so compare on the string form
                    mask &= df[col].astype(str).eq(selected)

            filtered = df[mask]
            preview = filtered.head(max_display_rows)

            # Pin the row limit for this render only; the global option may
            # have been changed by cells executed after the UI was built.
            with pd.option_context("display.max_rows", None):
                display(preview)
            print(f"\nShowing {len(preview)} of {len(filtered)} matching rows.")

    interactive = interactive_output(
        filter_df,
        {col: dropdowns[col] for col in filter_cols}
    )

    controls_box = VBox(list(dropdowns.values()))
    ui = VBox([controls_box, out])

    display(ui, interactive)

# Build the filter UI once; the filterable columns are auto-discovered
make_filter_ui(raw)

# To force a specific set of columns instead, pass filter_cols:
# make_filter_ui(ddf, filter_cols=['source', 'action', 'arn', 'tactic1', 'tactic2'])

# From here on, show at most 10 rows per frame but never truncate cell text
pd.options.display.max_rows = 10
pd.options.display.max_colwidth = None

In [None]:
# Count events per (source IP, user name) pair, most frequent pairs first.
# Pairs with a missing IP or user name are excluded (dropna=True).
df = raw
agg = df.groupby(by=["sourceIPAddress", "userIdentity.userName"], dropna=True).size()
agg = agg.reset_index(name="count").sort_values(by="count", ascending=False)
agg

In [None]:
import sys, ipaddress, pandas as pd
from ipwhois import IPWhois

MAX_LOOKUPS = 10  # circuit breaker: upper bound on RDAP lookups attempted by the enrichment loop

def ip_class(ip: str) -> str:
    """
    Classify an IP address string into a coarse category.

    Returns one of "LOOPBACK", "LINK_LOCAL", "MULTICAST", "RESERVED",
    "PRIVATE", "PUBLIC", or "INVALID" when ``ip`` cannot be parsed as an
    IPv4/IPv6 address (e.g. a service host name).

    The specific categories are tested before ``is_private`` because the
    ``ipaddress`` module also reports loopback, link-local and IPv4 reserved
    ranges as private; testing ``is_private`` first would label all of them
    "PRIVATE". The set of addresses classified as "PUBLIC" is unaffected by
    the ordering: an address is public only when none of the flags is set.
    """
    try:
        addr = ipaddress.ip_address(ip)
    except (ValueError, TypeError):
        return "INVALID"

    # Most specific category first; "PRIVATE" is the catch-all for the rest
    # of the non-routable space.
    if addr.is_loopback:
        return "LOOPBACK"
    if addr.is_link_local:
        return "LINK_LOCAL"
    if addr.is_multicast:
        return "MULTICAST"
    if addr.is_reserved:
        return "RESERVED"
    if addr.is_private:
        return "PRIVATE"
    return "PUBLIC"

def rdap_lookup(ip: str) -> dict:
    """
    Run an RDAP lookup for ``ip`` and return the result dict.

    Best-effort: any exception raised by the lookup is swallowed and an
    empty dict is returned instead. Intended to be called for public IPs only.
    """
    try:
        result = IPWhois(ip).lookup_rdap()
    except Exception:
        # A failed lookup must not abort the enrichment pass
        result = {}
    return result

def extract_asn_geo(rdap: dict) -> tuple[str, str]:
    """
    Pull a network/ASN name and a country code out of an RDAP result dict.

    Each value is taken from the nested "network" entry when present and
    non-empty, otherwise from the top-level ASN field, otherwise a
    placeholder ("UNKNOWN" for the name, "UNK" for the country).

    Returns
    -------
    (asn_name, country)
    """
    # A missing or null "network" entry is treated as an empty mapping
    network = rdap.get("network") or {}

    # ASN/org name: network name -> ASN description -> placeholder
    asn_name = network.get("name")
    if not asn_name:
        asn_name = rdap.get("asn_description")
    if not asn_name:
        asn_name = "UNKNOWN"

    # "Geo" available from RDAP: network country -> ASN country code -> placeholder
    country = network.get("country")
    if not country:
        country = rdap.get("asn_country_code")
    if not country:
        country = "UNK"

    return asn_name, country

# Enrich each distinct source IP in agg with a network/ASN name and country.
# agg is sorted by count (descending), so the busiest IPs are looked up first.
ips = agg["sourceIPAddress"].dropna().astype(str).unique().tolist()

rows = []
lookups_done = 0

for ip in ips:
    cls = ip_class(ip)
    if cls != "PUBLIC":
        # Non-public or unparseable addresses: record the class, no lookup
        rows.append({"sourceIPAddress": ip, "asn_name": cls, "country": ""})
        continue

    if lookups_done >= MAX_LOOKUPS:
        # Circuit breaker tripped. The label is derived from MAX_LOOKUPS so it
        # stays accurate when the limit is changed.
        rows.append({"sourceIPAddress": ip, "asn_name": f"LOOKUP_SKIPPED_MAX{MAX_LOOKUPS}", "country": ""})
        continue

    rdap = rdap_lookup(ip)
    asn_name, country = extract_asn_geo(rdap)

    rows.append({"sourceIPAddress": ip, "asn_name": asn_name, "country": country})
    # Counts attempts, so failed lookups also consume the budget
    lookups_done += 1

print(f"RDAP lookups performed: {lookups_done} / {MAX_LOOKUPS}")

ip_enrich = pd.DataFrame(rows)

# Left join keeps every (IP, user) row of agg and attaches the enrichment columns
agg2 = agg.merge(ip_enrich, on="sourceIPAddress", how="left")
agg2


In [None]:
from pyvis.network import Network
from IPython.display import IFrame, display
import ipywidgets as widgets

# Graph input: one row per (source IP, user) pair with its event count.
df2 = (
    agg2.dropna(subset=["userIdentity.userName"])
        .rename(columns={"userIdentity.userName": "user"})
        .copy()
)

# Filter controls
top_ips = widgets.IntSlider(value=30, min=3, max=200, step=1, description="Top IPs")
min_count = widgets.IntSlider(
    value=1,
    min=1,
    # An empty frame has a NaN maximum and int(NaN) raises; fall back to 1 so
    # the UI still builds and render() can report that nothing matches.
    max=int(df2["count"].max()) if not df2.empty else 1,
    step=1,
    description="Min count",
)

# Spacing controls
spring_length = widgets.IntSlider(value=170, min=80, max=420, step=10, description="Spacing")
repulsion = widgets.IntSlider(value=26000, min=8000, max=80000, step=2000, description="Repulsion")

# Label readability controls
font_size = widgets.IntSlider(value=24, min=14, max=40, step=2, description="Font")
node_margin = widgets.IntSlider(value=10, min=0, max=30, step=2, description="Margin")

# Output area that render() draws the graph into
out = widgets.Output()

def render(*_):
    """
    Rebuild the IP/user graph from the current slider values and show it in ``out``.

    Used both as the slider observer and as a direct call, so any positional
    arguments are accepted and ignored.

    Steps:
      1. Keep the rows of ``df2`` whose ``count`` is at least ``min_count``.
      2. Keep the ``top_ips`` source IPs with the most distinct users.
      3. Add one dot node per IP (red when used by more than one user, dark
         grey otherwise), one light-grey box node per user, and one edge per
         (IP, user) row weighted by its event count.
      4. Write the graph to ``ip_user_graph.html`` and embed it in ``out``.

    When the filters leave nothing to draw, "No rows match filters." is
    printed into ``out`` and no graph is produced.
    """
    import json  # local import: only needed to serialise the graph options

    def _palette(border, background, active_border, active_background):
        # Node colour spec; highlight and hover share the same "active" colours
        return {
            "border": border,
            "background": background,
            "highlight": {"border": active_border, "background": active_background},
            "hover": {"border": active_border, "background": active_background},
        }

    out.clear_output()

    d = df2[df2["count"] >= min_count.value].copy()

    # Rank IPs by fanout (distinct users) and keep the top N
    ip_fanout_all = d.groupby("sourceIPAddress")["user"].nunique()
    keep_ips = set(ip_fanout_all.sort_values(ascending=False).head(top_ips.value).index)
    d = d[d["sourceIPAddress"].isin(keep_ips)]

    if d.empty:
        with out:
            print("No rows match filters.")
        return

    # Totals + fanout
    ip_total  = d.groupby("sourceIPAddress")["count"].sum()
    ip_fanout = d.groupby("sourceIPAddress")["user"].nunique()

    user_total  = d.groupby("user")["count"].sum()
    user_fanout = d.groupby("user")["sourceIPAddress"].nunique()

    # First row per IP, indexed by IP: one pass instead of scanning the whole
    # frame again for every IP node.
    first_rows = d.drop_duplicates(subset="sourceIPAddress").set_index("sourceIPAddress")

    def _meta(ip, column):
        # Enrichment value for an IP; a missing column, NaN or empty value becomes ""
        if column not in first_rows.columns:
            return ""
        value = first_rows.at[ip, column]
        return "" if pd.isna(value) or not value else value

    net = Network(height="860px", width="100%", notebook=True, directed=False)
    net.barnes_hut()

    # IMPORTANT: set_options must be pure JSON (no "var options =" wrapper)
    options = {
        "nodes": {
            "font": {"size": font_size.value},
            "borderWidth": 1,
            "margin": node_margin.value,
            "color": _palette("#9e9e9e", "#e0e0e0", "#757575", "#eeeeee"),
        },
        "edges": {
            "smooth": False,
            "color": {"color": "#bdbdbd"},
        },
        "physics": {
            "barnesHut": {
                # The slider holds a positive strength; the constant is its negative
                "gravitationalConstant": -repulsion.value,
                "springLength": spring_length.value,
                "springConstant": 0.04,
                "damping": 0.12,
            },
            "stabilization": {"iterations": 150},
        },
    }
    net.set_options(json.dumps(options))

    # --- IP nodes (dark grey by default, red if shared by several users) ---
    for ip, ip_events in ip_total.items():
        total = int(ip_events)
        fan   = int(ip_fanout[ip])

        asn = _meta(ip, "asn_name")
        cc  = _meta(ip, "country")

        label = f"{ip}\n{asn}\n{cc}\n{total} events\n{fan} users"
        title = f"<b>IP</b>: {ip}<br><b>ASN</b>: {asn}<br><b>Country</b>: {cc}<br><b>Total events</b>: {total}<br><b>Distinct users</b>: {fan}"

        if fan > 1:
            color = _palette("#b71c1c", "#e53935", "#7f0000", "#ef5350")
        else:
            color = _palette("#616161", "#9e9e9e", "#424242", "#bdbdbd")

        net.add_node(
            f"ip:{ip}",
            label=label,
            title=title,
            shape="dot",
            size=18 + total**0.5,
            color=color
        )

    # --- User nodes (light grey boxes; colour set explicitly per node) ---
    for u, user_events in user_total.items():
        total = int(user_events)
        fan   = int(user_fanout[u])

        label = f"{u}\n{total} events\n{fan} IPs"
        title = f"<b>User</b>: {u}<br><b>Total events</b>: {total}<br><b>Distinct IPs</b>: {fan}"

        net.add_node(
            f"user:{u}",
            label=label,
            title=title,
            shape="box",
            size=18 + total**0.5,
            color=_palette("#9e9e9e", "#e0e0e0", "#757575", "#eeeeee")
        )

    # --- Edges: one per (IP, user) row, weighted by event count ---
    for ip, user, count in zip(d["sourceIPAddress"], d["user"], d["count"]):
        net.add_edge(
            f"ip:{ip}",
            f"user:{user}",
            value=int(count),
            title=f"<b>Events</b>: {int(count)}"
        )

    path = "ip_user_graph.html"
    net.show(path)

    with out:
        display(IFrame(path, width="100%", height=920))

# Redraw the graph whenever any slider value changes
for w in (top_ips, min_count, spring_length, repulsion, font_size, node_margin):
    w.observe(render, names="value")

# Lay the sliders out in three rows above the graph output area
display(
    widgets.HBox([top_ips, min_count]),
    widgets.HBox([spring_length, repulsion]),
    widgets.HBox([font_size, node_margin]),
    out,
)

# Initial draw with the default slider values
render()
