In [3]:
import datetime
import pandas as pd
import pyod
import numpy as np
from numpy import percentile
import os
from random import randrange
import sys
import time
from time import time


#### A Notebook For Working With Dataframes
This notebook is part of the DUNE project (https://github.com/opendr-io/dune) and is useful for hunting threats that are resistant to conventional detection. It loads a CSV file into a dataframe that can then be queried, sifted, aggregated, and sorted using the interactive tools in the cells below.

In [4]:
# Specify your data file, or use the example data
csv_file_path = 'cloudtrail.csv'

In [5]:
# Load the CSV named by csv_file_path into a DataFrame
raw = pd.read_csv(csv_file_path, encoding='utf-8')  # Adjust the encoding if needed

# Report the frame's dimensions, then end the cell on the column index so the
# available field names are rendered as the cell's output
print(
    'raw data shape and column names:',
    raw.shape,
    '',
    'Choose your field names:',
    sep='\n',
)
raw.columns

raw data shape and column names:
(25675, 9)

Choose your field names:


Index(['eventid', 'timestamp', 'sourceIPAddress', 'eventSource', 'eventName',
       'userAgent', 'userIdentity.arn', 'tactic1', 'tactic2'],
      dtype='object')

In [6]:

from ipywidgets import (
    Checkbox,
    IntText,
    Dropdown,
    VBox,
    HBox,
    Output,
    Button,
    HTML,
    Layout,
)
from IPython.display import display

def make_groupby_ui(df):
    """
    Interactive UI to group/aggregate a DataFrame by selected fields and show counts.

    - Column discovery is dynamic: one checkbox is created per column of ``df``
    - Group-by fields selected via checkboxes
    - Aggregation runs only when button is pressed
    - Rows whose group key is missing (NaN/None) are counted as their own group
      rather than being silently left out of the result

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate. It is only read, never modified.

    Returns
    -------
    None
        The widget tree is rendered with ``display``; nothing is returned.
    """

    all_cols = list(df.columns)

    # --- Column checkboxes ---
    # Keyed by column name so the click handler can read back which are ticked.
    col_checkboxes = {
        col: Checkbox(
            value=False,
            description=col,
            indent=False,
            layout=Layout(width="280px"),
        )
        for col in all_cols
    }

    # Arrange checkboxes in rows
    rows = []
    cols_per_row = 3
    items = list(col_checkboxes.values())
    for i in range(0, len(items), cols_per_row):
        rows.append(HBox(items[i:i + cols_per_row]))
    checkboxes_box = VBox(rows)

    # --- Controls ---
    # Default is ascending, so the least frequent groups are listed first.
    sort_widget = Dropdown(
        options=["count ↓ (desc)", "count ↑ (asc)"],
        value="count ↑ (asc)",
        description="Sort",
        layout={"width": "220px"},
    )

    # Number of rows to show after sorting; zero or a negative value shows all.
    top_n_widget = IntText(
        value=50,
        description="Top N",
        layout={"width": "150px"},
    )

    run_button = Button(
        description="Run aggregation",
        button_style="primary",
        tooltip="Compute group-by counts",
    )

    out = Output()

    # --- Aggregation logic (runs on button click) ---
    def run_aggregation(_):
        """Recompute the grouped counts and redraw them inside ``out``."""
        with out:
            out.clear_output()

            group_by = [col for col, cb in col_checkboxes.items() if cb.value]

            if not group_by:
                display(HTML("<b>Please select at least one field to group by.</b>"))
                return

            # dropna=False keeps rows with a missing key as their own group;
            # the pandas default would drop them and under-report the counts.
            agg = (
                df
                .groupby(group_by, dropna=False)
                .size()
                .reset_index(name="count")
            )

            ascending = (sort_widget.value == "count ↑ (asc)")
            agg = agg.sort_values("count", ascending=ascending)

            if top_n_widget.value and top_n_widget.value > 0:
                agg_display = agg.head(top_n_widget.value)
            else:
                agg_display = agg

            display(agg_display)
            print(
                f"\nShowing {len(agg_display)} of {len(agg)} group rows "
                f"(grouped by {', '.join(group_by)})."
            )

    run_button.on_click(run_aggregation)

    # --- Layout ---
    controls_box = VBox([
        HTML("<b>Select fields to group by:</b>"),
        checkboxes_box,
        HBox([sort_widget, top_n_widget, run_button]),
    ])

    ui = VBox([controls_box, out])
    display(ui)

# Example usage: set the pandas display options, then build the group-by UI
# for the loaded frame.
pd.options.display.max_rows = 35
pd.options.display.max_colwidth = None
make_groupby_ui(raw)


VBox(children=(VBox(children=(HTML(value='<b>Select fields to group by:</b>'), VBox(children=(HBox(children=(C…

In [7]:
import pandas as pd
from ipywidgets import Dropdown, VBox, HBox, Output, interactive_output

def make_filter_ui(df, filter_cols=None, max_unique=50):
    """
    Render one dropdown per filter column and show the rows matching the
    current selections.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is only read, never modified.
    filter_cols : list of str, optional
        Columns to expose as dropdowns. When None, every column whose number
        of distinct non-null values satisfies 1 < nunique <= max_unique is used.
    max_unique : int, default 50
        Upper bound on distinct values for a column to be auto-selected.
        Only consulted when filter_cols is None.

    Returns
    -------
    None
        The widgets are rendered with ``display``. If no column qualifies, a
        message is printed and nothing is displayed.
    """
    # Auto-discovery: keep columns with more than one, but not too many, real values
    if filter_cols is None:
        filter_cols = [
            name
            for name in df.columns
            if 1 < df[name].nunique(dropna=True) <= max_unique
        ]

    if not filter_cols:
        print("No suitable columns found for dropdown filters.")
        return

    print("Using filter columns:", filter_cols)

    def build_dropdown(name):
        # Choices are the column's non-null values as strings, sorted, with
        # the "(All)" sentinel (meaning: no filter on this column) first.
        choices = ['(All)']
        choices.extend(sorted(df[name].dropna().astype(str).unique()))
        return Dropdown(
            options=choices,
            value='(All)',
            description=name,
            layout={'width': '400px'}
        )

    dropdowns = {name: build_dropdown(name) for name in filter_cols}

    out = Output()

    def filter_df(**kwargs):
        # Called with one keyword per dropdown: column name -> selected value.
        with out:
            out.clear_output()
            keep = pd.Series(True, index=df.index)

            # Only columns with a real selection narrow the result
            for name, choice in kwargs.items():
                if choice == '(All)':
                    continue
                # Compare as strings because the dropdown options were built from str values
                keep = keep & df[name].astype(str).eq(choice)

            filtered = df[keep]
            preview = filtered.head(100)
            display(preview)
            print(f"\nShowing {min(len(filtered), 100)} of {len(filtered)} matching rows.")

    # Wire every dropdown to filter_df
    interactive = interactive_output(filter_df, dict(dropdowns))

    ui = VBox([VBox(list(dropdowns.values())), out])

    display(ui, interactive)

# Set the display options BEFORE building the UI. ipywidgets' interactive_output
# runs the filter once while the UI is constructed, so options set afterwards
# would only apply to later redraws, not to the first table shown.
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_colwidth', None)

# Call this once to build the UI (auto-discover columns)
make_filter_ui(raw)

# Or, if you ever want to force a specific set:
# make_filter_ui(raw, filter_cols=['eventSource', 'userIdentity.arn', 'tactic1', 'tactic2'])

Using filter columns: ['sourceIPAddress', 'eventSource', 'userAgent', 'userIdentity.arn', 'tactic1', 'tactic2']


VBox(children=(VBox(children=(Dropdown(description='sourceIPAddress', layout=Layout(width='400px'), options=('…

Output()