LLooM workbench created to connect with the Streamlit app via Voilà

In [None]:
# Imports & configuration
import os, asyncio
from pathlib import Path
import pandas as pd
from IPython.display import display, clear_output

import ipywidgets as W  # Voilà-friendly widgets
import text_lloom.workbench as wb

# Load environment variables (e.g. OPENAI_API_KEY) from private/.env under the current working directory:
from dotenv import load_dotenv
load_dotenv(Path.cwd() / "private" / ".env")

# Make sure OPENAI_API_KEY is set in the environment your Voilà server uses.
# Only report whether the key is present; never print or hard-code its value.
# SECURITY: a literal key must not live in this file, not even commented out.
# Any key that was ever committed here must be treated as compromised and rotated.
print("Has OPENAI_API_KEY:", bool(os.getenv("OPENAI_API_KEY")))


# Location of the interview workbook. The default is the original author's
# checkout; set the PIZZA_DATA_PATH environment variable to use another
# location without editing this cell.
DATA_PATH = os.getenv(
    "PIZZA_DATA_PATH",
    "/Users/ltraum/Documents/GitHub/AmericanPizzaProject/data/pizza_interviews.xlsx",
)

# Participant-level columns copied alongside the text by build_text_df.
DEMO_COLS = [
    "participant_id","age","city_of_residence","state_of_residence",
    "region_of_residence","income","pizza_consumption","food_restrictions",
]
# Free-text answer columns offered in the "Questions" selector.
RESPONSE_COLS = ["q1_response","q2_response","q3_response","q4_response","q5_response"]

# Load once (fast enough)
df_full = pd.read_excel(DATA_PATH)


Has OPENAI_API_KEY: True


In [4]:
def filter_demographics(df, regions=None):
    """Return a re-indexed copy of *df*, optionally restricted to some regions.

    When ``regions`` is falsy (``None`` or empty) every row is kept; otherwise
    only rows whose ``region_of_residence`` is in ``regions`` remain. The
    input frame is never modified and the result has a fresh 0..n-1 index.
    """
    subset = df.copy()
    if not regions:
        return subset.reset_index(drop=True)
    in_region = subset["region_of_residence"].isin(regions)
    return subset[in_region].reset_index(drop=True)

def build_text_df(df, questions):
    """Combine the selected answer columns into one ``text`` column per row.

    Args:
        df: Frame holding the ``DEMO_COLS`` columns and the answer columns.
        questions: Names of the answer columns to concatenate, in order.

    Returns:
        A frame with the ``DEMO_COLS`` columns plus ``text`` (the non-null,
        non-blank answers joined by single spaces) and ``doc_id``
        (``participant_id`` as a string). Rows whose text is empty are dropped.

    Raises:
        ValueError: If ``questions`` is empty or names a column not in ``df``.
    """
    if not questions:
        raise ValueError("Pick at least one question column.")
    absent = [col for col in questions if col not in df.columns]
    if absent:
        raise ValueError(f"Missing columns: {absent}")

    def _join_answers(row):
        # Keep answers that are non-null and not whitespace-only.
        kept = []
        for answer in row:
            if pd.isnull(answer):
                continue
            as_text = str(answer)
            if as_text.strip() == "":
                continue
            kept.append(as_text)
        return " ".join(kept).strip()

    combined = df[questions].apply(_join_answers, axis=1)

    result = df[DEMO_COLS].copy()
    result["text"] = combined
    has_text = result["text"] != ""
    result = result[has_text].copy()
    result["doc_id"] = result["participant_id"].astype(str)
    return result

def build_export_from_long(score_df, concepts, threshold=0.75, doc_id_col="doc_id"):
    """Summarise long-format concept scores into one row per concept.

    Args:
        score_df: Long-format frame with ``concept_id``, ``score`` and
            ``highlight`` columns plus the document-id column.
        concepts: Mapping of concept id to a metadata dict; ``name`` (or
            ``concept``), ``prompt`` and ``summary`` are read when present.
        threshold: Minimum numeric score for a row to count as a match.
        doc_id_col: Name of the document-id column in ``score_df``.

    Returns:
        A frame with columns ``concept``, ``criteria``, ``summary``,
        ``prevalence`` (matching documents / all documents), ``n_matches``
        and ``highlights`` (up to three highlight strings), sorted by
        ``n_matches`` descending. With no concepts the frame is empty but
        still carries these columns.
    """
    columns = ["concept", "criteria", "summary", "prevalence", "n_matches", "highlights"]
    n_docs = score_df[doc_id_col].astype(str).nunique()
    rows = []
    for cid, meta in concepts.items():
        name = meta.get("name") or meta.get("concept") or str(cid)
        prompt = meta.get("prompt") or ""
        summary = meta.get("summary")

        sub = score_df[score_df["concept_id"] == cid].copy()
        # Non-numeric scores coerce to NaN, which never satisfies >= threshold.
        sub["is_match"] = pd.to_numeric(sub["score"], errors="coerce") >= threshold

        n_matches = sub.loc[sub["is_match"], doc_id_col].astype(str).nunique()
        # Guard against an empty score table (no documents at all).
        prevalence = (n_matches / n_docs) if n_docs else 0.0
        highlights = (
            sub.loc[sub["is_match"], "highlight"]
               .dropna()
               .astype(str)
               .head(3)
               .tolist()
        )
        rows.append({
            "concept": name,
            "criteria": prompt,
            "summary": summary,
            "prevalence": prevalence,
            "n_matches": int(n_matches),
            "highlights": highlights,
        })
    # Explicit columns keep the frame sortable when there are no concepts;
    # a column-less DataFrame would raise KeyError on "n_matches".
    export = pd.DataFrame(rows, columns=columns)
    return export.sort_values("n_matches", ascending=False).reset_index(drop=True)


In [5]:
# Widget controls


def _half_width():
    """Return a fresh layout that takes 45% of the row width."""
    return W.Layout(width="45%")


# Region choices come from the loaded data (missing values excluded).
regions = sorted(df_full["region_of_residence"].dropna().unique())
w_regions = W.SelectMultiple(
    options=regions,
    description="Regions",
    rows=6,
    layout=_half_width(),
)

w_qs = W.SelectMultiple(
    options=RESPONSE_COLS,
    value=("q1_response",),
    description="Questions",
    rows=6,
    layout=_half_width(),
)

w_seed = W.Text(
    value="",
    description="Seed",
    placeholder="optional",
    layout=_half_width(),
)
w_max = W.IntSlider(
    value=5,
    min=1,
    max=10,
    step=1,
    description="Max themes",
    layout=_half_width(),
)
w_thresh = W.FloatSlider(
    value=0.75,
    min=0.5,
    max=0.95,
    step=0.05,
    description="Threshold",
    readout_format=".2f",
    layout=_half_width(),
)

w_run = W.Button(
    description="Run LLooM",
    button_style="primary",
    layout=W.Layout(width="200px"),
)
w_status = W.HTML(value="")

# Output areas
out_summary = W.Output()
out_workbench = W.Output()

# Page layout, top to bottom: header, controls, status line, summary, workbench.
_ui_children = [
    W.HTML("<h3>LLooM Workbench (American Pizza Project)</h3>"),
    W.HTML("<p>Select filters and parameters, then click <b>Run LLooM</b>.</p>"),
    W.HBox([w_regions, w_qs]),
    W.HBox([w_seed, w_max]),
    W.HBox([w_thresh, w_run]),
    w_status,
    W.HTML("<hr><h4>Themes Summary</h4>"),
    out_summary,
    W.HTML("<hr><h4>LLooM Workbench</h4>"),
    W.HTML("<p>Interactive matrix view of concepts × slices (slice: <code>region_of_residence</code>).</p>"),
    out_workbench,
]
ui = W.VBox(_ui_children)

display(ui)


VBox(children=(HTML(value='<h3>LLooM Workbench (American Pizza Project)</h3>'), HTML(value='<p>Select filters …

In [6]:
async def run_lloom_once():
    """Run one LLooM pass for the current widget selections and show the results.

    Reads the region, question, seed, max-themes and threshold widgets, builds
    the text frame, awaits ``gen_auto``, then fills ``out_summary`` with the
    theme table and ``out_workbench`` with the LLooM matrix view. Progress and
    failures are reported through ``w_status``; exceptions are not re-raised.
    """
    import html  # local import: only used to escape error text shown in w_status

    summary_cols = ["concept", "prevalence", "n_matches", "highlights"]
    try:
        w_status.value = "<span style='color:gray'>Preparing data…</span>"

        # Slice & build (an empty region selection means "all regions")
        regs = list(w_regions.value) if w_regions.value else None
        df_slice = filter_demographics(df_full, regions=regs)
        df_text = build_text_df(df_slice, list(w_qs.value))

        if df_text.empty:
            w_status.value = "<span style='color:#b00'>No text rows after filters/questions.</span>"
            with out_summary:
                clear_output()
                display(pd.DataFrame(columns=summary_cols))
            with out_workbench:
                clear_output()
            return

        # LLooM
        w_status.value = "<span style='color:gray'>Running LLooM induction…</span>"
        session = wb.lloom(df=df_text, text_col="text", id_col="doc_id")
        score_df = await session.gen_auto(
            max_concepts=int(w_max.value),
            seed=(w_seed.value or None),
            debug=False,
        )
        concepts = {cid: c.to_dict() for cid, c in session.concepts.items()}

        # Summary
        w_status.value = "<span style='color:gray'>Building summary…</span>"
        export_df = build_export_from_long(score_df, concepts, threshold=float(w_thresh.value))

        # Show summary
        with out_summary:
            clear_output()
            if export_df.empty:
                display(pd.DataFrame(columns=summary_cols))
            else:
                # Show only non-zero matches; cap to top N=max_concepts
                shown = export_df[export_df["n_matches"] > 0].head(int(w_max.value)).copy()
                shown["prevalence (%)"] = (shown["prevalence"] * 100).round(1)
                display(shown[["concept", "prevalence (%)", "n_matches", "highlights"]])

        # Show native LLooM visual (matrix)
        w_status.value = "<span style='color:gray'>Rendering Workbench…</span>"
        with out_workbench:
            clear_output()
            # A bare call only renders when it is a cell's last expression;
            # inside an Output context the returned widget must be displayed
            # explicitly. The None guard covers a vis() that renders itself.
            matrix = session.vis(slice_col="region_of_residence")  # native LLooM UI
            if matrix is not None:
                display(matrix)

        w_status.value = "<span style='color:green'>Done.</span>"

    except Exception as e:
        # Top-level UI boundary: surface the failure in the status line.
        # Escape the message so characters like "<" do not break the HTML widget.
        w_status.value = f"<span style='color:#b00'>Error: {html.escape(str(e))}</span>"

# Strong references to in-flight runs. The event loop keeps only weak
# references to tasks, so an unreferenced task may be garbage-collected
# before it finishes.
_background_tasks = set()


def on_click_run(_btn):
    """Button callback: schedule a LLooM run without blocking the UI (Voilà/Jupyter)."""
    task = asyncio.create_task(run_lloom_once())
    _background_tasks.add(task)
    # Drop the reference once the run completes so the set does not grow.
    task.add_done_callback(_background_tasks.discard)


w_run.on_click(on_click_run)


In [7]:
import os, pandas as pd
import text_lloom.workbench as wb

# 1) Confirm key
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not set. Export it or load from .env"

# 2) Build a tiny df_text (doc_id + text + region)
df1 = df[["participant_id","region_of_residence","q1_response"]].copy()
df1 = df1.rename(columns={"q1_response":"text"})
df1 = df1[df1["text"].notna() & (df1["text"].str.strip()!="")]
df1["doc_id"] = df1["participant_id"].astype(str)
df_text = df1[["doc_id","text","region_of_residence"]].head(50)  # small slice

# 3) Run LLooM + show workbench
l = wb.lloom(df=df_text, text_col="text", id_col="doc_id")
await l.gen_auto(max_concepts=5, seed=None, debug=False)

# IMPORTANT: end the cell with the widget so it renders
l.vis(slice_col="region_of_residence")


NameError: name 'df' is not defined