LLooM workbench created to connect with a Streamlit app via Voilà

In [17]:
# Imports & configuration
import os, asyncio
from pathlib import Path
import pandas as pd
from IPython.display import display, clear_output

import ipywidgets as W  # Voilà-friendly widgets
import text_lloom.workbench as wb

# Load OPENAI_API_KEY (and any other secrets) from private/.env, if that file exists:
from dotenv import load_dotenv
load_dotenv(Path.cwd() / "private" / ".env")

# OPENAI_API_KEY must be provided by the environment the Voilà server runs in, or by
# private/.env (loaded above). It is deliberately NOT assigned here: notebooks are
# committed and shared together with their source, so a literal key leaks.
# SECURITY: a literal key used to be hardcoded at this spot. Treat that key as
# compromised and revoke/rotate it in the OpenAI dashboard.
print("Has OPENAI_API_KEY:", bool(os.getenv("OPENAI_API_KEY")))

# Location of the interview workbook. Set PIZZA_DATA_PATH to run on another machine;
# the fallback is the original author's local checkout.
DATA_PATH = os.getenv(
    "PIZZA_DATA_PATH",
    "/Users/ltraum/Documents/GitHub/AmericanPizzaProject/data/pizza_interviews.xlsx",
)

# Demographic columns carried along with each participant's text.
DEMO_COLS = [
    "participant_id","age","city_of_residence","state_of_residence",
    "region_of_residence","income","pizza_consumption","food_restrictions",
]
# Free-text answer columns that can be concatenated into the LLooM input text.
RESPONSE_COLS = ["q1_response","q2_response","q3_response","q4_response","q5_response"]

# Load once (fast enough)
df_full= pd.read_excel(DATA_PATH)


Has OPENAI_API_KEY: True


import os, asyncio
from pathlib import Path
import pandas as pd
import ipywidgets as W
import text_lloom.workbench as wb
from IPython.display import display, clear_output


# Answer columns offered in the "Questions" selector.
RESPONSE_COLS = [
    "q1_response",
    "q2_response",
    "q3_response",
    "q4_response",
    "q5_response",
]
# Distinct, non-null regions found in the loaded data, in sorted order.
regions = sorted(df_full["region_of_residence"].dropna().unique())

# --- Input controls ---
w_regions = W.SelectMultiple(
    options=regions,
    description="Regions",
    rows=5,
    layout=W.Layout(width="45%"),
)
w_qs = W.SelectMultiple(
    options=RESPONSE_COLS,
    value=("q1_response",),
    description="Questions",
    rows=5,
    layout=W.Layout(width="45%"),
)
w_seed = W.Text(
    value="",
    description="Seed",
    placeholder="optional",
    layout=W.Layout(width="45%"),
)
w_max = W.IntSlider(
    value=3,
    min=1,
    max=10,
    step=1,
    description="Max themes",
    layout=W.Layout(width="45%"),
)
w_run = W.Button(
    description="Run LLooM",
    button_style="primary",
    layout=W.Layout(width="220px"),
)
w_status = W.HTML(value="")

# --- Output areas ---
# out_summary is created here but is not placed in the layout below.
out_summary = W.Output()
out_workbench = W.Output()

# --- Page layout, top to bottom ---
_page_rows = [
    W.HTML("<h3>LLooM Workbench — American Pizza Project</h3>"),
    W.HTML("<p>Select a seed & which questions to include (Q1 vs Q1+Q2…); then click <b>Run LLooM</b>.</p>"),
    W.HBox([w_regions, w_qs]),
    W.HBox([w_seed, w_max]),
    w_run,
    w_status,
    W.HTML("<hr><h4>LLooM Workbench (native viz)</h4>"),
    out_workbench,
]
ui = W.VBox(_page_rows)
display(ui)

def _build_text_df(df, questions):
    text = df[list(questions)].apply(
        lambda row: " ".join([str(r) for r in row if pd.notnull(r) and str(r).strip()!=""]).strip(),
        axis=1
    )
    out = df.copy()
    out["text"] = text
    out = out[out["text"]!=""].copy()
    out["doc_id"] = out.get("participant_id", out.index).astype(str)
    return out

async def _run():
    """Run one LLooM pass using the current widget selections and render the result.

    Reads the region/question/seed/max-theme widgets, builds the text frame with
    ``_build_text_df``, runs ``gen_auto`` and shows the LLooM matrix inside
    ``out_workbench``. Progress and failures are reported through ``w_status``;
    exceptions are caught here and never propagate to the caller.
    """
    import html  # local import: only needed to escape the error text below

    try:
        w_status.value = "<span style='color:gray'>Preparing data…</span>"

        # Slice & text build. An empty region selection means "all regions".
        selected_regions = list(w_regions.value)
        if selected_regions:
            df_slice = df_full[df_full["region_of_residence"].isin(selected_regions)].copy()
        else:
            df_slice = df_full
        df_run = _build_text_df(df_slice, list(w_qs.value))

        if df_run.empty:
            w_status.value = "<span style='color:#b00'>No text rows after filters/questions.</span>"
            with out_workbench:
                clear_output()
            return

        # LLooM
        w_status.value = "<span style='color:gray'>Running LLooM…</span>"
        session = wb.lloom(df=df_run, text_col="text", id_col="doc_id")
        await session.gen_auto(max_concepts=int(w_max.value), seed=(w_seed.value or None), debug=False)

        # Render native LLooM matrix (with region slice).
        # vis() returns the widget; inside a function nothing is auto-displayed,
        # so it has to be passed to display() explicitly to show up in the Output.
        with out_workbench:
            clear_output()
            display(session.vis(slice_col="region_of_residence"))

        w_status.value = "<span style='color:green'>Done.</span>"

    except Exception as e:
        # Escape the message: it is inserted into HTML and may contain '<' or '&'.
        w_status.value = f"<span style='color:#b00'>Error: {html.escape(str(e))}</span>"

# Strong references to in-flight runs. The event loop only keeps weak references to
# tasks, so a task whose handle is discarded can be garbage-collected mid-execution.
_pending_runs = set()

def _on_click(_):
    """Button handler: schedule ``_run()`` on the running event loop without blocking the UI."""
    task = asyncio.create_task(_run())
    _pending_runs.add(task)
    # Drop the reference once the run finishes so the set does not grow.
    task.add_done_callback(_pending_runs.discard)

w_run.on_click(_on_click)


In [18]:
QUESTIONS = ["q1_response", "q2_response", "q3_response", "q4_response", "q5_response"]
# If you want a smaller scope, e.g. just Q4:
# QUESTIONS = ["q4_response"]

# 3) Build a text column by concatenating selected questions (skips blanks)
text = df[QUESTIONS].apply(
    lambda row: " ".join([str(r) for r in row if pd.notnull(r) and str(r).strip() != ""]).strip(),
    axis=1
)
df_run = df.copy()
df_run["text"] = text
df_run = df_run[df_run["text"] != ""].copy()

# 4) Stable ID for LLooM
#    Use participant_id if present; otherwise fall back to index
if "participant_id" in df_run.columns:
    df_run["doc_id"] = df_run["participant_id"].astype(str)
else:
    df_run["doc_id"] = df_run.index.astype(str)

# 5) Create LLooM instance and generate concepts (auto)
l = wb.lloom(df=df_run, text_col="text", id_col="doc_id")
score_df = await l.gen_auto(max_concepts=5, seed=None, debug=False)  # set seed="family" etc. if you want

# 6) Show the original LLooM Workbench UI (matrix).
#    Add a slice by region like in the docs:
l.vis(slice_col="region_of_residence")



[1mEstimated cost[0m: $0.12
**Please note that this is only an approximate cost estimate**


[48;5;117mDistill-filter[0m
✅ Done    


[48;5;117mDistill-summarize[0m
✅ Done    


[48;5;117mCluster[0m
✅ Done    


[48;5;117mSynthesize[0m
⠹ Loading 



✅ Done    
✅ Done with concept generation!


[1mActive concepts[0m (n=5):
- [1mRegional Pizza Preferences[0m: Does the text describe specific regional pizza styles or preferences?
- [1mPizza as Social Food[0m: Does the text highlight pizza as a social or family-oriented food?
- [1mIngredient and Quality Focus[0m: Does the text emphasize the importance of high-quality ingredients or specific pizza characteristics?
- [1mPizza Evolution and Adaptation[0m: Does the text discuss changes in pizza preferences or adaptations over time?
- [1mPizza as Convenience Food[0m: Does the text describe pizza as a convenient or practical food choice?


Scoring 5 concepts for 50 documents
[1mEstimated cost[0m: $0.02
**Please note that this is only an approximate cost estimate**
100%|██████████| 5/5 [00:44<00:00,  8.86s/it]
✅ Done with concept scoring!


MatrixWidget(data='[{"id":"All","value":39,"example":"All","_my_score":0,"concept":"Regional Pizza Preferences…

In [19]:
# Preview the first 10 rows of LLooM's own export table for the session `l`.
# The broad except is a deliberate best-effort: anything raised by l.export_df() or
# by display() is printed instead of stopping the notebook.
try:
    export_df = l.export_df()
    display(export_df.head(10))
except Exception as e:
    print("export_df not available here:", e)

Unnamed: 0,concept,criteria,summary,rep_examples,prevalence,n_matches,highlights
0,Ingredient and Quality Focus,Does the text emphasize the importance of high...,"We prioritize high-quality, fresh ingredients ...",[My big pizza moment was trying Regina's in th...,0.86,43,"[I love San Diego's fresh, California-style ap..."
1,Pizza Evolution and Adaptation,Does the text discuss changes in pizza prefere...,"My pizza journey reflects regional influences,...",[My relationship with pizza has gone through p...,0.78,39,[Pizza's just not something that was part of m...
2,Pizza as Convenience Food,Does the text describe pizza as a convenient o...,Pizza is our go-to convenience food for casual...,[I've liked pizza since I was a kid and my pre...,0.72,36,[Pizza is social food for me - splitting pies ...
3,Pizza as Social Food,Does the text highlight pizza as a social or f...,"Pizza is our go-to social food, perfect for ga...","[Detroit pizza isn't just food, it's cultural ...",0.88,44,"[It's still social food for me, but I have to ..."
4,Regional Pizza Preferences,Does the text describe specific regional pizza...,Regional pizza preferences reflect diverse sty...,[Moving to Boise from Chicago was a pizza cult...,0.78,39,[New Mexico style with green chile is my go-to...


In [20]:
def filter_demographics(df, regions=None):
    """Return the rows of ``df`` whose ``region_of_residence`` is in ``regions``.

    A falsy ``regions`` (``None`` or an empty collection) keeps every row. The
    result is always a new frame with a fresh 0..n-1 index; ``df`` is not modified.
    """
    if regions:
        in_selected_region = df["region_of_residence"].isin(regions)
        selected = df[in_selected_region]
    else:
        selected = df.copy()
    return selected.reset_index(drop=True)

def build_text_df(df, questions):
    """Build the LLooM input frame: demographic columns plus ``text`` and ``doc_id``.

    Args:
        df: Source frame. Must contain every column in ``DEMO_COLS`` and every
            column named in ``questions``.
        questions: Iterable of answer-column names to concatenate. Any iterable is
            accepted (list, tuple, ...); a ``SelectMultiple`` value is a tuple.

    Returns:
        A new frame with the ``DEMO_COLS`` columns, a ``text`` column holding the
        space-joined non-blank answers, and ``doc_id`` (``participant_id`` as str).
        Rows whose text is empty are dropped.

    Raises:
        ValueError: if ``questions`` is empty/None or names columns missing from ``df``.
    """
    # Normalise to a list: df[<tuple>] would be read as ONE column key, not several.
    questions = list(questions) if questions is not None else []
    if not questions:
        raise ValueError("Pick at least one question column.")
    missing = [q for q in questions if q not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    text = (
        df[questions]
        .apply(lambda row: " ".join([str(r) for r in row if pd.notnull(r) and str(r).strip() != ""]).strip(), axis=1)
    )
    out = df[DEMO_COLS].copy()
    out["text"] = text
    out = out[out["text"] != ""].copy()
    out["doc_id"] = out["participant_id"].astype(str)
    return out

def build_export_from_long(score_df, concepts, threshold=0.75, doc_id_col="doc_id"):
    """Summarise long-format concept scores into one row per concept.

    Args:
        score_df: Long frame with at least ``doc_id_col``, ``concept_id``, ``score``
            and ``highlight`` columns (one row per document/concept pair).
        concepts: Mapping of concept id -> metadata dict. ``name`` (or ``concept``),
            ``prompt`` and ``summary`` are read when present.
        threshold: A row counts as a match when its numeric score is >= this value.
            Scores that cannot be parsed as numbers never match.
        doc_id_col: Name of the document-id column in ``score_df``.

    Returns:
        DataFrame with columns ``concept``, ``criteria``, ``summary``,
        ``prevalence`` (matching docs / all docs), ``n_matches`` and ``highlights``
        (up to 3 highlight strings from matching rows), sorted by ``n_matches``
        descending. An empty ``concepts`` mapping yields an empty frame that still
        has these columns.
    """
    columns = ["concept", "criteria", "summary", "prevalence", "n_matches", "highlights"]
    n_docs = score_df[doc_id_col].astype(str).nunique()
    rows = []
    for cid, meta in concepts.items():
        name = meta.get("name") or meta.get("concept") or str(cid)
        prompt = meta.get("prompt") or ""
        summary = meta.get("summary")

        sub = score_df[score_df["concept_id"] == cid]
        # errors="coerce" turns unparsable scores into NaN, and NaN >= x is False.
        is_match = pd.to_numeric(sub["score"], errors="coerce") >= threshold
        matched = sub[is_match]

        n_matches = matched[doc_id_col].astype(str).nunique()
        prevalence = (n_matches / n_docs) if n_docs else 0.0
        highlights = (
            matched["highlight"]
               .dropna()
               .astype(str)
               .head(3)
               .tolist()
        )
        rows.append({
            "concept": name,
            "criteria": prompt,
            "summary": summary,
            "prevalence": prevalence,
            "n_matches": int(n_matches),
            "highlights": highlights,
        })
    # Explicit columns keep the schema stable (and sortable) even when `rows` is empty.
    return (
        pd.DataFrame(rows, columns=columns)
        .sort_values("n_matches", ascending=False)
        .reset_index(drop=True)
    )


In [21]:
# --- Input controls ---
# Region choices: distinct non-null values from the loaded data, sorted.
regions = sorted(df_full["region_of_residence"].dropna().unique())
w_regions = W.SelectMultiple(
    options=regions,
    description="Regions",
    rows=6,
    layout=W.Layout(width="45%"),
)

w_qs = W.SelectMultiple(options=RESPONSE_COLS, value=("q1_response",), description="Questions", rows=6, layout=W.Layout(width="45%"))

w_seed = W.Text(
    value="",
    description="Seed",
    placeholder="optional",
    layout=W.Layout(width="45%"),
)
w_max = W.IntSlider(
    value=5,
    min=1,
    max=10,
    step=1,
    description="Max themes",
    layout=W.Layout(width="45%"),
)
w_thresh = W.FloatSlider(
    value=0.75,
    min=0.5,
    max=0.95,
    step=0.05,
    description="Threshold",
    readout_format=".2f",
    layout=W.Layout(width="45%"),
)

w_run = W.Button(
    description="Run LLooM",
    button_style="primary",
    layout=W.Layout(width="200px"),
)
w_status = W.HTML(value="")

# --- Output areas: summary table on top, native LLooM matrix below ---
out_summary = W.Output()
out_workbench = W.Output()

# --- Page layout, top to bottom ---
_page_rows = [
    W.HTML("<h3>LLooM Workbench (American Pizza Project)</h3>"),
    W.HTML("<p>Select filters and parameters, then click <b>Run LLooM</b>.</p>"),
    W.HBox([w_regions, w_qs]),
    W.HBox([w_seed, w_max]),
    W.HBox([w_thresh, w_run]),
    w_status,
    W.HTML("<hr><h4>Themes Summary</h4>"),
    out_summary,
    W.HTML("<hr><h4>LLooM Workbench</h4>"),
    W.HTML("<p>Interactive matrix view of concepts × slices (slice: <code>region_of_residence</code>).</p>"),
    out_workbench,
]
ui = W.VBox(_page_rows)

display(ui)


VBox(children=(HTML(value='<h3>LLooM Workbench (American Pizza Project)</h3>'), HTML(value='<p>Select filters …

In [22]:
async def run_lloom_once():
    """Run one LLooM pass from the current widget values and render both views.

    Steps: filter ``df_full`` by the selected regions, build the text frame, run
    ``gen_auto``, build the per-concept summary table (shown in ``out_summary``)
    and show the native LLooM matrix (in ``out_workbench``). Progress and failures
    are reported through ``w_status``; exceptions are caught here and never
    propagate to the caller.
    """
    import html  # local import: only needed to escape the error text below

    empty_summary_cols = ["concept", "prevalence", "n_matches", "highlights"]

    try:
        w_status.value = "<span style='color:gray'>Preparing data…</span>"

        # Slice & build. An empty region selection means "all regions".
        regs = list(w_regions.value) if len(w_regions.value) else None
        df_slice = filter_demographics(df_full, regions=regs)
        df_text = build_text_df(df_slice, list(w_qs.value))

        if df_text.empty:
            w_status.value = "<span style='color:#b00'>No text rows after filters/questions.</span>"
            with out_summary:
                clear_output()
                display(pd.DataFrame(columns=empty_summary_cols))
            with out_workbench:
                clear_output()
            return

        # LLooM
        w_status.value = "<span style='color:gray'>Running LLooM induction…</span>"
        session = wb.lloom(df=df_text, text_col="text", id_col="doc_id")
        score_df = await session.gen_auto(max_concepts=int(w_max.value), seed=(w_seed.value or None), debug=False)
        concepts = {cid: c.to_dict() for cid, c in session.concepts.items()}

        # Summary
        w_status.value = "<span style='color:gray'>Building summary…</span>"
        export_df = build_export_from_long(score_df, concepts, threshold=float(w_thresh.value))

        # Show summary
        with out_summary:
            clear_output()
            if export_df.empty:
                display(pd.DataFrame(columns=empty_summary_cols))
            else:
                # Show only non-zero matches; cap to top N=max_concepts
                shown = export_df[export_df["n_matches"] > 0].head(int(w_max.value)).copy()
                shown["prevalence (%)"] = (shown["prevalence"] * 100).round(1)
                display(shown[["concept", "prevalence (%)", "n_matches", "highlights"]])

        # Show native LLooM visual (matrix).
        # vis() returns the widget; inside a function nothing is auto-displayed,
        # so it has to be passed to display() explicitly to show up in the Output.
        w_status.value = "<span style='color:gray'>Rendering Workbench…</span>"
        with out_workbench:
            clear_output()
            display(session.vis(slice_col="region_of_residence"))

        w_status.value = "<span style='color:green'>Done.</span>"

    except Exception as e:
        # Escape the message: it is inserted into HTML and may contain '<' or '&'.
        w_status.value = f"<span style='color:#b00'>Error: {html.escape(str(e))}</span>"

# Strong references to in-flight runs. The event loop only keeps weak references to
# tasks, so a task whose handle is discarded can be garbage-collected mid-execution.
_pending_lloom_runs = set()

def on_click_run(_btn):
    """Button handler: schedule ``run_lloom_once()`` without blocking the UI (Voilà/Jupyter)."""
    task = asyncio.create_task(run_lloom_once())
    _pending_lloom_runs.add(task)
    # Drop the reference once the run finishes so the set does not grow.
    task.add_done_callback(_pending_lloom_runs.discard)

w_run.on_click(on_click_run)


In [23]:
import os, pandas as pd
import text_lloom.workbench as wb

# 1) Confirm key
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not set. Export it or load from .env"

# 2) Build a tiny df_text (doc_id + text + region)
df1 = df[["participant_id","region_of_residence","q1_response"]].copy()
df1 = df1.rename(columns={"q1_response":"text"})
df1 = df1[df1["text"].notna() & (df1["text"].str.strip()!="")]
df1["doc_id"] = df1["participant_id"].astype(str)
df_text = df1[["doc_id","text","region_of_residence"]].head(50)  # small slice

# 3) Run LLooM + show workbench
l = wb.lloom(df=df_text, text_col="text", id_col="doc_id")
await l.gen_auto(max_concepts=5, seed=None, debug=False)

# IMPORTANT: end the cell with the widget so it renders
l.vis(slice_col="region_of_residence")




[1mEstimated cost[0m: $0.06
**Please note that this is only an approximate cost estimate**


[48;5;117mDistill-filter[0m
✅ Done    


[48;5;117mDistill-summarize[0m
✅ Done    


[48;5;117mCluster[0m
✅ Done    


[48;5;117mSynthesize[0m
⠹ Loading 



✅ Done    
✅ Done with concept generation!


[1mActive concepts[0m (n=5):
- [1mUnique Toppings[0m: Does the text example mention unique or unconventional pizza toppings?
- [1mEvolving Pizza Trends[0m: Does the text example highlight changes or trends in pizza styles or preferences over time?
- [1mPizza as Craft[0m: Does the text example appreciate pizza as a craft or art form?
- [1mLocal Ingredients[0m: Does the text emphasize the use of local ingredients in pizza making?
- [1mCultural Identity[0m: Does the text discuss how pizza represents or reflects cultural identity?


Scoring 5 concepts for 50 documents
[1mEstimated cost[0m: $0.02
**Please note that this is only an approximate cost estimate**
100%|██████████| 5/5 [00:44<00:00,  8.92s/it]
✅ Done with concept scoring!


MatrixWidget(data='[{"id":"All","value":17,"example":"All","_my_score":0,"concept":"Unique Toppings","n":17},{…