perstack-ai · FL4TLiN3 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/.changeset/twenty-dragons-kick.md b/.changeset/twenty-dragons-kick.md
@@ -0,0 +1,2 @@
+---
+---
diff --git a/e2e/README.md b/e2e/README.md
@@ -33,14 +33,21 @@ e2e/
 │   ├── mixed-tools.toml      # MCP + Delegate + Interactive
 │   ├── parallel-mcp.toml     # Parallel MCP calls
 │   ├── delegate-chain.toml   # Delegation chain
-│   └── continue-resume.toml  # Continue/resume functionality
+│   ├── continue-resume.toml  # Continue/resume functionality
+│   ├── special-tools.toml    # Special tools parallel execution
+│   └── multi-modal.toml      # PDF and image reading
+├── fixtures/                 # Test fixtures
+│   ├── test.pdf              # PDF file for multi-modal tests
+│   └── test.gif              # GIF image for multi-modal tests
 ├── run.test.ts               # CLI run command
 ├── publish.test.ts           # CLI publish command
 ├── unpublish.test.ts         # CLI unpublish command
 ├── tag.test.ts               # CLI tag command
 ├── status.test.ts            # CLI status command
 ├── mixed-tools.test.ts       # Mixed tool calls (MCP + Delegate + Interactive)
 ├── parallel-mcp.test.ts      # Parallel MCP tool execution
+├── special-tools.test.ts     # Special tools (think, readPdfFile, readImageFile)
+├── multi-modal.test.ts       # PDF and image content verification
 ├── delegate-chain.test.ts    # Expert delegation chain
 └── continue-resume.test.ts   # --continue-run and --resume-from
 ```
@@ -51,24 +58,26 @@ e2e/
 
 Tests for CLI argument validation and error handling.
 
-| File | Tests | Coverage |
-|------|-------|----------|
-| run.test.ts | 4 | Missing args, nonexistent expert, invalid config |
-| publish.test.ts | 4 | dry-run success, nonexistent expert, config errors |
-| unpublish.test.ts | 2 | Missing version, missing --force |
-| tag.test.ts | 2 | Missing version, missing tags |
-| status.test.ts | 3 | Missing version/status, invalid status |
+| File              | Tests | Coverage                                           |
+| ----------------- | ----- | -------------------------------------------------- |
+| run.test.ts       | 4     | Missing args, nonexistent expert, invalid config   |
+| publish.test.ts   | 4     | dry-run success, nonexistent expert, config errors |
+| unpublish.test.ts | 2     | Missing version, missing --force                   |
+| tag.test.ts       | 2     | Missing version, missing tags                      |
+| status.test.ts    | 3     | Missing version/status, invalid status             |
 
 ### Runtime Features
 
 Tests for parallel tool calls, delegation, and state management.
 
-| File | Tests | Coverage |
-|------|-------|----------|
-| mixed-tools.test.ts | 4 | MCP + Delegate + Interactive in single response |
-| parallel-mcp.test.ts | 3 | Parallel MCP tool execution |
-| delegate-chain.test.ts | 3 | Multi-level delegation |
-| continue-resume.test.ts | 4 | --continue-run, --resume-from |
+| File                    | Tests | Coverage                                             |
+| ----------------------- | ----- | ---------------------------------------------------- |
+| mixed-tools.test.ts     | 4     | MCP + Delegate + Interactive in single response      |
+| parallel-mcp.test.ts    | 3     | Parallel MCP tool execution                          |
+| special-tools.test.ts   | 6     | think, readPdfFile, readImageFile parallel execution |
+| multi-modal.test.ts     | 2     | PDF and image content reading verification           |
+| delegate-chain.test.ts  | 3     | Multi-level delegation                               |
+| continue-resume.test.ts | 4     | --continue-run, --resume-from                        |
 
 ## Writing Tests
 
@@ -118,3 +127,14 @@ describe("Runtime feature", () => {
 - TUI-based commands (`start`) are excluded from E2E tests
 - API-calling tests (actual publish, unpublish) require registry access and are not included
 
+## Multi-Modal Test Verification (IMPORTANT)
+
+**Every time E2E tests are run, manually verify the multi-modal test output.**
+
+`multi-modal.test.ts` logs the LLM's summaries of the PDF and image fixtures to the console. Check that output to ensure:
+
+1. **PDF Summary**: Should describe the content of the Perstack GitHub README (features, license, etc.)
+2. **Image Description**: Should describe a terminal/CLI interface
+
+These logs confirm that the `readPdfFile` and `readImageFile` tools are correctly reading file contents. The automated assertions only check that at least one expected keyword appears, so human review is needed to confirm that the content was actually understood.
+
diff --git a/e2e/experts/multi-modal.toml b/e2e/experts/multi-modal.toml
@@ -0,0 +1,47 @@
+model = "claude-sonnet-4-5"
+temperature = 0.3
+# Top-level key: keep it above any [table] header, otherwise TOML scopes it to that table.
+envPath = [".env", ".env.local"]
+
+[provider]
+providerName = "anthropic"
+
+[experts."e2e-pdf-reader"]
+version = "1.0.0"
+description = "E2E test expert for PDF file reading"
+instruction = """
+You are a PDF content analyzer.
+
+When given a task to analyze a PDF file:
+1. Use readPdfFile to read the PDF at the specified path
+2. Summarize the content briefly
+3. Call attemptCompletion with your summary
+
+Be concise but include key details about what the document contains.
+"""
+
+[experts."e2e-pdf-reader".skills."@perstack/base"]
+type = "mcpStdioSkill"
+command = "npx"
+packageName = "@perstack/base"
+pick = ["attemptCompletion", "readPdfFile"]
+
+[experts."e2e-image-reader"]
+version = "1.0.0"
+description = "E2E test expert for image file reading"
+instruction = """
+You are an image content analyzer.
+
+When given a task to analyze an image file:
+1. Use readImageFile to read the image at the specified path
+2. Describe what you see in the image
+3. Call attemptCompletion with your description
+
+Be concise but include key visual details about what the image shows.
+"""
+
+[experts."e2e-image-reader".skills."@perstack/base"]
+type = "mcpStdioSkill"
+command = "npx"
+packageName = "@perstack/base"
+pick = ["attemptCompletion", "readImageFile"]
diff --git a/e2e/multi-modal.test.ts b/e2e/multi-modal.test.ts
@@ -0,0 +1,59 @@
+import { describe, expect, it } from "vitest"
+import { assertEventSequenceContains } from "./lib/assertions.js"
+import { runExpert } from "./lib/runner.js"
+
+describe("Multi-Modal File Reading", () => {
+  describe("PDF Reading", () => {
+    it("should read and summarize PDF content about perstack github", async () => {
+      const result = await runExpert(
+        "e2e-pdf-reader",
+        "Read and summarize the PDF at e2e/fixtures/test.pdf",
+        {
+          configPath: "./e2e/experts/multi-modal.toml",
+          timeout: 180000,
+        },
+      )
+      expect(result.exitCode).toBe(0)
+      expect(
+        assertEventSequenceContains(result.events, ["startRun", "callTools", "completeRun"]).passed,
+      ).toBe(true)
+      const completeEvent = result.events.find((e) => e.type === "completeRun")
+      expect(completeEvent).toBeDefined()
+      const text = completeEvent && "text" in completeEvent ? (completeEvent.text as string) : ""
+      console.log("\n=== PDF Summary ===\n", text, "\n=== END ===\n")
+      expect(
+        text.toLowerCase().includes("perstack") ||
+          text.toLowerCase().includes("github") ||
+          text.toLowerCase().includes("repository"),
+      ).toBe(true)
+    }, 200000)
+  })
+
+  describe("Image Reading", () => {
+    it("should read and describe image content about perstack demo", async () => {
+      const result = await runExpert(
+        "e2e-image-reader",
+        "Read and describe the image at e2e/fixtures/test.gif",
+        {
+          configPath: "./e2e/experts/multi-modal.toml",
+          timeout: 180000,
+        },
+      )
+      expect(result.exitCode).toBe(0)
+      expect(
+        assertEventSequenceContains(result.events, ["startRun", "callTools", "completeRun"]).passed,
+      ).toBe(true)
+      const completeEvent = result.events.find((e) => e.type === "completeRun")
+      expect(completeEvent).toBeDefined()
+      const text = completeEvent && "text" in completeEvent ? (completeEvent.text as string) : ""
+      console.log("\n=== Image Description ===\n", text, "\n=== END ===\n")
+      expect(
+        text.toLowerCase().includes("perstack") ||
+          text.toLowerCase().includes("demo") ||
+          text.toLowerCase().includes("terminal") ||
+          text.toLowerCase().includes("cli") ||
+          text.toLowerCase().includes("interface"),
+      ).toBe(true)
+    }, 200000)
+  })
+})
diff --git a/packages/core/src/schemas/runtime.ts b/packages/core/src/schemas/runtime.ts
@@ -247,12 +247,6 @@ type ExpertEventPayloads = {
   resolveThought: {
     toolResult: ToolResult
   }
-  resolvePdfFile: {
-    toolResult: ToolResult
-  }
-  resolveImageFile: {
-    toolResult: ToolResult
-  }
   attemptCompletion: {
     toolResult: ToolResult
   }
@@ -343,8 +337,6 @@ export const callInteractiveTool = createEvent("callInteractiveTool")
 export const callDelegate = createEvent("callDelegate")
 export const resolveToolResults = createEvent("resolveToolResults")
 export const resolveThought = createEvent("resolveThought")
-export const resolvePdfFile = createEvent("resolvePdfFile")
-export const resolveImageFile = createEvent("resolveImageFile")
 export const attemptCompletion = createEvent("attemptCompletion")
 export const finishToolCall = createEvent("finishToolCall")
 export const resumeToolCalls = createEvent("resumeToolCalls")

diff --git a/packages/perstack/src/lib/tui.tsx b/packages/perstack/src/lib/tui.tsx
@@ -106,14 +106,6 @@ export function defaultEventListener(e: RunEvent): void {
       log(`${header(e)} Resolved Thought:`, e.toolResult)
       break
     }
-    case "resolvePdfFile": {
-      log(`${header(e)} Resolved PDF:`, e.toolResult)
-      break
-    }
-    case "resolveImageFile": {
-      log(`${header(e)} Resolved Image:`, e.toolResult)
-      break
-    }
     case "attemptCompletion": {
       log(`${header(e)} Attempting completion`)
       break

diff --git a/packages/runtime/README.md b/packages/runtime/README.md
@@ -193,16 +193,12 @@ stateDiagram-v2
 
     CallingTools --> ResolvingToolResults: resolveToolResults
     CallingTools --> ResolvingThought: resolveThought
-    CallingTools --> ResolvingPdfFile: resolvePdfFile
-    CallingTools --> ResolvingImageFile: resolveImageFile
     CallingTools --> GeneratingRunResult: attemptCompletion
     CallingTools --> CallingDelegate: callDelegate
     CallingTools --> CallingInteractiveTool: callInteractiveTool
 
     ResolvingToolResults --> FinishingStep: finishToolCall
     ResolvingThought --> FinishingStep: finishToolCall
-    ResolvingPdfFile --> FinishingStep: finishToolCall
-    ResolvingImageFile --> FinishingStep: finishToolCall
 
     GeneratingRunResult --> Stopped: completeRun
     GeneratingRunResult --> FinishingStep: retry
@@ -219,7 +215,7 @@ Events trigger state transitions. They are emitted by the runtime logic or exter
 
 - **Lifecycle**: `startRun`, `startGeneration`, `continueToNextStep`, `completeRun`
 - **Tool Execution**: `callTools`, `resolveToolResults`, `finishToolCall`, `resumeToolCalls`, `finishAllToolCalls`
-- **Special Types**: `resolveThought`, `resolvePdfFile`, `resolveImageFile`
+- **Special Types**: `resolveThought`
 - **Mixed Tool Calls**: `callDelegate`, `callInteractiveTool` (from CallingTools state)
 - **Interruption**: `stopRunByInteractiveTool`, `stopRunByDelegate`, `stopRunByExceededMaxSteps`
 - **Error Handling**: `retry`

diff --git a/packages/runtime/src/runtime-state-machine.ts b/packages/runtime/src/runtime-state-machine.ts
@@ -10,8 +10,6 @@ import { generatingRunResultLogic } from "./states/generating-run-result.js"
 import { generatingToolCallLogic } from "./states/generating-tool-call.js"
 import { initLogic } from "./states/init.js"
 import { preparingForStepLogic } from "./states/preparing-for-step.js"
-import { resolvingImageFileLogic } from "./states/resolving-image-file.js"
-import { resolvingPdfFileLogic } from "./states/resolving-pdf-file.js"
 import { resolvingThoughtLogic } from "./states/resolving-thought.js"
 import { resolvingToolResultLogic } from "./states/resolving-tool-result.js"
 import { createEmptyUsage, sumUsage } from "./usage.js"
@@ -237,26 +235,6 @@ export const runtimeStateMachine = setup({
               }) satisfies Step,
           }),
         },
-        resolvePdfFile: {
-          target: "ResolvingPdfFile",
-          actions: assign({
-            step: ({ context, event }) =>
-              ({
-                ...context.step,
-                toolResults: [event.toolResult],
-              }) satisfies Step,
-          }),
-        },
-        resolveImageFile: {
-          target: "ResolvingImageFile",
-          actions: assign({
-            step: ({ context, event }) =>
-              ({
-                ...context.step,
-                toolResults: [event.toolResult],
-              }) satisfies Step,
-          }),
-        },
         attemptCompletion: {
           target: "GeneratingRunResult",
           actions: assign({
@@ -336,46 +314,6 @@ export const runtimeStateMachine = setup({
       },
     },
 
-    ResolvingPdfFile: {
-      on: {
-        finishToolCall: {
-          target: "FinishingStep",
-          actions: assign({
-            checkpoint: ({ context, event }) =>
-              ({
-                ...context.checkpoint,
-                messages: [...context.checkpoint.messages, ...event.newMessages],
-              }) satisfies Checkpoint,
-            step: ({ context, event }) =>
-              ({
-                ...context.step,
-                newMessages: [...context.step.newMessages, ...event.newMessages],
-              }) satisfies Step,
-          }),
-        },
-      },
-    },
-
-    ResolvingImageFile: {
-      on: {
-        finishToolCall: {
-          target: "FinishingStep",
-          actions: assign({
-            checkpoint: ({ context, event }) =>
-              ({
-                ...context.checkpoint,
-                messages: [...context.checkpoint.messages, ...event.newMessages],
-              }) satisfies Checkpoint,
-            step: ({ context, event }) =>
-              ({
-                ...context.step,
-                newMessages: [...context.step.newMessages, ...event.newMessages],
-              }) satisfies Step,
-          }),
-        },
-      },
-    },
-
     GeneratingRunResult: {
       on: {
         retry: {
@@ -482,8 +420,6 @@ export const StateMachineLogics: Record<
   CallingTool: callingToolLogic,
   ResolvingToolResult: resolvingToolResultLogic,
   ResolvingThought: resolvingThoughtLogic,
-  ResolvingPdfFile: resolvingPdfFileLogic,
-  ResolvingImageFile: resolvingImageFileLogic,
   GeneratingRunResult: generatingRunResultLogic,
   CallingInteractiveTool: callingInteractiveToolLogic,
   CallingDelegate: callingDelegateLogic,