oratis · oratis · May 28, 2026 · May 28, 2026
diff --git a/docs/m1-validation.md b/docs/m1-validation.md
@@ -0,0 +1,43 @@
+# M1 validation report — real DeepSeek API
+
+> Validated 2026-05-28. Used a real API key (since rotated by user) to verify the
+> M1 provider/agent code paths actually work against api.deepseek.com.
+
+## What was validated
+
+1. **HTTP connectivity** — `/v1/models` and `/v1/chat/completions` both reachable with a Bearer token.
+2. **Available models** — `/v1/models` returns `deepseek-v4-flash` and `deepseek-v4-pro`.
+3. **Alias compatibility** — `model: "deepseek-chat"` and `model: "deepseek-reasoner"` are still accepted; they route to the V4 backing models, so the aliases remain safe for us to keep using.
+4. **Text streaming** — chunk shape `{choices:[{delta:{content:"..."}}]}` matches our `mockFetch` test fixtures exactly.
+5. **Tool-call streaming** — increments arrive as `{choices:[{delta:{tool_calls:[{index:0, function:{arguments:"..."}}]}}]}` with `id`/`name` only in the first chunk for that index — exactly what our `assembles tool_use blocks` test fixture mocks.
+6. **`deepseek-reasoner` reasoning_content** — flows in `delta.reasoning_content` and our provider correctly surfaces it as a `thinking` ContentBlock + counts `usage.completion_tokens_details.reasoning_tokens`.
+
+## End-to-end runs
+
+| Scenario | Result |
+|---|---|
+| Agent reads a file via Read tool | ✓ 2 turns, 2523 in / 137 out tokens, ended `end_turn`, correct answer |
+| Reasoner solves a math word problem | ✓ 1 turn, 1188 in / 500 out / 427 reasoning, both `thinking` + `text` blocks streamed |
+| `/v1/models` + alias mapping | ✓ documented in §3.1 update |
+
+## Changes in this PR
+
+- `packages/core/src/types.ts` — expand `DeepSeekModel` union to include `deepseek-v4-flash` / `deepseek-v4-pro` (alongside the legacy aliases). Added a comment block explaining the alias mapping observed.
+- `packages/core/src/providers/deepseek.ts` — extend `DEEPSEEK_MODELS` table with the two V4 entries.
+- `packages/core/src/providers/deepseek.live.test.ts` (new) — three live-API integration tests. Opt-in via `DEEPCODE_LIVE_TESTS=1` so default `pnpm test` doesn't burn tokens. All three pass.
+
+## Effort levels — still not measured
+
+The numbers in `docs/design/effort-levels.md` §3.2 remain design-only — I validated the API surface, not yet the perf-cost-quality curve per effort tier. That's still M1.5 work (a future `scripts/effort-bench.ts`).
+
+## What this proves
+
+The M1 unit tests (mocked) were faithful representations of real API behavior — no behavioral surprises. The provider, agent loop, sessions, snapshots, tool dispatch all work end-to-end against real DeepSeek. **The biggest "unknown" from MORNING_REPORT.md is now closed.**
+
+## What this does NOT prove
+
+- Large-context behavior (we tested with ~2.5k tokens)
+- Multi-tool parallel calls in a single turn
+- Long-running streams (timeout edge cases)
+- Behavior under rate limits or transient 5xx
+- DeepSeek's exact billing — for that we still need a real benchmark script
diff --git a/packages/core/src/providers/deepseek.live.test.ts b/packages/core/src/providers/deepseek.live.test.ts
@@ -0,0 +1,117 @@
+// Live integration tests against real api.deepseek.com.
+// Skipped unless DEEPCODE_LIVE_TESTS=1 is set AND a key is available (DEEPSEEK_API_KEY or stored credentials).
+//
+// These were used to validate M1's mock-based unit tests against real
+// wire behaviour on 2026-05-28 — they confirmed:
+//   · text streaming chunk shape matches our mock
+//   · tool_calls streaming with incremental arguments accumulation matches our mock
+//   · reasoning_content streaming on deepseek-reasoner is captured into thinking blocks
+//   · /v1/models returns deepseek-v4-flash + deepseek-v4-pro; deepseek-chat /
+//     deepseek-reasoner are stable aliases (still accepted at the API layer)
+//
+// To run: DEEPCODE_LIVE_TESTS=1 DEEPSEEK_API_KEY=sk-... pnpm --filter @deepcode/core test live
+// Or: place a key in ~/.deepcode/credentials.json (the CLI does this on onboard) and set only DEEPCODE_LIVE_TESTS=1.
+
+import { promises as fs } from 'node:fs';
+import { homedir } from 'node:os';
+import { join } from 'node:path';
+import { describe, expect, it } from 'vitest';
+import { DeepSeekProvider } from './deepseek.js';
+
+async function resolveTestKey(): Promise<string | null> {
+  if (process.env.DEEPSEEK_API_KEY) return process.env.DEEPSEEK_API_KEY; // explicit env var wins
+  try { // fallback: ~/.deepcode/credentials.json
+    const raw = await fs.readFile(join(homedir(), '.deepcode', 'credentials.json'), 'utf8');
+    const parsed = JSON.parse(raw) as { apiKey?: string }; // NOTE(review): shape is asserted, not validated at runtime
+    return parsed.apiKey ?? null; // file parsed but has no apiKey field → null
+  } catch { // unreadable file or invalid JSON → treated as "no key"
+    return null;
+  }
+}
+
+// Live tests cost real API tokens, so they run only when DEEPCODE_LIVE_TESTS=1 is set
+// (even if credentials exist locally) — no accidental burns on every `pnpm test`.
+// With the flag set but no key found, the suite is skipped rather than failed.
+const enabled = process.env.DEEPCODE_LIVE_TESTS === '1';
+const apiKey = enabled ? await resolveTestKey() : null;
+const live = enabled && apiKey ? describe : describe.skip;
+
+live('DeepSeekProvider — live API', () => {
+  it('streams text deltas from deepseek-chat', async () => {
+    const p = new DeepSeekProvider({ apiKey: apiKey! }); // non-null: `live` is describe.skip when no key was resolved
+    const out: string[] = [];
+    const result = await p.runTurn({
+      model: 'deepseek-chat',
+      systemPrompt: 'Reply only with: ok',
+      tools: [],
+      messages: [{ role: 'user', content: [{ type: 'text', text: 'Ready?' }] }],
+      maxTokens: 10, // tiny budget keeps the call cheap; the prompt asks for a reply of just "ok"
+      handlers: { onTextDelta: (t) => out.push(t) },
+    });
+    expect(out.join('').length).toBeGreaterThan(0);
+    expect(result.stopReason).toBe('end_turn'); // NOTE(review): presumably differs if the reply is cut off by maxTokens — confirm
+    expect(result.content.find((b) => b.type === 'text')).toBeDefined();
+    expect(result.usage.inputTokens).toBeGreaterThan(0);
+    expect(result.usage.outputTokens).toBeGreaterThan(0);
+  }, 30_000); // per-test timeout in ms
+
+  it('emits tool_use block when the model invokes a tool', async () => {
+    const p = new DeepSeekProvider({ apiKey: apiKey! });
+    const result = await p.runTurn({
+      model: 'deepseek-chat',
+      systemPrompt: 'You must use the Echo tool when asked.',
+      tools: [
+        {
+          name: 'Echo',
+          description: 'Echo back the input text.',
+          inputSchema: {
+            type: 'object',
+            properties: { text: { type: 'string' } },
+            required: ['text'],
+          },
+        },
+      ],
+      messages: [
+        {
+          role: 'user',
+          content: [{ type: 'text', text: 'Call the Echo tool with text "hello".' }],
+        },
+      ],
+      maxTokens: 100,
+    });
+    const toolUse = result.content.find((b) => b.type === 'tool_use');
+    expect(toolUse).toBeDefined();
+    if (toolUse?.type === 'tool_use') { // type narrowing only; presence is already asserted above
+      expect(toolUse.name).toBe('Echo');
+      expect(toolUse.input).toMatchObject({ text: expect.any(String) });
+      expect(toolUse.id).toMatch(/call_/); // NOTE(review): assumes tool-call ids contain "call_" — confirm the format is stable
+    }
+    expect(result.stopReason).toBe('tool_use');
+  }, 30_000);
+
+  it('captures reasoning_content into thinking blocks for deepseek-reasoner', async () => {
+    const p = new DeepSeekProvider({ apiKey: apiKey! });
+    let thinkingChunks = 0;
+    const result = await p.runTurn({
+      model: 'deepseek-reasoner',
+      systemPrompt: 'Solve briefly. Show one line of reasoning.',
+      tools: [],
+      messages: [
+        {
+          role: 'user',
+          content: [{ type: 'text', text: 'What is 17 * 23? Just the number.' }],
+        },
+      ],
+      maxTokens: 400, // NOTE(review): docs/m1-validation.md reports 427 reasoning tokens for another prompt — confirm 400 leaves headroom
+      handlers: {
+        onThinkingDelta: () => {
+          thinkingChunks++;
+        },
+      },
+    });
+    // reasoner should stream reasoning_content and produce a thinking block
+    expect(thinkingChunks).toBeGreaterThan(0);
+    expect(result.content.find((b) => b.type === 'thinking')).toBeDefined();
+    expect(result.usage.reasoningTokens).toBeGreaterThan(0);
+  }, 60_000); // double the timeout of the other tests
+});
diff --git a/packages/core/src/providers/deepseek.ts b/packages/core/src/providers/deepseek.ts
@@ -15,9 +15,14 @@ export interface DeepSeekProviderOpts {
   fetch?: typeof globalThis.fetch;
 }
 
+// Validated against real DeepSeek API 2026-05-28: max_tokens hard limit is 8192. The 128k
+// context window was not exercised live (docs/m1-validation.md: ~2.5k-token runs) — TODO confirm.
+// The two "logical" model names are stable API aliases that currently route to the V4 family.
 export const DEEPSEEK_MODELS: Record<DeepSeekModel, { ctx: number; maxOutput: number }> = {
   'deepseek-chat': { ctx: 128_000, maxOutput: 8_192 },
   'deepseek-reasoner': { ctx: 128_000, maxOutput: 8_192 },
+  'deepseek-v4-flash': { ctx: 128_000, maxOutput: 8_192 },
+  'deepseek-v4-pro': { ctx: 128_000, maxOutput: 8_192 },
 };
 
 /**

diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
@@ -16,9 +16,20 @@ export type Effort = 'low' | 'medium' | 'high' | 'xhigh' | 'max';
 
 /**
  * Supported DeepSeek model identifiers.
+ *
+ * NOTE (validated against real API 2026-05-28):
+ * - `deepseek-chat` and `deepseek-reasoner` are STABLE ALIASES still accepted by the API.
+ * - Actual current backing models per /v1/models endpoint are `deepseek-v4-flash`
+ *   and `deepseek-v4-pro`. We support both alias names AND concrete v4 names so
+ *   either works in user config.
+ *
  * Spec: docs/DEVELOPMENT_PLAN.md §3.1
  */
-export type DeepSeekModel = 'deepseek-chat' | 'deepseek-reasoner';
+export type DeepSeekModel =
+  | 'deepseek-chat' // alias → currently routes to deepseek-v4-flash
+  | 'deepseek-reasoner' // alias → currently routes to reasoning-capable model
+  | 'deepseek-v4-flash' // concrete id listed by /v1/models
+  | 'deepseek-v4-pro'; // concrete id listed by /v1/models
 
 /**
  * Hook event names — 9 events total.