Commit a93cfcd

✨ feat: support nano banana pro (#10413)
* fix nanobanana
* add types
* complete fetch SSE and Google AI-side conversion
* thinking
* ui for part render
* support image in thinking
* fix issue
* support convert content part
* support nano banana pro image generation
* fix tests
* fix tests
1 parent b78f24c · commit a93cfcd

21 files changed (+1559 −178 lines)


packages/context-engine/src/processors/MessageContent.ts

Lines changed: 100 additions & 6 deletions
@@ -1,4 +1,5 @@
 import { filesPrompts } from '@lobechat/prompts';
+import { MessageContentPart } from '@lobechat/types';
 import { imageUrlToBase64 } from '@lobechat/utils/imageToBase64';
 import { parseDataUri } from '@lobechat/utils/uriParser';
 import { isDesktopLocalStaticServerUrl } from '@lobechat/utils/url';
@@ -9,6 +10,23 @@ import type { PipelineContext, ProcessorOptions } from '../types';

 const log = debug('context-engine:processor:MessageContentProcessor');

+/**
+ * Deserialize content string to message content parts
+ * Returns null if content is not valid JSON array of parts
+ */
+const deserializeParts = (content: string): MessageContentPart[] | null => {
+  try {
+    const parsed = JSON.parse(content);
+    // Validate it's an array with valid part structure
+    if (Array.isArray(parsed) && parsed.length > 0 && parsed[0]?.type) {
+      return parsed as MessageContentPart[];
+    }
+  } catch {
+    // Not JSON, treat as plain text
+  }
+  return null;
+};
+
 export interface FileContextConfig {
   /** Whether to enable file context injection */
   enabled?: boolean;
@@ -30,6 +48,7 @@ export interface MessageContentConfig {
 }

 export interface UserMessageContentPart {
+  googleThoughtSignature?: string;
   image_url?: {
     detail?: string;
     url: string;
@@ -213,7 +232,7 @@ export class MessageContentProcessor extends BaseProcessor {
    * Process assistant message content
    */
   private async processAssistantMessage(message: any): Promise<any> {
-    // Check if there is reasoning content (thinking mode)
+    // Priority 1: Check if there is reasoning content with signature (thinking mode)
     const shouldIncludeThinking = message.reasoning && !!message.reasoning?.signature;

     if (shouldIncludeThinking) {
@@ -235,7 +254,59 @@
       };
     }

-    // Check if there are images (assistant messages may also contain images)
+    // Priority 2: Check if reasoning content is multimodal
+    const hasMultimodalReasoning = message.reasoning?.isMultimodal && message.reasoning?.content;
+
+    if (hasMultimodalReasoning) {
+      const reasoningParts = deserializeParts(message.reasoning.content);
+      if (reasoningParts) {
+        // Convert reasoning multimodal parts to plain text
+        const reasoningText = reasoningParts
+          .map((part) => {
+            if (part.type === 'text') return part.text;
+            if (part.type === 'image') return `[Image: ${part.image}]`;
+            return '';
+          })
+          .join('\n');
+
+        // Update reasoning to plain text
+        const updatedMessage = {
+          ...message,
+          reasoning: {
+            ...message.reasoning,
+            content: reasoningText,
+            isMultimodal: false, // Convert to non-multimodal
+          },
+        };
+
+        // Handle main content based on whether it's multimodal
+        if (message.metadata?.isMultimodal && message.content) {
+          const contentParts = deserializeParts(message.content);
+          if (contentParts) {
+            const convertedParts = this.convertMessagePartsToContentParts(contentParts);
+            return {
+              ...updatedMessage,
+              content: convertedParts,
+            };
+          }
+        }
+
+        return updatedMessage;
+      }
+    }
+
+    // Priority 3: Check if message content is multimodal
+    const hasMultimodalContent = message.metadata?.isMultimodal && message.content;
+
+    if (hasMultimodalContent) {
+      const parts = deserializeParts(message.content);
+      if (parts) {
+        const contentParts = this.convertMessagePartsToContentParts(parts);
+        return { ...message, content: contentParts };
+      }
+    }
+
+    // Priority 4: Check if there are images (legacy imageList field)
     const hasImages = message.imageList && message.imageList.length > 0;

     if (hasImages && this.config.isCanUseVision?.(this.config.model, this.config.provider)) {
@@ -253,10 +324,7 @@
       const imageContentParts = await this.processImageList(message.imageList || []);
       contentParts.push(...imageContentParts);

-      return {
-        ...message,
-        content: contentParts,
-      };
+      return { ...message, content: contentParts };
     }

     // Regular assistant message, return plain text content
@@ -266,6 +334,32 @@
     };
   }

+  /**
+   * Convert MessageContentPart[] (internal format) to OpenAI-compatible UserMessageContentPart[]
+   */
+  private convertMessagePartsToContentParts(parts: MessageContentPart[]): UserMessageContentPart[] {
+    const contentParts: UserMessageContentPart[] = [];
+
+    for (const part of parts) {
+      if (part.type === 'text') {
+        contentParts.push({
+          googleThoughtSignature: part.thoughtSignature,
+          text: part.text,
+          type: 'text',
+        });
+      } else if (part.type === 'image') {
+        // Images are already in S3 URL format, no conversion needed
+        contentParts.push({
+          googleThoughtSignature: part.thoughtSignature,
+          image_url: { detail: 'auto', url: part.image },
+          type: 'image_url',
+        });
+      }
+    }
+
+    return contentParts;
+  }
+
   /**
    * Process image list
    */
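For quick reference, here is a minimal, self-contained sketch of the conversion path this diff adds. The part shapes are trimmed to just the fields the diff touches (the real definitions live in `@lobechat/types`), `toContentParts` is a renamed stand-in for the private class method, and the sample message is hypothetical:

```ts
// Trimmed-down part shapes, assumed from the diff above.
type MessageContentPart =
  | { type: 'text'; text: string; thoughtSignature?: string }
  | { type: 'image'; image: string; thoughtSignature?: string };

interface UserMessageContentPart {
  googleThoughtSignature?: string;
  image_url?: { detail?: string; url: string };
  text?: string;
  type: 'text' | 'image_url';
}

// Mirrors deserializeParts: JSON.parse plus a light structural check.
const deserializeParts = (content: string): MessageContentPart[] | null => {
  try {
    const parsed = JSON.parse(content);
    if (Array.isArray(parsed) && parsed.length > 0 && parsed[0]?.type) {
      return parsed as MessageContentPart[];
    }
  } catch {
    // Not JSON: the caller falls back to treating content as plain text
  }
  return null;
};

// Mirrors convertMessagePartsToContentParts: internal parts become
// OpenAI-style parts, carrying Google thought signatures through.
const toContentParts = (parts: MessageContentPart[]): UserMessageContentPart[] =>
  parts.map((part) =>
    part.type === 'text'
      ? { googleThoughtSignature: part.thoughtSignature, text: part.text, type: 'text' as const }
      : {
          googleThoughtSignature: part.thoughtSignature,
          image_url: { detail: 'auto', url: part.image },
          type: 'image_url' as const,
        },
  );

// Hypothetical serialized assistant content, as stored when metadata.isMultimodal is true.
const stored = JSON.stringify([
  { type: 'text', text: 'Here is an image:' },
  { type: 'image', image: 'https://s3.example.com/image.png' },
]);

const parts = deserializeParts(stored);
if (parts) console.log(toContentParts(parts));
// [
//   { googleThoughtSignature: undefined, text: 'Here is an image:', type: 'text' },
//   { googleThoughtSignature: undefined,
//     image_url: { detail: 'auto', url: 'https://s3.example.com/image.png' },
//     type: 'image_url' },
// ]
```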

packages/context-engine/src/processors/__tests__/MessageContent.test.ts

Lines changed: 239 additions & 0 deletions
@@ -566,4 +566,243 @@ describe('MessageContentProcessor', () => {
       expect(content[2].video_url.url).toBe('http://example.com/video.mp4');
     });
   });
+
+  describe('Multimodal message content processing', () => {
+    it('should convert assistant message with metadata.isMultimodal to OpenAI format', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: JSON.stringify([
+            { type: 'text', text: 'Here is an image:' },
+            { type: 'image', image: 'https://s3.example.com/image.png' },
+            { type: 'text', text: 'What do you think?' },
+          ]),
+          metadata: {
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        content: [
+          { type: 'text', text: 'Here is an image:' },
+          {
+            type: 'image_url',
+            image_url: { detail: 'auto', url: 'https://s3.example.com/image.png' },
+          },
+          { type: 'text', text: 'What do you think?' },
+        ],
+      });
+    });
+
+    it('should convert assistant message with reasoning.isMultimodal to plain text', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: 'The answer is correct.',
+          reasoning: {
+            content: JSON.stringify([
+              { type: 'text', text: 'Let me analyze this image:' },
+              { type: 'image', image: 'https://s3.example.com/reasoning-image.png' },
+              { type: 'text', text: 'Based on the analysis...' },
+            ]),
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        reasoning: {
+          content:
+            'Let me analyze this image:\n[Image: https://s3.example.com/reasoning-image.png]\nBased on the analysis...',
+          isMultimodal: false,
+        },
+        content: 'The answer is correct.',
+      });
+    });
+
+    it('should handle both reasoning.isMultimodal and metadata.isMultimodal', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: JSON.stringify([
+            { type: 'text', text: 'Final result:' },
+            { type: 'image', image: 'https://s3.example.com/result.png' },
+          ]),
+          metadata: {
+            isMultimodal: true,
+          },
+          reasoning: {
+            content: JSON.stringify([
+              { type: 'text', text: 'Thinking about:' },
+              { type: 'image', image: 'https://s3.example.com/thinking.png' },
+            ]),
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        reasoning: {
+          content: 'Thinking about:\n[Image: https://s3.example.com/thinking.png]',
+          isMultimodal: false,
+        },
+        content: [
+          { type: 'text', text: 'Final result:' },
+          {
+            type: 'image_url',
+            image_url: { detail: 'auto', url: 'https://s3.example.com/result.png' },
+          },
+        ],
+      });
+    });
+
+    it('should prioritize reasoning.signature over reasoning.isMultimodal', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: 'The answer.',
+          reasoning: {
+            content: 'Some thinking process',
+            signature: 'sig123',
+            // Even if isMultimodal is true, signature takes priority
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        content: [
+          {
+            type: 'thinking',
+            thinking: 'Some thinking process',
+            signature: 'sig123',
+          },
+          { type: 'text', text: 'The answer.' },
+        ],
+      });
+    });
+
+    it('should handle plain text when isMultimodal is true but content is not valid JSON', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: 'This is plain text, not JSON',
+          metadata: {
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        content: 'This is plain text, not JSON',
+      });
+    });
+
+    it('should preserve thoughtSignature in multimodal content parts', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: JSON.stringify([
+            { type: 'text', text: 'Analysis result:', thoughtSignature: 'sig-001' },
+            { type: 'image', image: 'https://s3.example.com/chart.png', thoughtSignature: 'sig-002' },
+            { type: 'text', text: 'Conclusion' },
+          ]),
+          metadata: {
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        content: [
+          { type: 'text', text: 'Analysis result:', googleThoughtSignature: 'sig-001' },
+          {
+            type: 'image_url',
+            image_url: { detail: 'auto', url: 'https://s3.example.com/chart.png' },
+            googleThoughtSignature: 'sig-002',
+          },
+          { type: 'text', text: 'Conclusion' },
+        ],
+      });
+    });
+  });
 });
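The reasoning-side behavior these tests pin down is simpler than the content-side conversion: multimodal reasoning is flattened to plain text rather than converted to parts, with each image replaced by an `[Image: <url>]` marker. A minimal sketch of that flattening, using the same trimmed part shape as the earlier example:

```ts
type MessageContentPart =
  | { type: 'text'; text: string }
  | { type: 'image'; image: string };

// Mirrors the Priority 2 branch in processAssistantMessage: text parts are
// kept verbatim, image parts become "[Image: url]" markers, joined by newlines.
const flattenReasoning = (parts: MessageContentPart[]): string =>
  parts
    .map((part) => {
      if (part.type === 'text') return part.text;
      if (part.type === 'image') return `[Image: ${part.image}]`;
      return '';
    })
    .join('\n');

// Reproduces the expectation in the reasoning.isMultimodal test above.
console.log(
  flattenReasoning([
    { type: 'text', text: 'Let me analyze this image:' },
    { type: 'image', image: 'https://s3.example.com/reasoning-image.png' },
    { type: 'text', text: 'Based on the analysis...' },
  ]),
);
// Let me analyze this image:
// [Image: https://s3.example.com/reasoning-image.png]
// Based on the analysis...
```

Flattening rather than forwarding image parts presumably keeps replayed reasoning compatible with providers whose thinking channel only accepts text, though the commit itself does not state this rationale.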
