Commit a93cfcd

✨ feat: support nano banana pro (#10413)
* fix nanobanana
* add types
* complete fetch SSE and Google AI-side conversion
* thinking
* ui for part render
* support image in thinking
* fix issue
* support convert content part
* support nano banana pro image generation
* fix tests
* fix tests
1 parent b78f24c · commit a93cfcd

21 files changed (+1559 −178 lines)


packages/context-engine/src/processors/MessageContent.ts

Lines changed: 100 additions & 6 deletions
@@ -1,4 +1,5 @@
 import { filesPrompts } from '@lobechat/prompts';
+import { MessageContentPart } from '@lobechat/types';
 import { imageUrlToBase64 } from '@lobechat/utils/imageToBase64';
 import { parseDataUri } from '@lobechat/utils/uriParser';
 import { isDesktopLocalStaticServerUrl } from '@lobechat/utils/url';
@@ -9,6 +10,23 @@ import type { PipelineContext, ProcessorOptions } from '../types';

 const log = debug('context-engine:processor:MessageContentProcessor');

+/**
+ * Deserialize content string to message content parts
+ * Returns null if content is not valid JSON array of parts
+ */
+const deserializeParts = (content: string): MessageContentPart[] | null => {
+  try {
+    const parsed = JSON.parse(content);
+    // Validate it's an array with valid part structure
+    if (Array.isArray(parsed) && parsed.length > 0 && parsed[0]?.type) {
+      return parsed as MessageContentPart[];
+    }
+  } catch {
+    // Not JSON, treat as plain text
+  }
+  return null;
+};
+
 export interface FileContextConfig {
   /** Whether to enable file context injection */
   enabled?: boolean;
@@ -30,6 +48,7 @@ export interface MessageContentConfig {
 }

 export interface UserMessageContentPart {
+  googleThoughtSignature?: string;
   image_url?: {
     detail?: string;
     url: string;
@@ -213,7 +232,7 @@ export class MessageContentProcessor extends BaseProcessor {
    * Process assistant message content
    */
   private async processAssistantMessage(message: any): Promise<any> {
-    // Check if there is reasoning content (thinking mode)
+    // Priority 1: Check if there is reasoning content with signature (thinking mode)
     const shouldIncludeThinking = message.reasoning && !!message.reasoning?.signature;

     if (shouldIncludeThinking) {
@@ -235,7 +254,59 @@
       };
     }

-    // Check if there are images (assistant messages may also contain images)
+    // Priority 2: Check if reasoning content is multimodal
+    const hasMultimodalReasoning = message.reasoning?.isMultimodal && message.reasoning?.content;
+
+    if (hasMultimodalReasoning) {
+      const reasoningParts = deserializeParts(message.reasoning.content);
+      if (reasoningParts) {
+        // Convert reasoning multimodal parts to plain text
+        const reasoningText = reasoningParts
+          .map((part) => {
+            if (part.type === 'text') return part.text;
+            if (part.type === 'image') return `[Image: ${part.image}]`;
+            return '';
+          })
+          .join('\n');
+
+        // Update reasoning to plain text
+        const updatedMessage = {
+          ...message,
+          reasoning: {
+            ...message.reasoning,
+            content: reasoningText,
+            isMultimodal: false, // Convert to non-multimodal
+          },
+        };
+
+        // Handle main content based on whether it's multimodal
+        if (message.metadata?.isMultimodal && message.content) {
+          const contentParts = deserializeParts(message.content);
+          if (contentParts) {
+            const convertedParts = this.convertMessagePartsToContentParts(contentParts);
+            return {
+              ...updatedMessage,
+              content: convertedParts,
+            };
+          }
+        }
+
+        return updatedMessage;
+      }
+    }
+
+    // Priority 3: Check if message content is multimodal
+    const hasMultimodalContent = message.metadata?.isMultimodal && message.content;
+
+    if (hasMultimodalContent) {
+      const parts = deserializeParts(message.content);
+      if (parts) {
+        const contentParts = this.convertMessagePartsToContentParts(parts);
+        return { ...message, content: contentParts };
+      }
+    }
+
+    // Priority 4: Check if there are images (legacy imageList field)
     const hasImages = message.imageList && message.imageList.length > 0;

     if (hasImages && this.config.isCanUseVision?.(this.config.model, this.config.provider)) {
@@ -253,10 +324,7 @@
       const imageContentParts = await this.processImageList(message.imageList || []);
       contentParts.push(...imageContentParts);

-      return {
-        ...message,
-        content: contentParts,
-      };
+      return { ...message, content: contentParts };
     }

     // Regular assistant message, return plain text content
@@ -266,6 +334,32 @@
     };
   }

+  /**
+   * Convert MessageContentPart[] (internal format) to OpenAI-compatible UserMessageContentPart[]
+   */
+  private convertMessagePartsToContentParts(parts: MessageContentPart[]): UserMessageContentPart[] {
+    const contentParts: UserMessageContentPart[] = [];
+
+    for (const part of parts) {
+      if (part.type === 'text') {
+        contentParts.push({
+          googleThoughtSignature: part.thoughtSignature,
+          text: part.text,
+          type: 'text',
+        });
+      } else if (part.type === 'image') {
+        // Images are already in S3 URL format, no conversion needed
+        contentParts.push({
+          googleThoughtSignature: part.thoughtSignature,
+          image_url: { detail: 'auto', url: part.image },
+          type: 'image_url',
+        });
+      }
+    }
+
+    return contentParts;
+  }
+
   /**
    * Process image list
    */
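For quick reference, here is a minimal, self-contained sketch of the conversion path this diff adds. The part shapes are trimmed to just the fields the diff touches (the real definitions live in `@lobechat/types`), `toContentParts` is a renamed stand-in for the private class method, and the sample message is hypothetical:

```ts
// Trimmed-down part shapes, assumed from the diff above.
type MessageContentPart =
  | { type: 'text'; text: string; thoughtSignature?: string }
  | { type: 'image'; image: string; thoughtSignature?: string };

interface UserMessageContentPart {
  googleThoughtSignature?: string;
  image_url?: { detail?: string; url: string };
  text?: string;
  type: 'text' | 'image_url';
}

// Mirrors deserializeParts: JSON.parse plus a light structural check.
const deserializeParts = (content: string): MessageContentPart[] | null => {
  try {
    const parsed = JSON.parse(content);
    if (Array.isArray(parsed) && parsed.length > 0 && parsed[0]?.type) {
      return parsed as MessageContentPart[];
    }
  } catch {
    // Not JSON: the caller falls back to treating content as plain text
  }
  return null;
};

// Mirrors convertMessagePartsToContentParts: internal parts become
// OpenAI-style parts, carrying Google thought signatures through.
const toContentParts = (parts: MessageContentPart[]): UserMessageContentPart[] =>
  parts.map((part) =>
    part.type === 'text'
      ? { googleThoughtSignature: part.thoughtSignature, text: part.text, type: 'text' as const }
      : {
          googleThoughtSignature: part.thoughtSignature,
          image_url: { detail: 'auto', url: part.image },
          type: 'image_url' as const,
        },
  );

// Hypothetical serialized assistant content, as stored when metadata.isMultimodal is true.
const stored = JSON.stringify([
  { type: 'text', text: 'Here is an image:' },
  { type: 'image', image: 'https://s3.example.com/image.png' },
]);

const parts = deserializeParts(stored);
if (parts) console.log(toContentParts(parts));
// [
//   { googleThoughtSignature: undefined, text: 'Here is an image:', type: 'text' },
//   { googleThoughtSignature: undefined,
//     image_url: { detail: 'auto', url: 'https://s3.example.com/image.png' },
//     type: 'image_url' },
// ]
```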

packages/context-engine/src/processors/__tests__/MessageContent.test.ts

Lines changed: 239 additions & 0 deletions
@@ -566,4 +566,243 @@ describe('MessageContentProcessor', () => {
       expect(content[2].video_url.url).toBe('http://example.com/video.mp4');
     });
   });
+
+  describe('Multimodal message content processing', () => {
+    it('should convert assistant message with metadata.isMultimodal to OpenAI format', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: JSON.stringify([
+            { type: 'text', text: 'Here is an image:' },
+            { type: 'image', image: 'https://s3.example.com/image.png' },
+            { type: 'text', text: 'What do you think?' },
+          ]),
+          metadata: {
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        content: [
+          { type: 'text', text: 'Here is an image:' },
+          {
+            type: 'image_url',
+            image_url: { detail: 'auto', url: 'https://s3.example.com/image.png' },
+          },
+          { type: 'text', text: 'What do you think?' },
+        ],
+      });
+    });
+
+    it('should convert assistant message with reasoning.isMultimodal to plain text', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: 'The answer is correct.',
+          reasoning: {
+            content: JSON.stringify([
+              { type: 'text', text: 'Let me analyze this image:' },
+              { type: 'image', image: 'https://s3.example.com/reasoning-image.png' },
+              { type: 'text', text: 'Based on the analysis...' },
+            ]),
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        reasoning: {
+          content:
+            'Let me analyze this image:\n[Image: https://s3.example.com/reasoning-image.png]\nBased on the analysis...',
+          isMultimodal: false,
+        },
+        content: 'The answer is correct.',
+      });
+    });
+
+    it('should handle both reasoning.isMultimodal and metadata.isMultimodal', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: JSON.stringify([
+            { type: 'text', text: 'Final result:' },
+            { type: 'image', image: 'https://s3.example.com/result.png' },
+          ]),
+          metadata: {
+            isMultimodal: true,
+          },
+          reasoning: {
+            content: JSON.stringify([
+              { type: 'text', text: 'Thinking about:' },
+              { type: 'image', image: 'https://s3.example.com/thinking.png' },
+            ]),
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        reasoning: {
+          content: 'Thinking about:\n[Image: https://s3.example.com/thinking.png]',
+          isMultimodal: false,
+        },
+        content: [
+          { type: 'text', text: 'Final result:' },
+          {
+            type: 'image_url',
+            image_url: { detail: 'auto', url: 'https://s3.example.com/result.png' },
+          },
+        ],
+      });
+    });
+
+    it('should prioritize reasoning.signature over reasoning.isMultimodal', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: 'The answer.',
+          reasoning: {
+            content: 'Some thinking process',
+            signature: 'sig123',
+            // Even if isMultimodal is true, signature takes priority
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        content: [
+          {
+            type: 'thinking',
+            thinking: 'Some thinking process',
+            signature: 'sig123',
+          },
+          { type: 'text', text: 'The answer.' },
+        ],
+      });
+    });
+
+    it('should handle plain text when isMultimodal is true but content is not valid JSON', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: 'This is plain text, not JSON',
+          metadata: {
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        content: 'This is plain text, not JSON',
+      });
+    });
+
+    it('should preserve thoughtSignature in multimodal content parts', async () => {
+      const processor = new MessageContentProcessor({
+        model: 'gpt-4',
+        provider: 'openai',
+        isCanUseVision: mockIsCanUseVision,
+        fileContext: { enabled: false },
+      });
+
+      const messages: UIChatMessage[] = [
+        {
+          id: 'test',
+          role: 'assistant',
+          content: JSON.stringify([
+            { type: 'text', text: 'Analysis result:', thoughtSignature: 'sig-001' },
+            { type: 'image', image: 'https://s3.example.com/chart.png', thoughtSignature: 'sig-002' },
+            { type: 'text', text: 'Conclusion' },
+          ]),
+          metadata: {
+            isMultimodal: true,
+          },
+          createdAt: Date.now(),
+          updatedAt: Date.now(),
+          meta: {},
+        },
+      ];
+
+      const result = await processor.process(createContext(messages));
+
+      expect(result.messages[0]).toMatchObject({
+        content: [
+          { type: 'text', text: 'Analysis result:', googleThoughtSignature: 'sig-001' },
+          {
+            type: 'image_url',
+            image_url: { detail: 'auto', url: 'https://s3.example.com/chart.png' },
+            googleThoughtSignature: 'sig-002',
+          },
+          { type: 'text', text: 'Conclusion' },
+        ],
+      });
+    });
+  });
 });
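The reasoning-side behavior these tests pin down is simpler than the content-side conversion: multimodal reasoning is flattened to plain text rather than converted to parts, with each image replaced by an `[Image: <url>]` marker. A minimal sketch of that flattening, using the same trimmed part shape as the earlier example:

```ts
type MessageContentPart =
  | { type: 'text'; text: string }
  | { type: 'image'; image: string };

// Mirrors the Priority 2 branch in processAssistantMessage: text parts are
// kept verbatim, image parts become "[Image: url]" markers, joined by newlines.
const flattenReasoning = (parts: MessageContentPart[]): string =>
  parts
    .map((part) => {
      if (part.type === 'text') return part.text;
      if (part.type === 'image') return `[Image: ${part.image}]`;
      return '';
    })
    .join('\n');

// Reproduces the expectation in the reasoning.isMultimodal test above.
console.log(
  flattenReasoning([
    { type: 'text', text: 'Let me analyze this image:' },
    { type: 'image', image: 'https://s3.example.com/reasoning-image.png' },
    { type: 'text', text: 'Based on the analysis...' },
  ]),
);
// Let me analyze this image:
// [Image: https://s3.example.com/reasoning-image.png]
// Based on the analysis...
```

Flattening rather than forwarding image parts presumably keeps replayed reasoning compatible with providers whose thinking channel only accepts text, though the commit itself does not state this rationale.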
