@@ -566,4 +566,243 @@ describe('MessageContentProcessor', () => {
566566 expect ( content [ 2 ] . video_url . url ) . toBe ( 'http://example.com/video.mp4' ) ;
567567 } ) ;
568568 } ) ;
569+
570+ describe ( 'Multimodal message content processing' , ( ) => {
// The assistant message stores its content as a JSON-encoded list of text/image parts
// and is flagged through `metadata.isMultimodal`; the processed message is expected to
// carry an array of `text` / `image_url` parts instead of the raw JSON string.
it('should convert assistant message with metadata.isMultimodal to OpenAI format', async () => {
  const contentProcessor = new MessageContentProcessor({
    model: 'gpt-4',
    provider: 'openai',
    isCanUseVision: mockIsCanUseVision,
    fileContext: { enabled: false },
  });

  // Serialized exactly as it is placed on the message's `content` field.
  const serializedParts = JSON.stringify([
    { type: 'text', text: 'Here is an image:' },
    { type: 'image', image: 'https://s3.example.com/image.png' },
    { type: 'text', text: 'What do you think?' },
  ]);

  const assistantMessage: UIChatMessage = {
    id: 'test',
    role: 'assistant',
    content: serializedParts,
    metadata: { isMultimodal: true },
    createdAt: Date.now(),
    updatedAt: Date.now(),
    meta: {},
  };

  const { messages: processed } = await contentProcessor.process(
    createContext([assistantMessage]),
  );

  // Each `image` part should come back as an `image_url` part with `detail: 'auto'`.
  const expectedContent = [
    { type: 'text', text: 'Here is an image:' },
    {
      type: 'image_url',
      image_url: { detail: 'auto', url: 'https://s3.example.com/image.png' },
    },
    { type: 'text', text: 'What do you think?' },
  ];

  expect(processed[0]).toMatchObject({ content: expectedContent });
});
610+
// Only the reasoning block is multimodal here (JSON-encoded parts plus
// `reasoning.isMultimodal: true`). The expectation is that it is flattened into a
// newline-joined string with `[Image: <url>]` placeholders, that the flag is reset to
// `false`, and that the plain-text `content` is passed through unchanged.
it('should convert assistant message with reasoning.isMultimodal to plain text', async () => {
  const contentProcessor = new MessageContentProcessor({
    model: 'gpt-4',
    provider: 'openai',
    isCanUseVision: mockIsCanUseVision,
    fileContext: { enabled: false },
  });

  const serializedReasoning = JSON.stringify([
    { type: 'text', text: 'Let me analyze this image:' },
    { type: 'image', image: 'https://s3.example.com/reasoning-image.png' },
    { type: 'text', text: 'Based on the analysis...' },
  ]);

  const assistantMessage: UIChatMessage = {
    id: 'test',
    role: 'assistant',
    content: 'The answer is correct.',
    reasoning: {
      content: serializedReasoning,
      isMultimodal: true,
    },
    createdAt: Date.now(),
    updatedAt: Date.now(),
    meta: {},
  };

  const { messages: processed } = await contentProcessor.process(
    createContext([assistantMessage]),
  );

  const expectedReasoningText =
    'Let me analyze this image:\n[Image: https://s3.example.com/reasoning-image.png]\nBased on the analysis...';

  expect(processed[0]).toMatchObject({
    reasoning: {
      content: expectedReasoningText,
      isMultimodal: false,
    },
    content: 'The answer is correct.',
  });
});
649+
// Both the message content and its reasoning are JSON-encoded multimodal parts.
// Expected outcome: reasoning becomes plain text with an `[Image: <url>]` placeholder
// (flag reset to `false`), while the content becomes `text` / `image_url` parts.
it('should handle both reasoning.isMultimodal and metadata.isMultimodal', async () => {
  const contentProcessor = new MessageContentProcessor({
    model: 'gpt-4',
    provider: 'openai',
    isCanUseVision: mockIsCanUseVision,
    fileContext: { enabled: false },
  });

  const serializedContent = JSON.stringify([
    { type: 'text', text: 'Final result:' },
    { type: 'image', image: 'https://s3.example.com/result.png' },
  ]);
  const serializedReasoning = JSON.stringify([
    { type: 'text', text: 'Thinking about:' },
    { type: 'image', image: 'https://s3.example.com/thinking.png' },
  ]);

  const assistantMessage: UIChatMessage = {
    id: 'test',
    role: 'assistant',
    content: serializedContent,
    metadata: { isMultimodal: true },
    reasoning: {
      content: serializedReasoning,
      isMultimodal: true,
    },
    createdAt: Date.now(),
    updatedAt: Date.now(),
    meta: {},
  };

  const { messages: processed } = await contentProcessor.process(
    createContext([assistantMessage]),
  );

  const expectedReasoning = {
    content: 'Thinking about:\n[Image: https://s3.example.com/thinking.png]',
    isMultimodal: false,
  };
  const expectedContent = [
    { type: 'text', text: 'Final result:' },
    {
      type: 'image_url',
      image_url: { detail: 'auto', url: 'https://s3.example.com/result.png' },
    },
  ];

  expect(processed[0]).toMatchObject({
    reasoning: expectedReasoning,
    content: expectedContent,
  });
});
698+
// When the reasoning block carries a `signature`, the expected output is a `thinking`
// part (with that signature) followed by the text part, regardless of the
// `isMultimodal` flag also being set on the reasoning.
it('should prioritize reasoning.signature over reasoning.isMultimodal', async () => {
  const contentProcessor = new MessageContentProcessor({
    model: 'gpt-4',
    provider: 'openai',
    isCanUseVision: mockIsCanUseVision,
    fileContext: { enabled: false },
  });

  const assistantMessage: UIChatMessage = {
    id: 'test',
    role: 'assistant',
    content: 'The answer.',
    reasoning: {
      content: 'Some thinking process',
      signature: 'sig123',
      // The flag is deliberately left on: the signature is expected to win.
      isMultimodal: true,
    },
    createdAt: Date.now(),
    updatedAt: Date.now(),
    meta: {},
  };

  const { messages: processed } = await contentProcessor.process(
    createContext([assistantMessage]),
  );

  const thinkingPart = {
    type: 'thinking',
    thinking: 'Some thinking process',
    signature: 'sig123',
  };
  const answerPart = { type: 'text', text: 'The answer.' };

  expect(processed[0]).toMatchObject({
    content: [thinkingPart, answerPart],
  });
});
737+
// `metadata.isMultimodal` is set, but the content is an ordinary sentence rather than
// JSON; the expectation is that the original string is returned untouched.
it('should handle plain text when isMultimodal is true but content is not valid JSON', async () => {
  const contentProcessor = new MessageContentProcessor({
    model: 'gpt-4',
    provider: 'openai',
    isCanUseVision: mockIsCanUseVision,
    fileContext: { enabled: false },
  });

  const assistantMessage: UIChatMessage = {
    id: 'test',
    role: 'assistant',
    content: 'This is plain text, not JSON',
    metadata: { isMultimodal: true },
    createdAt: Date.now(),
    updatedAt: Date.now(),
    meta: {},
  };

  const { messages: processed } = await contentProcessor.process(
    createContext([assistantMessage]),
  );

  expect(processed[0]).toMatchObject({ content: 'This is plain text, not JSON' });
});
766+
// Stored parts may carry a per-part `thoughtSignature`. The expectation is that it
// appears on the converted part under the `googleThoughtSignature` key, for both
// text and image parts, while parts without a signature are converted as usual.
it('should preserve thoughtSignature in multimodal content parts', async () => {
  const contentProcessor = new MessageContentProcessor({
    model: 'gpt-4',
    provider: 'openai',
    isCanUseVision: mockIsCanUseVision,
    fileContext: { enabled: false },
  });

  const serializedParts = JSON.stringify([
    { type: 'text', text: 'Analysis result:', thoughtSignature: 'sig-001' },
    { type: 'image', image: 'https://s3.example.com/chart.png', thoughtSignature: 'sig-002' },
    { type: 'text', text: 'Conclusion' },
  ]);

  const assistantMessage: UIChatMessage = {
    id: 'test',
    role: 'assistant',
    content: serializedParts,
    metadata: { isMultimodal: true },
    createdAt: Date.now(),
    updatedAt: Date.now(),
    meta: {},
  };

  const { messages: processed } = await contentProcessor.process(
    createContext([assistantMessage]),
  );

  const expectedContent = [
    { type: 'text', text: 'Analysis result:', googleThoughtSignature: 'sig-001' },
    {
      type: 'image_url',
      image_url: { detail: 'auto', url: 'https://s3.example.com/chart.png' },
      googleThoughtSignature: 'sig-002',
    },
    { type: 'text', text: 'Conclusion' },
  ];

  expect(processed[0]).toMatchObject({ content: expectedContent });
});
807+ } ) ;
569808} ) ;
0 commit comments