
Commit 0d003f8

[KVCache] Add implicit KVCache reuse, disable stateful option for chatCompletion (#359)

We introduced the field `stateful` in `chatCompletion()` in
#330 to make multi-round
chatting easier.

However, this is not ideal since we would prefer APIs that are
functional in behavior, giving us various benefits (e.g. better fault
tolerance for future use cases).

Therefore, in this PR:
- We disable `chatCompletionRequest.stateful`, and ask users to maintain
the chat history explicitly
- Instead, we introduce implicit KVCache reuse for multi-round chatting
- When we detect users are doing multi-round chatting, we will not reset
the KV cache, so only the new message will be prefilled
- To detect multi-round chatting, we instantiate a `Conversation` object for
each request and compare it with the current internal `Conversation`. If they
match, we can safely keep the internal state and prefill only the new input
(see the comparison sketch below).
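
To make the comparison concrete, here is a minimal sketch in TypeScript. It is hypothetical: only `compareConversationObject()` is named in this commit; the `ConversationState` shape and the exact equality rule are simplified assumptions, and the real `Conversation` class carries more state.

// Hypothetical, simplified shape; the real Conversation class in WebLLM
// carries more state (config, roles, separators, etc.).
interface ConversationState {
  systemMessage: string;
  history: Array<{ role: "user" | "assistant"; content: string }>;
  usesFunctionCalling: boolean;
}

// Sketch of the equality check over the three fields the commit names:
// system prompt, message history, and function-calling usage.
function compareConversationObject(a: ConversationState, b: ConversationState): boolean {
  return (
    a.systemMessage === b.systemMessage &&
    a.usesFunctionCalling === b.usesFunctionCalling &&
    a.history.length === b.history.length &&
    a.history.every((m, i) => m.role === b.history[i].role && m.content === b.history[i].content)
  );
}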

To see the behavior, check out `mainMultiroundChat()` in
`examples/openai-api/src/openai_api.ts`.

Implementation details:
- We instantiate the `Conversation` object in `ChatModule.prefill()`, since
this is where the various workflows (streaming, non-streaming, n > 1, etc.)
meet
- The object's state is determined by the system prompt, the message history,
and function-calling usage
- Inside `prefill()`, we then compare the two objects with
`compareConversationObject()`, and reset all internal state if they do not
match (see the flow sketched after this list)
- Another detail: instead of overriding
`conversation.config.system_message`, we add a field
`conversation.override_system_message`, keeping `conversation.config`
protected
- We further remove all methods in `ChatModule` that override
`this.getPipeline().conversation` by changing
`updateConversationWithChatCompletionMessages()` to
`getConversationFromChatCompletionRequest()`, keeping things more
functional internally
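
Putting these pieces together, the prefill-time flow might look roughly like the sketch below. Again hypothetical: `getConversationFromChatCompletionRequest()` and `override_system_message` are named in this commit, but their signatures, the `shouldReuseKVCache()` helper, and the rule of excluding the newest user message from the comparison are assumptions for illustration (building on the `ConversationState` sketch above).

// Hypothetical: build a comparable conversation state from a request's messages.
function getConversationFromChatCompletionRequest(
  messages: Array<{ role: string; content: string }>,
): ConversationState {
  const system = messages.find((m) => m.role === "system");
  return {
    // Kept separately (cf. `conversation.override_system_message`) instead of
    // mutating `conversation.config.system_message`.
    systemMessage: system?.content ?? "",
    history: messages
      .filter((m): m is { role: "user" | "assistant"; content: string } =>
        m.role === "user" || m.role === "assistant")
      .map((m) => ({ role: m.role, content: m.content })),
    usesFunctionCalling: false, // function-calling detection omitted for brevity
  };
}

// Hypothetical decision: reuse the KV cache only if the request, minus its
// trailing new user input, matches the current internal conversation.
function shouldReuseKVCache(internal: ConversationState, fromRequest: ConversationState): boolean {
  const priorHistory = fromRequest.history.slice(0, -1);
  return compareConversationObject(internal, { ...fromRequest, history: priorHistory });
}

If the check fails, all internal state (including the KV cache) is reset and the full prompt is prefilled; otherwise only the new user input is prefilled.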
CharlieFRuan committed Apr 3, 2024
1 parent 0ca0c58 commit 0d003f8
Showing 11 changed files with 436 additions and 216 deletions.
examples/openai-api/src/openai_api.ts (64 changes: 43 additions & 21 deletions)

@@ -94,45 +94,66 @@ async function mainStreaming() {
}

/**
- * We demonstrate stateful chat completion, where chat history is preserved across requests.
+ * We demonstrate multi-round chatting. Though users are required to maintain the chat history,
+ * internally we compare the provided `messages` with the internal chat history. If they match, we
+ * reuse the KVs and hence save computation -- essentially an implicit internal optimization.
 */
-async function mainStateful() {
+async function mainMultiroundChat() {
  const chat: webllm.ChatInterface = new webllm.ChatModule();

  chat.setInitProgressCallback((report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  });

  await chat.reload("Llama-2-7b-chat-hf-q4f32_1");

+  // Round 0
+  const messages: webllm.ChatCompletionMessageParam[] = [
+    {
+      "role": "system",
+      "content": "[INST] <<SYS>>\n\nYou are a helpful, respectful and honest assistant. " +
+        "Be as happy as you can when speaking please.\n<</SYS>>\n\n "
+    },
+    { "role": "user", "content": "Provide me three US states." },
+  ];
+
  const request0: webllm.ChatCompletionRequest = {
-    stateful: true,
-    // stream: true, // works with and without streaming
-    messages: [
-      {
-        "role": "system",
-        "content": "[INST] <<SYS>>\n\nYou are a helpful, respectful and honest assistant. " +
-          "Be as happy as you can when speaking please.\n<</SYS>>\n\n "
-      },
-      { "role": "user", "content": "Provide me three US states." },
-    ],
+    stream: false, // can be streaming, same behavior
+    messages: messages,
  };

  const reply0 = await chat.chatCompletion(request0);
+  const replyMessage0 = await chat.getMessage();
  console.log(reply0);
-  console.log(await chat.getMessage());
+  console.log(replyMessage0);

+  // Round 1
+  // Append the generated response to messages
+  messages.push({ "role": "assistant", "content": replyMessage0 });
+  // Append the new user input
+  messages.push({ "role": "user", "content": "Two more please!" });
+  // The line below would cause an internal reset (clearing the KV cache, etc.) since the history
+  // would no longer match the new request
+  // messages[0].content = "Another system prompt";
+
  const request1: webllm.ChatCompletionRequest = {
-    stateful: true,
-    // stream: true, // works with and without streaming
-    messages: [
-      { "role": "user", "content": "Two more please!" },
-    ],
+    stream: false, // can be streaming, same behavior
+    messages: messages
  };

  const reply1 = await chat.chatCompletion(request1);
+  const replyMessage1 = await chat.getMessage();
  console.log(reply1);
-  console.log(await chat.getMessage());
+  console.log(replyMessage1);

+  // With multi-round chat, request1 should only prefill a small number of tokens
+  const prefillTokens0 = reply0.usage?.prompt_tokens;
+  const prefillTokens1 = reply1.usage?.prompt_tokens;
+  console.log("Request 0 prompt tokens: ", prefillTokens0);
+  console.log("Request 1 prompt tokens: ", prefillTokens1);
+  if (prefillTokens0 === undefined || prefillTokens1 === undefined ||
+    prefillTokens1 > prefillTokens0) {
+    throw Error("Multi-round chat is not triggered as expected.");
+  }
+
  console.log(await chat.runtimeStatsText());
}
@@ -195,4 +216,5 @@ async function mainFunctionCalling() {
// Run one of the functions
// mainNonStreaming();
// mainStreaming();
-mainFunctionCalling();
+// mainFunctionCalling();
+mainMultiroundChat();
examples/web-worker/src/main.ts (2 changes: 0 additions & 2 deletions)

@@ -62,7 +62,6 @@ async function mainOpenAIAPINonStreaming() {
  await chat.reload("Llama-2-7b-chat-hf-q4f32_1");

  const request: webllm.ChatCompletionRequest = {
-    // stateful: true, // set this optionally to preserve chat history
    messages: [
      {
        "role": "system",

@@ -102,7 +101,6 @@ async function mainOpenAIAPIStreaming() {
  await chat.reload("Llama-2-7b-chat-hf-q4f32_1");

  const request: webllm.ChatCompletionRequest = {
-    // stateful: true, // set this optionally to preserve chat history
    stream: true,
    messages: [
      {
