Merged
35 commits
f79a54c
return usage
jstzwj May 4, 2023
d9ccada
return token usage
jstzwj May 4, 2023
74b2398
Merge branch 'main' of https://github.com/lm-sys/FastChat
jstzwj May 4, 2023
d86a25f
update api
jstzwj May 4, 2023
8bfed73
Merge branch 'main' of https://github.com/lm-sys/FastChat
jstzwj May 6, 2023
eaac2d0
stream api support
jstzwj May 6, 2023
9590e98
stop field list support
jstzwj May 6, 2023
57b4bc5
stream delta message bug fix
jstzwj May 6, 2023
bfc866e
return finish_reason in the last streaming event
jstzwj May 6, 2023
30fc678
chatglm last stream event fix
jstzwj May 6, 2023
bf3d935
fix api_generate to get model semaphore
jstzwj May 6, 2023
f2332c3
media_type
jstzwj May 7, 2023
8418f59
Fix a bug for stream api
FrostMiKu May 8, 2023
aec7227
Merge pull request #1 from vtuber-plan/main
jstzwj May 8, 2023
ab26c32
Merge branch 'main' of https://github.com/jstzwj/FastChat
jstzwj May 8, 2023
aacb47d
Merge branch 'main' of lm-sys/FastChat
jstzwj May 8, 2023
92fdc3f
Merge branch 'main' top-p top-k of lm-sys/FastChat
jstzwj May 8, 2023
f969fa4
Fix a bug caused by an incorrect variable name
FrostMiKu May 8, 2023
d526db0
Merge pull request #2 from vtuber-plan/main
jstzwj May 8, 2023
a4e59da
Merge 'main' stream support of lm-sys/FastChat
jstzwj May 8, 2023
a74e12f
ignore controller log "controller.log.2023-05-04"
jstzwj May 8, 2023
ec1c370
OpenAI-Compatible Client stream bug fix
jstzwj May 8, 2023
0af15a6
Merge branch 'main' of lm-sys/FastChat
jstzwj May 8, 2023
1a9bcd3
validate input values
jstzwj May 8, 2023
8a17dec
remove model work generate_gate method
jstzwj May 8, 2023
0156930
reuse stream gate generation api in worker
jstzwj May 8, 2023
8064604
fix embeddings api
jstzwj May 8, 2023
cbf8f2b
validation error handle
jstzwj May 8, 2023
e206b04
update
jstzwj May 8, 2023
d0205ec
move ErrorCode to constants
jstzwj May 8, 2023
fff638d
gather all n requests
jstzwj May 8, 2023
0553bf2
Merge branch 'main' of lm-sys/FastChat
jstzwj May 9, 2023
809b2e4
better show_available_models
jstzwj May 9, 2023
baa6a57
Merge branch 'main' of lm-sys/FastChat
jstzwj May 9, 2023
be55a16
update related doc and tests
jstzwj May 9, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@ dist

# Log
*.log
*.log.*
*.json

# Editor
57 changes: 47 additions & 10 deletions docs/openai_api.md
@@ -25,14 +25,45 @@ Finally, launch the RESTful API server
python3 -m fastchat.serve.openai_api_server --host localhost --port 8000
```

Test the API server
Now, let us test the API server...

### List Models
### OpenAI Official SDK
The final goal of `openai_api_server.py` is to implement a fully OpenAI-compatible API server, so that the models can be used directly with the [openai-python](https://github.com/openai/openai-python) library.

First, install openai-python:
```bash
pip install --upgrade openai
```

Then, interact with the Vicuna model:
```python
import openai
openai.api_key = "EMPTY"  # Not supported yet
openai.api_base = "http://localhost:8000/v1"

# create a completion
completion = openai.Completion.create(model="vicuna-7b-v1.1", prompt="Hello world", max_tokens=64)
# print the completion
print(completion.choices[0].text)

# create a chat completion
completion = openai.ChatCompletion.create(
model="vicuna-7b-v1.1",
messages=[{"role": "user", "content": "Hello world!"}]
)
# print the completion
print(completion.choices[0].message.content)
```

### cURL
cURL is another good tool for observing the output of the API.

List Models:
```bash
curl http://localhost:8000/v1/models
```

### Chat Completions
Chat Completions:
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
@@ -42,7 +73,7 @@ curl http://localhost:8000/v1/chat/completions \
}'
```

### Text Completions
Text Completions:
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
@@ -54,17 +85,18 @@ curl http://localhost:8000/v1/completions \
}'
```

### Embeddings
Embeddings:
```bash
curl http://localhost:8000/v1/create_embeddings \
curl http://localhost:8000/v1/embeddings \
-H "Content-Type: application/json" \
-d '{
"model": "vicuna-7b-v1.1",
"input": "Hello, can you tell me a joke"
}'
```

## Client SDK
### FastChat Client SDK
FastChat also includes its own client SDK for the API.

Assuming environment variable `FASTCHAT_BASEURL` is set to the API server URL (e.g., `http://localhost:8000`), you can use the following code to send a request to the API server:
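That environment-variable lookup can be sketched as follows; the helper name `get_base_url` is illustrative, not part of the actual SDK:

```python
import os

def get_base_url(default: str = "http://localhost:8000") -> str:
    # Fall back to a local API server when FASTCHAT_BASEURL is unset.
    return os.environ.get("FASTCHAT_BASEURL", default)

os.environ["FASTCHAT_BASEURL"] = "http://localhost:8000"
print(get_base_url())  # → http://localhost:8000
```

Requests from the client SDK would then be issued against this base URL.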

@@ -105,7 +137,12 @@ The script will train classifiers based on `vicuna-7b`, `text-similarity-ada-001
## Todos
Some features to be implemented:

- [ ] Support more parameters like `top_p`, `presence_penalty`
- [ ] Report token usage for chat completion
- [ ] Proper error handling (e.g., model not found)
- [ ] Support more parameters like `logprobs`, `logit_bias`, `user`, `presence_penalty` and `frequency_penalty`
- [ ] The return value in the client SDK could be used like a dict
- [ ] Model details (permissions, owner and create time)
- [ ] Edits API
- [ ] Authentication and API key
- [ ] Rate Limitation Settings
- [x] Parameter `top_p` support
- [x] Report token usage for chat completion
- [x] Proper error handling (e.g., model not found)
24 changes: 21 additions & 3 deletions fastchat/client/openai_api_client.py
@@ -48,10 +51,13 @@ async def request_completion_stream(
) -> AsyncGenerator:
"""
Create chat completion as a stream
Parse the Event stream format:
https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format
:param request: The request data
:param timeout: The timeout of the request
:returns: Completion stream
"""
VALID_EVENT_STREAM_FIELD = ["id", "data", "event", "retry"]
async with httpx.AsyncClient() as client:
async with client.stream(
"POST",
@@ -62,10 +65,25 @@
async for chunk in response.aiter_text():
if not chunk:
continue
for line in chunk.split("\n"):
if not line:
for message in chunk.split("\n\n"):
if not message:
continue
yield ChatCompletionStreamResponse.parse_obj(json.loads(line))
lines = message.split("\n")

message_dict = {}
for line in lines:
colon_index = line.find(":")
if colon_index == 0 or colon_index == -1:
continue
message_key = line[:colon_index].strip()
message_value = line[colon_index + 1:]
if message_key in VALID_EVENT_STREAM_FIELD:
message_dict[message_key] = message_value

data_field = message_dict["data"]
if data_field.strip() == "[DONE]":
break
yield ChatCompletionStreamResponse.parse_obj(json.loads(data_field))


class ChatCompletion:
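The field-splitting logic introduced above can be exercised on a sample chunk in isolation. This is a simplified standalone sketch of the same parsing, outside the async client:

```python
import json

VALID_EVENT_STREAM_FIELD = ["id", "data", "event", "retry"]

def parse_sse_chunk(chunk: str):
    """Parse an event-stream chunk into data payloads, mirroring the client logic."""
    payloads = []
    for message in chunk.split("\n\n"):
        if not message:
            continue
        message_dict = {}
        for line in message.split("\n"):
            colon_index = line.find(":")
            # Skip comment lines (leading colon) and lines without a field name.
            if colon_index == 0 or colon_index == -1:
                continue
            key = line[:colon_index].strip()
            value = line[colon_index + 1:]
            if key in VALID_EVENT_STREAM_FIELD:
                message_dict[key] = value
        data = message_dict.get("data", "")
        if data.strip() == "[DONE]":
            break
        if data:
            payloads.append(json.loads(data))
    return payloads

chunk = 'data: {"id": "1", "choices": []}\n\ndata: [DONE]\n\n'
print(parse_sse_chunk(chunk))  # → [{'id': '1', 'choices': []}]
```

Messages are separated by blank lines (`\n\n`), each field lives on its own `key: value` line, and a literal `[DONE]` data payload terminates the stream.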
24 changes: 24 additions & 0 deletions fastchat/constants.py
@@ -1,5 +1,29 @@
from enum import IntEnum


CONTROLLER_HEART_BEAT_EXPIRATION = 90
WORKER_HEART_BEAT_INTERVAL = 30
WORKER_API_TIMEOUT = 20

LOGDIR = "."

class ErrorCode(IntEnum):
'''
https://platform.openai.com/docs/guides/error-codes/api-errors
'''
VALIDATION_TYPE_ERROR = 40001

INVALID_AUTH_KEY = 40101
INCORRECT_AUTH_KEY = 40102
NO_PERMISSION = 40103

INVALID_MODEL = 40301
PARAM_OUT_OF_RANGE = 40302
CONTEXT_OVERFLOW = 40303

RATE_LIMIT = 42901
QUOTA_EXCEEDED = 42902
ENGINE_OVERLOADED = 42903

INTERNAL_ERROR = 50001
CUDA_OUT_OF_MEMORY = 50002
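A convenient property of this numbering (inferred from the pattern, not stated in the PR) is that the first three digits of each code are the matching HTTP status:

```python
from enum import IntEnum

class ErrorCode(IntEnum):
    # A subset of the values from fastchat/constants.py
    VALIDATION_TYPE_ERROR = 40001
    INVALID_AUTH_KEY = 40101
    INVALID_MODEL = 40301
    RATE_LIMIT = 42901
    INTERNAL_ERROR = 50001
    CUDA_OUT_OF_MEMORY = 50002

def http_status(code: ErrorCode) -> int:
    # 40001 -> 400, 42901 -> 429, 50002 -> 500, etc.
    return code // 100

print(http_status(ErrorCode.CUDA_OUT_OF_MEMORY))  # → 500
```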
40 changes: 38 additions & 2 deletions fastchat/model/chatglm_model.py
@@ -1,6 +1,19 @@
import torch
from typing import List, Tuple

@torch.no_grad()
def stream_chat_token_num(tokenizer, query: str, history: List[Tuple[str, str]] = None):
if history is None:
history = []
if not history:
prompt = query
else:
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
inputs = tokenizer([prompt], return_tensors="pt")
return torch.numel(inputs['input_ids'])
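The prompt that function builds follows ChatGLM's round-based template; its shape can be checked without a tokenizer. This sketch isolates only the prompt construction:

```python
from typing import List, Tuple

def build_chatglm_prompt(query: str, history: List[Tuple[str, str]] = None) -> str:
    # Mirrors the prompt construction inside stream_chat_token_num.
    if not history:
        return query
    prompt = ""
    for i, (old_query, response) in enumerate(history):
        prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
    prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
    return prompt

p = build_chatglm_prompt("How far?", [("Hello", "Hi there")])
print(p)
# [Round 0]
# 问:Hello
# 答:Hi there
# [Round 1]
# 问:How far?
# 答:
```

(问 is "question" and 答 is "answer"; ChatGLM's chat template uses these Chinese labels.)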

@torch.inference_mode()
def chatglm_generate_stream(
@@ -29,10 +42,33 @@ def chatglm_generate_stream(
hist.append((messages[i][1], messages[i + 1][1]))
query = messages[-2][1]

for response, new_hist in model.stream_chat(tokenizer, query, hist, **gen_kwargs):
input_echo_len = stream_chat_token_num(tokenizer, query, hist)

for i, (response, new_hist) in enumerate(model.stream_chat(tokenizer, query, hist, **gen_kwargs)):
if echo:
output = query + " " + response
else:
output = response

yield output
yield {
"text": output,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": i,
"total_tokens": input_echo_len + i,
},
"finish_reason": None
}

# TODO: make ChatGLM stop when it reaches max length
# Only the last stream result contains finish_reason; set it to "stop"
ret = {
"text": output,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": i,
"total_tokens": input_echo_len + i,
},
"finish_reason": "stop"
}
yield ret
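A consumer of the new generator contract can be sketched with a stub generator; `fake_stream` below stands in for `chatglm_generate_stream`, and only the dict shape matters:

```python
def fake_stream():
    # Stub mimicking the contract above: intermediate events carry
    # finish_reason=None, the final event carries "stop".
    for i, text in enumerate(["Hel", "Hello", "Hello!"]):
        yield {"text": text,
               "usage": {"prompt_tokens": 5, "completion_tokens": i,
                         "total_tokens": 5 + i},
               "finish_reason": None}
    yield {"text": "Hello!",
           "usage": {"prompt_tokens": 5, "completion_tokens": 2,
                     "total_tokens": 7},
           "finish_reason": "stop"}

final = None
for event in fake_stream():
    final = event  # keep the last event; it holds final usage and finish_reason
print(final["finish_reason"], final["usage"]["total_tokens"])  # → stop 7
```

Each event's `text` is the full output so far (not a delta), so a consumer only needs the last event for the final text, token usage, and finish reason.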