feat: add encodeChat
fixes #10
niieani committed Jun 1, 2023
1 parent 84887b4 commit ff30f11
Showing 19 changed files with 1,372 additions and 82 deletions.
47 changes: 45 additions & 2 deletions README.md
@@ -56,6 +56,8 @@ Refer to [supported models and their encodings](#Supported-models-and-their-enco

## Playground

The playground is published under a memorable URL: https://gpt-tokenizer.dev/

You can play with the package in the browser using the [Playground](https://codesandbox.io/s/gpt-tokenizer-tjcjoz?fontsize=14&hidenavigation=1&theme=dark).

[![GPT Tokenizer Playground](./docs/gpt-tokenizer.png)](https://codesandbox.io/s/gpt-tokenizer-tjcjoz?fontsize=14&hidenavigation=1&theme=dark)
@@ -67,6 +69,7 @@ The playground mimics the official [OpenAI Tokenizer](https://platform.openai.co
```typescript
import {
  encode,
  encodeChat,
  decode,
  isWithinTokenLimit,
  encodeGenerator,
@@ -87,6 +90,18 @@ const decodedText = decode(tokens)
// returns false if the limit is exceeded, otherwise returns the actual number of tokens (truthy value)
const withinTokenLimit = isWithinTokenLimit(text, tokenLimit)

// Example chat:
const chat = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'assistant', content: 'gpt-tokenizer is awesome.' },
]

// Encode chat into tokens
const chatTokens = encodeChat(chat)

// Check if chat is within the token limit
const chatWithinTokenLimit = isWithinTokenLimit(chat, tokenLimit)

// Encode text using generator
for (const tokenChunk of encodeGenerator(text)) {
  console.log(tokenChunk)
@@ -130,10 +145,13 @@ import {

chat:

- `gpt-4` (`cl100k_base`)
- `gpt-4-32k` (`cl100k_base`)
- `gpt-4-0314` (`cl100k_base`)
- `gpt-4-32k-0314` (`cl100k_base`)
- `gpt-3.5-turbo` (`cl100k_base`)
- `gpt-3.5-turbo-0301` (`cl100k_base`)

text:
text-only:

- `text-davinci-003` (`p50k_base`)
- `text-davinci-002` (`p50k_base`)
@@ -219,6 +237,24 @@ const tokenLimit = 10
const withinTokenLimit = isWithinTokenLimit(text, tokenLimit)
```

### `encodeChat(chat: ChatMessage[], model?: ModelName): number[]`

Encodes the given chat into a sequence of tokens.

If you did not import the model version directly, or if `model` was not provided during initialization, it must be provided here so that the chat is tokenized correctly for the given model. Use this method when you need to transform a chat into the token format that GPT models can process.

Example:

```typescript
import { encodeChat } from 'gpt-tokenizer'

const chat = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'assistant', content: 'gpt-tokenizer is awesome.' },
]
const tokens = encodeChat(chat)
```
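
The returned array includes the special tokens that frame each chat message, so its `length` is the token count of the chat as the model will see it.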

### `encodeGenerator(text: string): Generator<number[], void, undefined>`

Encodes the given text using a generator, yielding chunks of tokens.
@@ -236,6 +272,10 @@ for (const tokenChunk of encodeGenerator(text)) {
}
```

### `encodeChatGenerator(chat: Iterator<ChatMessage>, model?: ModelName): Generator<number[], void, undefined>`

Same as `encodeChat`, but returns a generator of token chunks instead of an array, and accepts any iterator as the input `chat`.
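
Example (a minimal sketch; it assumes the chat array's built-in iterator satisfies the `Iterator<ChatMessage>` parameter):

```typescript
import { encodeChatGenerator } from 'gpt-tokenizer'

const chat = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'assistant', content: 'gpt-tokenizer is awesome.' },
]

const tokens: number[] = []
// each yielded chunk is an array of tokens for part of the chat
for (const tokenChunk of encodeChatGenerator(chat.values())) {
  tokens.push(...tokenChunk)
}
```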

### `decodeGenerator(tokens: Iterable<number>): Generator<string, void, undefined>`

Decodes a sequence of tokens using a generator, yielding chunks of decoded text.
@@ -287,6 +327,9 @@ import {
  FimMiddle,
  FimPrefix,
  FimSuffix,
  ImStart,
  ImEnd,
  ImSep,
  encode,
} from 'gpt-tokenizer'

2 changes: 2 additions & 0 deletions package.json
@@ -72,6 +72,8 @@
"dist"
],
"scripts": {
"codegen:models": "yarn tsx src/codegen/generateByModel.ts",
"codegen:encodings": "yarn tsx src/codegen/generateJsEncodings.ts",
"build": "yarn build:cjs && yarn build:esm && yarn build:umd",
"build:cjs": "yarn rrun tsc --outDir cjs --module commonjs --target es2022 --project tsconfig-cjs.json",
"build:esm": "yarn rrun tsc --outDir esm --module esnext --target es2022 && echo '{\"name\": \"gpt-tokenizer\", \"type\": \"module\"}' > ./esm/package.json",
79 changes: 78 additions & 1 deletion src/GptEncoding.test.ts
@@ -1,7 +1,12 @@
import fs from 'fs'
import path from 'path'
import { GptEncoding } from './GptEncoding.js'
import { type EncodingName, encodingNames } from './mapping.js'
import {
  type ChatModelName,
  type EncodingName,
  chatModelParams,
  encodingNames,
} from './mapping.js'
import { resolveEncoding } from './resolveEncoding.js'

const sharedResults = {
@@ -155,6 +160,78 @@ describe.each(encodingNames)('%s', (encodingName: EncodingName) => {
  })
})

const chatModelNames = Object.keys(chatModelParams) as readonly ChatModelName[]

const exampleMessages = [
  {
    role: 'system',
    content:
      'You are a helpful, pattern-following assistant that translates corporate jargon into plain English.',
  },
  {
    role: 'system',
    name: 'example_user',
    content: 'New synergies will help drive top-line growth.',
  },
  {
    role: 'system',
    name: 'example_assistant',
    content: 'Things working well together will increase revenue.',
  },
  {
    role: 'system',
    name: 'example_user',
    content:
      "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage.",
  },
  {
    role: 'system',
    name: 'example_assistant',
    content: "Let's talk later when we're less busy about how to do better.",
  },
  {
    role: 'user',
    content:
      "This late pivot means we don't have time to boil the ocean for the client deliverable.",
  },
] as const

describe.each(chatModelNames)('%s', (modelName) => {
  const encoding = GptEncoding.getEncodingApiForModel(
    modelName,
    resolveEncoding,
  )
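  // Chat formatting adds per-message overhead tokens, and the overhead
  // differs between the gpt-3.5-turbo and gpt-4 model families, which is
  // why the same example messages encode to different lengths.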
  const expectedEncodedLength = modelName.startsWith('gpt-3.5-turbo')
    ? 127
    : 121

  describe('chat functionality', () => {
    it('encodes a chat correctly', () => {
      const encoded = encoding.encodeChat(exampleMessages)
      expect(encoded).toMatchSnapshot()
      expect(encoded).toHaveLength(expectedEncodedLength)

      const decoded = encoding.decode(encoded)
      expect(decoded).toMatchSnapshot()
    })

    it('isWithinTokenLimit: false', () => {
      const isWithinTokenLimit = encoding.isWithinTokenLimit(
        exampleMessages,
        50,
      )
      expect(isWithinTokenLimit).toBe(false)
    })

    it('isWithinTokenLimit: true (number)', () => {
      const isWithinTokenLimit = encoding.isWithinTokenLimit(
        exampleMessages,
        150,
      )
      expect(isWithinTokenLimit).toBe(expectedEncodedLength)
    })
  })
})

function loadTestPlans() {
  const testPlanPath = path.join(__dirname, '../data/TestPlans.txt')
  const testPlanData = fs.readFileSync(testPlanPath, 'utf8')
