feat: add encodeChat
fixes #10
niieani committed Jun 1, 2023
1 parent 84887b4 commit ff30f11
Showing 19 changed files with 1,372 additions and 82 deletions.
47 changes: 45 additions & 2 deletions README.md
@@ -56,6 +56,8 @@ Refer to [supported models and their encodings](#Supported-models-and-their-enco

## Playground

The playground is published under a memorable URL: https://gpt-tokenizer.dev/

You can play with the package in the browser using the [Playground](https://codesandbox.io/s/gpt-tokenizer-tjcjoz?fontsize=14&hidenavigation=1&theme=dark).

[![GPT Tokenizer Playground](./docs/gpt-tokenizer.png)](https://codesandbox.io/s/gpt-tokenizer-tjcjoz?fontsize=14&hidenavigation=1&theme=dark)
@@ -67,6 +69,7 @@ The playground mimics the official [OpenAI Tokenizer](https://platform.openai.co
```typescript
import {
  encode,
  encodeChat,
  decode,
  isWithinTokenLimit,
  encodeGenerator,
@@ -87,6 +90,18 @@ const decodedText = decode(tokens)
// returns false if the limit is exceeded, otherwise returns the actual number of tokens (truthy value)
const withinTokenLimit = isWithinTokenLimit(text, tokenLimit)

// Example chat:
const chat = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'assistant', content: 'gpt-tokenizer is awesome.' },
]

// Encode chat into tokens
const chatTokens = encodeChat(chat)

// Check if chat is within the token limit
const chatWithinTokenLimit = isWithinTokenLimit(chat, tokenLimit)

// Encode text using generator
for (const tokenChunk of encodeGenerator(text)) {
  console.log(tokenChunk)
@@ -130,10 +145,13 @@ import {

chat:

- `gpt-4` (`cl100k_base`)
- `gpt-4-32k` (`cl100k_base`)
- `gpt-4-0314` (`cl100k_base`)
- `gpt-4-32k-0314` (`cl100k_base`)
- `gpt-3.5-turbo` (`cl100k_base`)
- `gpt-3.5-turbo-0301` (`cl100k_base`)

text:
text-only:

- `text-davinci-003` (`p50k_base`)
- `text-davinci-002` (`p50k_base`)
@@ -219,6 +237,24 @@ const tokenLimit = 10
const withinTokenLimit = isWithinTokenLimit(text, tokenLimit)
```

### `encodeChat(chat: ChatMessage[], model?: ModelName): number[]`

Encodes the given chat into a sequence of tokens.

If you did not import the model version directly, or if `model` was not provided during initialization, it must be provided here so that the chat is tokenized correctly for the given model. Use this method when you need to transform a chat into the token format that GPT models can process.

Example:

```typescript
import { encodeChat } from 'gpt-tokenizer'

const chat = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'assistant', content: 'gpt-tokenizer is awesome.' },
]
const tokens = encodeChat(chat)
```
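
The returned array includes the special tokens that frame each chat message, so its `length` is the token count of the chat as the model will see it.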

### `encodeGenerator(text: string): Generator<number[], void, undefined>`

Encodes the given text using a generator, yielding chunks of tokens.
@@ -236,6 +272,10 @@ for (const tokenChunk of encodeGenerator(text)) {
}
```

### `encodeChatGenerator(chat: Iterator<ChatMessage>, model?: ModelName): Generator<number[], void, undefined>`

Same as `encodeChat`, but returns a generator of token chunks instead of an array, and accepts any iterator as the input `chat`.
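
Example (a minimal sketch; it assumes the chat array's built-in iterator satisfies the `Iterator<ChatMessage>` parameter):

```typescript
import { encodeChatGenerator } from 'gpt-tokenizer'

const chat = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'assistant', content: 'gpt-tokenizer is awesome.' },
]

const tokens: number[] = []
// each yielded chunk is an array of tokens for part of the chat
for (const tokenChunk of encodeChatGenerator(chat.values())) {
  tokens.push(...tokenChunk)
}
```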

### `decodeGenerator(tokens: Iterable<number>): Generator<string, void, undefined>`

Decodes a sequence of tokens using a generator, yielding chunks of decoded text.
@@ -287,6 +327,9 @@ import {
  FimMiddle,
  FimPrefix,
  FimSuffix,
  ImStart,
  ImEnd,
  ImSep,
  encode,
} from 'gpt-tokenizer'

2 changes: 2 additions & 0 deletions package.json
@@ -72,6 +72,8 @@
"dist"
],
"scripts": {
"codegen:models": "yarn tsx src/codegen/generateByModel.ts",
"codegen:encodings": "yarn tsx src/codegen/generateJsEncodings.ts",
"build": "yarn build:cjs && yarn build:esm && yarn build:umd",
"build:cjs": "yarn rrun tsc --outDir cjs --module commonjs --target es2022 --project tsconfig-cjs.json",
"build:esm": "yarn rrun tsc --outDir esm --module esnext --target es2022 && echo '{\"name\": \"gpt-tokenizer\", \"type\": \"module\"}' > ./esm/package.json",
79 changes: 78 additions & 1 deletion src/GptEncoding.test.ts
@@ -1,7 +1,12 @@
import fs from 'fs'
import path from 'path'
import { GptEncoding } from './GptEncoding.js'
import { type EncodingName, encodingNames } from './mapping.js'
import {
  type ChatModelName,
  type EncodingName,
  chatModelParams,
  encodingNames,
} from './mapping.js'
import { resolveEncoding } from './resolveEncoding.js'

const sharedResults = {
@@ -155,6 +160,78 @@ describe.each(encodingNames)('%s', (encodingName: EncodingName) => {
  })
})

const chatModelNames = Object.keys(chatModelParams) as readonly ChatModelName[]

const exampleMessages = [
  {
    role: 'system',
    content:
      'You are a helpful, pattern-following assistant that translates corporate jargon into plain English.',
  },
  {
    role: 'system',
    name: 'example_user',
    content: 'New synergies will help drive top-line growth.',
  },
  {
    role: 'system',
    name: 'example_assistant',
    content: 'Things working well together will increase revenue.',
  },
  {
    role: 'system',
    name: 'example_user',
    content:
      "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage.",
  },
  {
    role: 'system',
    name: 'example_assistant',
    content: "Let's talk later when we're less busy about how to do better.",
  },
  {
    role: 'user',
    content:
      "This late pivot means we don't have time to boil the ocean for the client deliverable.",
  },
] as const

describe.each(chatModelNames)('%s', (modelName) => {
  const encoding = GptEncoding.getEncodingApiForModel(
    modelName,
    resolveEncoding,
  )
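  // Chat formatting adds per-message overhead tokens, and the overhead
  // differs between the gpt-3.5-turbo and gpt-4 model families, which is
  // why the same example messages encode to different lengths.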
  const expectedEncodedLength = modelName.startsWith('gpt-3.5-turbo')
    ? 127
    : 121

  describe('chat functionality', () => {
    it('encodes a chat correctly', () => {
      const encoded = encoding.encodeChat(exampleMessages)
      expect(encoded).toMatchSnapshot()
      expect(encoded).toHaveLength(expectedEncodedLength)

      const decoded = encoding.decode(encoded)
      expect(decoded).toMatchSnapshot()
    })

    it('isWithinTokenLimit: false', () => {
      const isWithinTokenLimit = encoding.isWithinTokenLimit(
        exampleMessages,
        50,
      )
      expect(isWithinTokenLimit).toBe(false)
    })

    it('isWithinTokenLimit: true (number)', () => {
      const isWithinTokenLimit = encoding.isWithinTokenLimit(
        exampleMessages,
        150,
      )
      expect(isWithinTokenLimit).toBe(expectedEncodedLength)
    })
  })
})

function loadTestPlans() {
  const testPlanPath = path.join(__dirname, '../data/TestPlans.txt')
  const testPlanData = fs.readFileSync(testPlanPath, 'utf8')
