-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.go
158 lines (143 loc) · 5.72 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
// Copyright 2023 Peter Hebert. Licensed under the MIT license.

// Package gotoken provides an OpenAI-compatible tokenization library similar to
// [tiktoken]. Its primary export is the Tokenizer interface, featuring Encode
// and Decode methods for converting strings to/from []int.
//
// Tokenizer encodings, such as r50kbase or cl100kbase, are available in
// separate packages that implement the Tokenizer interface. This design mirrors
// the image/png and image/jpeg packages' integration with the standard image
// library. Encoding packages self-register with gotoken.
//
// Encoding packages include built-in token dictionaries, which removes the need
// for external downloads or local file caches. However, these packages are
// relatively large (a few MB) and should only be imported when needed. At least
// one encoding package must be imported for gotoken to be able to tokenize
// text.
//
// Example of importing gotoken and a tokenizer encoding:
//
//	import (
//		"github.com/peterheb/gotoken"
//		_ "github.com/peterheb/gotoken/cl100kbase"
//	)
//
// The _ indicates that cl100kbase should be imported even without a direct
// reference in your code. Encoding packages have no public functions or types,
// but they do contain public constants defining special tokens.
//
// [tiktoken]: https://github.com/openai/tiktoken
package gotoken
import (
"errors"
"fmt"
"sort"
"sync"
)
// Tokenizer is the primary public interface provided by gotoken. It is
// implemented by encoding packages, like
// [github.com/peterheb/gotoken/r50kbase]. A Tokenizer is created using
// [GetTokenizer].
//
// Tokenizer supports four methods:
//
//   - Count returns the number of tokens in an input string, or 0 on error.
//   - Encode tokenizes an input string to an []int.
//   - Decode un-tokenizes an []int back to its string representation.
//   - Allowed returns an error if the input string contains any sequences
//     corresponding to special tokens that are not allowed by this tokenizer.
type Tokenizer interface {
// Count reports how many tokens input encodes to. It has no error result;
// per the contract above, a failed encoding is reported as 0.
Count(input string) int
// Encode converts input into its sequence of token values.
Encode(input string) ([]int, error)
// Decode converts a sequence of token values back into text.
Decode(input []int) (string, error)
// Allowed returns a non-nil error when input contains a special-token
// sequence that this tokenizer has not been configured to accept.
Allowed(input string) error
}
// Option is a functional option for a tokenizer, such as [WithSpecialTokens] or
// [WithSpecialTokensAsText]. Options are passed to [GetTokenizer], which calls
// each one in order against a fresh tokenizerOptions value before creating the
// tokenizer.
type Option func(*tokenizerOptions)
// tokenizerOptions collects data from our functional options. GetTokenizer
// hands both fields to the registered factory for the requested encoding.
type tokenizerOptions struct {
// AllowSpecialAsText is set to true by WithSpecialTokensAsText; it is false
// by default.
AllowSpecialAsText bool
// AllowedSpecialTokens accumulates the token strings supplied through
// WithSpecialTokens; it is nil when that option is never used.
AllowedSpecialTokens []string
}
// These errors can be returned by functions in this library. Errors will be
// wrapped with fmt.Errorf; use [errors.Is] or [errors.As] to check for the
// underlying error type.
var (
// ErrUnknownEncoding is wrapped by [GetTokenizer] when the requested
// encoding name has no registered factory.
ErrUnknownEncoding = errors.New("unknown tokenizer encoding")
// ErrInvalidToken is not produced anywhere in this file.
// NOTE(review): presumably reported by Decode for a token value the
// encoding does not know -- confirm against the encoding packages.
ErrInvalidToken = errors.New("invalid token")
// ErrSpecialToken is not produced anywhere in this file.
// NOTE(review): presumably reported by Encode/Allowed when the input
// contains a special token that was not allowed -- confirm against the
// encoding packages.
ErrSpecialToken = errors.New("unexpected special token found")
)
// Package-level registry of tokenizer encodings.
var (
// registered maps an encoding name to the factory that builds its
// Tokenizer. The factory's arguments are the AllowSpecialAsText flag and
// the AllowedSpecialTokens list, in that order (see GetTokenizer).
registered = make(map[string]func(bool, []string) (Tokenizer, error))
// regMu guards registered: RegisterTokenizer takes the write lock, while
// GetTokenizer and ListTokenizers take the read lock.
regMu sync.RWMutex
)
// GetTokenizer returns a tokenizer by its encoding name. If no matching
// registered encoding is found, an error is returned that wraps
// [ErrUnknownEncoding]. An encoding that was registered with a nil factory is
// reported the same way rather than causing a panic.
//
// GetTokenizer supports functional options to configure the returned Tokenizer.
// Options are applied in the order given; nil options are ignored. The default
// configuration, if no options are specified, disallows special tokens in the
// input.
//
// If special tokens are not applicable, using [WithSpecialTokensAsText] will
// allow the tokenizer to process any input string without raising an error. If
// special tokens should be supported by the Tokenizer, list the specific ones
// to allow using the option [WithSpecialTokens].
//
// The following encoding names are supported:
//
//   - "cl100k_base" in [github.com/peterheb/gotoken/cl100kbase]
//   - "p50k_base" and "p50k_edit" in [github.com/peterheb/gotoken/p50kbase]
//   - "r50k_base" in [github.com/peterheb/gotoken/r50kbase]
//
// Any error produced by the encoding's factory is returned unchanged.
func GetTokenizer(encodingName string, opts ...Option) (Tokenizer, error) {
	// Fold the functional options into one settings value. This touches no
	// shared state, so it is done before taking the registry lock.
	var options tokenizerOptions
	for _, opt := range opts {
		if opt != nil {
			opt(&options)
		}
	}

	// Hold the read lock only for the map lookup. The factory is invoked after
	// the lock is released so that a slow factory does not stall
	// RegisterTokenizer, and so that a factory which itself consults the
	// registry cannot deadlock behind a waiting writer (sync.RWMutex forbids
	// recursive read locking).
	regMu.RLock()
	tokenFactory, ok := registered[encodingName]
	regMu.RUnlock()

	if !ok || tokenFactory == nil {
		return nil, fmt.Errorf("%w: %s", ErrUnknownEncoding, encodingName)
	}

	// Return a new tokenizer instance.
	return tokenFactory(options.AllowSpecialAsText, options.AllowedSpecialTokens)
}
// ListTokenizers reports the names of every encoding currently registered,
// sorted in ascending order. Each returned name is a valid first argument to
// [GetTokenizer]. The result is a freshly allocated slice; it is empty (but
// non-nil) when no encoding package has been imported.
func ListTokenizers() []string {
	// Snapshot the keys under the read lock.
	regMu.RLock()
	names := make([]string, 0, len(registered))
	for name := range registered {
		names = append(names, name)
	}
	regMu.RUnlock()

	// Map iteration order is random, so sort for a stable result.
	sort.Strings(names)
	return names
}
// RegisterTokenizer makes an encoding available to [GetTokenizer] under the
// given name. It is typically called from the init function of an encoding
// package.
//
// tokFactory is called by GetTokenizer with the "special tokens as text" flag
// and the list of allowed special tokens, and returns the configured
// Tokenizer. Registering a name that is already present replaces the earlier
// factory.
func RegisterTokenizer(name string, tokFactory func(bool, []string) (Tokenizer, error)) {
	regMu.Lock()
	registered[name] = tokFactory
	regMu.Unlock()
}
// WithSpecialTokensAsText returns an option for [GetTokenizer] that makes the
// tokenizer treat special tokens as ordinary text. With it, strings such as
// "<|endoftext|>" are encoded as text tokens instead of producing an encoding
// error, which is what happens by default.
func WithSpecialTokensAsText() func(*tokenizerOptions) {
	asText := func(o *tokenizerOptions) {
		o.AllowSpecialAsText = true
	}
	return asText
}
// WithSpecialTokens is a functional option for [GetTokenizer] that configures
// the tokenizer to encode special tokens to their special token values. This
// should only be used when a Tokenizer is encoding trusted input.
//
// The given tokens are appended to any already collected, so the option may be
// supplied more than once and the lists accumulate. The tokens are copied; the
// caller's slice is not retained.
func WithSpecialTokens(tokens ...string) func(*tokenizerOptions) {
	return func(opts *tokenizerOptions) {
		// A single variadic append replaces the element-by-element loop.
		opts.AllowedSpecialTokens = append(opts.AllowedSpecialTokens, tokens...)
	}
}