-
Notifications
You must be signed in to change notification settings - Fork 252
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New TikToken tokenizers package (#440)
New tokenizers, up to date for GPT4
- Loading branch information
Showing
12 changed files
with
222 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Kernel Memory with TikToken tokenization | ||
|
||
[![Discord](https://img.shields.io/discord/1063152441819942922?label=Discord&logo=discord&logoColor=white&color=d82679)](https://aka.ms/KMdiscord) | ||
|
||
This project contains | ||
the [TikToken](https://github.com/openai/tiktoken) | ||
tokenizers for Kernel Memory. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
/* IMPORTANT: the Startup class must be at the root of the namespace and | ||
* the namespace must match exactly (required by Xunit.DependencyInjection) */ | ||
|
||
using Microsoft.Extensions.Hosting; | ||
|
||
namespace TikToken.UnitTests; | ||
|
||
/// <summary>
/// Startup class for the TikToken unit tests. Xunit.DependencyInjection requires
/// it to sit at the root of the <c>TikToken.UnitTests</c> namespace.
/// </summary>
public class Startup
{
    /// <summary>
    /// Host configuration hook. Currently a no-op: no host settings are customized.
    /// </summary>
    /// <param name="hostBuilder">Host builder to configure (unused).</param>
    // ReSharper disable once UnusedMember.Global
    public void ConfigureHost(IHostBuilder hostBuilder)
    {
        // Nothing to configure.
    }
}
37 changes: 37 additions & 0 deletions
37
extensions/TikToken/TikToken.UnitTests/TikToken.UnitTests.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<TargetFramework>net8.0</TargetFramework> | ||
<RollForward>LatestMajor</RollForward> | ||
<AssemblyName>Microsoft.TikToken.UnitTests</AssemblyName> | ||
<IsTestProject>true</IsTestProject> | ||
<ImplicitUsings>enable</ImplicitUsings> | ||
<Nullable>enable</Nullable> | ||
<IsPackable>false</IsPackable> | ||
<GenerateDocumentationFile>false</GenerateDocumentationFile> | ||
<NoWarn>$(NoWarn);</NoWarn> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\..\..\service\tests\TestHelpers\TestHelpers.csproj" /> | ||
<ProjectReference Include="..\TikToken\TikToken.csproj" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<PackageReference Include="coverlet.collector"> | ||
<PrivateAssets>all</PrivateAssets> | ||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets> | ||
</PackageReference> | ||
<PackageReference Include="Microsoft.Extensions.DependencyInjection" /> | ||
<PackageReference Include="Microsoft.NET.Test.Sdk" /> | ||
<PackageReference Include="Xunit.DependencyInjection" /> | ||
<PackageReference Include="Xunit.DependencyInjection.Logging" /> | ||
<PackageReference Include="xunit" /> | ||
<PackageReference Include="xunit.abstractions" /> | ||
<PackageReference Include="xunit.runner.visualstudio"> | ||
<PrivateAssets>all</PrivateAssets> | ||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets> | ||
</PackageReference> | ||
</ItemGroup> | ||
|
||
</Project> |
27 changes: 27 additions & 0 deletions
27
extensions/TikToken/TikToken.UnitTests/TikTokenTokenizersTest.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.KernelMemory.AI.OpenAI; | ||
using Microsoft.KernelMemory.AI.TikToken; | ||
using Microsoft.TestHelpers; | ||
using Xunit; | ||
using Xunit.Abstractions; | ||
|
||
namespace TikToken.UnitTests; | ||
|
||
/// <summary>
/// Token-count checks for the default GPT tokenizer and the TikToken tokenizers.
/// </summary>
public class TikTokenTokenizers : BaseUnitTestCase
{
    public TikTokenTokenizers(ITestOutputHelper output)
        : base(output)
    {
    }

    /// <summary>
    /// Counts the tokens of one fixed sample with each tokenizer and compares
    /// every result with its expected value.
    /// </summary>
    [Fact]
    public void TheyCountTokens()
    {
        // Same sample for every tokenizer, so the counts are directly comparable.
        const string sample = "{'bos_token': '<|endoftext|>',\n 'eos_token': '<|endoftext|>',\n 'unk_token': '<|endoftext|>'}";

        // Each tokenizer is created and checked in turn, in the same order as before.
        int defaultCount = new DefaultGPTTokenizer().CountTokens(sample);
        Assert.Equal(47, defaultCount);

        int gpt2Count = new TikTokenGPT2Tokenizer().CountTokens(sample);
        Assert.Equal(29, gpt2Count);

        int gpt3Count = new TikTokenGPT3Tokenizer().CountTokens(sample);
        Assert.Equal(29, gpt3Count);

        int gpt4Count = new TikTokenGPT4Tokenizer().CountTokens(sample);
        Assert.Equal(21, gpt4Count);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<TargetFramework>net8.0</TargetFramework> | ||
<RollForward>LatestMajor</RollForward> | ||
<AssemblyName>Microsoft.KernelMemory.AI.TikToken</AssemblyName> | ||
<RootNamespace>Microsoft.KernelMemory.AI.TikToken</RootNamespace> | ||
<NoWarn>$(NoWarn);</NoWarn> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<PackageReference Include="Microsoft.KernelMemory.Abstractions" Condition="'$(SolutionName)' != 'KernelMemoryDev'" /> | ||
<ProjectReference Include="..\..\..\service\Abstractions\Abstractions.csproj" Condition="'$(SolutionName)' == 'KernelMemoryDev'" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<PackageReference Include="Microsoft.ML.Tokenizers" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup>
    <InternalsVisibleTo Include="Microsoft.UnitTests" />
    <!-- Matches the AssemblyName of the TikToken.UnitTests project, so its tests can reach internal members. -->
    <InternalsVisibleTo Include="Microsoft.TikToken.UnitTests" />
</ItemGroup>
|
||
<Import Project="../../../code-analysis.props" /> | ||
|
||
<Import Project="../../../nuget-package.props" /> | ||
|
||
<PropertyGroup> | ||
<IsPackable>true</IsPackable> | ||
<PackageId>Microsoft.KernelMemory.AI.TikToken</PackageId> | ||
<Product>TikToken tokenizers for Kernel Memory</Product> | ||
<Description>Provide TikToken tokenizers in Kernel Memory</Description> | ||
<PackageTags>TikToken, Tokenization, BPE, GPT4, GPT, Memory, RAG, Kernel Memory, Semantic Memory, Episodic Memory, Declarative Memory, AI, Artificial Intelligence, Embeddings, Vector DB, Vector Search, Memory DB, ETL</PackageTags> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<None Include="..\README.md" Link="README.md" Pack="true" PackagePath="." Visible="false" /> | ||
</ItemGroup> | ||
|
||
</Project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.ML.Tokenizers; | ||
|
||
namespace Microsoft.KernelMemory.AI.TikToken; | ||
|
||
/// <summary>
/// Text tokenizer backed by the TikToken GPT2 encoding (gpt2.tiktoken).
/// </summary>
public class TikTokenGPT2Tokenizer : ITextTokenizer
{
    // One tokenizer shared by all instances, created for the "gpt2" model
    // when the type is initialized.
    private static readonly Tokenizer s_gpt2Tokenizer = Tokenizer.CreateTiktokenForModel("gpt2");

    /// <inheritdoc />
    public int CountTokens(string text) => s_gpt2Tokenizer.CountTokens(text);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.ML.Tokenizers; | ||
|
||
namespace Microsoft.KernelMemory.AI.TikToken; | ||
|
||
/// <summary>
/// Text tokenizer backed by the TikToken GPT3 encoding (p50k_base.tiktoken).
/// </summary>
public class TikTokenGPT3Tokenizer : ITextTokenizer
{
    // One tokenizer shared by all instances, created for the "text-davinci-003"
    // model when the type is initialized.
    private static readonly Tokenizer s_gpt3Tokenizer = Tokenizer.CreateTiktokenForModel("text-davinci-003");

    /// <inheritdoc />
    public int CountTokens(string text) => s_gpt3Tokenizer.CountTokens(text);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Collections.Generic; | ||
using Microsoft.ML.Tokenizers; | ||
|
||
namespace Microsoft.KernelMemory.AI.TikToken; | ||
|
||
/// <summary>
/// Text tokenizer for GPT 3.5 and GPT 4+ (cl100k_base.tiktoken + special tokens).
/// </summary>
public class TikTokenGPT4Tokenizer : ITextTokenizer
{
    // One tokenizer shared by all instances, built once when the type is initialized.
    private static readonly Tokenizer s_gpt4Tokenizer = CreateTokenizer();

    /// <inheritdoc />
    public int CountTokens(string text) => s_gpt4Tokenizer.CountTokens(text);

    // Builds the "gpt-4" tokenizer, passing the two extra special tokens and their ids.
    private static Tokenizer CreateTokenizer()
    {
        var extraSpecialTokens = new Dictionary<string, int>
        {
            ["<|im_start|>"] = 100264,
            ["<|im_end|>"] = 100265,
        };

        return Tokenizer.CreateTiktokenForModel("gpt-4", extraSpecialTokens);
    }
}