Skip to content

Commit

Permalink
New TikToken tokenizers package (#440)
Browse files Browse the repository at this point in the history
New tokenizers, up to date for GPT4
  • Loading branch information
dluc committed Apr 27, 2024
1 parent 4598fb2 commit ccbc801
Show file tree
Hide file tree
Showing 12 changed files with 222 additions and 12 deletions.
8 changes: 5 additions & 3 deletions Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
<PackageVersion Include="Microsoft.Extensions.Logging" Version="8.0.0" />
<PackageVersion Include="Microsoft.Extensions.Logging.Abstractions" Version="8.0.1" />
<PackageVersion Include="Microsoft.Extensions.Logging.TraceSource" Version="8.0.0" />
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="0.22.0-preview.24179.1" />
<PackageVersion Include="MongoDB.Driver.GridFS" Version="2.25.0" />
<PackageVersion Include="Moq" Version="4.20.70" />
<PackageVersion Include="PdfPig" Version="0.1.8" />
Expand Down Expand Up @@ -76,18 +77,19 @@
</ItemGroup>
<!-- Tests -->
<ItemGroup>
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.9.0" />
<PackageVersion Include="Xunit.DependencyInjection" Version="9.2.1" />
<PackageVersion Include="Xunit.DependencyInjection.Logging" Version="9.0.0" />
<PackageVersion Include="coverlet.collector" Version="6.0.2">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageVersion>
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.9.0" />
<PackageVersion Include="xunit" Version="2.7.1" />
<PackageVersion Include="xunit.abstractions" Version="2.0.3" />
<PackageVersion Include="xunit.extensibility.core" Version="2.7.1" />
<PackageVersion Include="xunit.runner.visualstudio" Version="2.5.8">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageVersion>
<PackageVersion Include="Xunit.DependencyInjection" Version="9.2.1" />
<PackageVersion Include="Xunit.DependencyInjection.Logging" Version="9.0.0" />
</ItemGroup>
</Project>
13 changes: 13 additions & 0 deletions KernelMemory.sln
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Service.AspNetCore", "servi
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "111-dotnet-azure-ai-hybrid-search", "examples\111-dotnet-azure-ai-hybrid-search\111-dotnet-azure-ai-hybrid-search.csproj", "{28534545-CB39-446A-9EB9-A5ABBFE0CFD3}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TikToken", "extensions\TikToken\TikToken\TikToken.csproj", "{91757AC4-4FE3-40FE-96D6-1DDEDFB4A830}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TikToken.UnitTests", "extensions\TikToken\TikToken.UnitTests\TikToken.UnitTests.csproj", "{8ADA17CD-B779-4817-B10A-E9D7B019088D}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -335,6 +339,8 @@ Global
{C2D3A947-B6F9-4306-BD42-21D8D1F42750} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
{A46B0BE1-03F2-4520-A3DA-FD845BA1FD69} = {87DEAE8D-138C-4FDD-B4C9-11C3A7817E8F}
{28534545-CB39-446A-9EB9-A5ABBFE0CFD3} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
{91757AC4-4FE3-40FE-96D6-1DDEDFB4A830} = {155DA079-E267-49AF-973A-D1D44681970F}
{8ADA17CD-B779-4817-B10A-E9D7B019088D} = {3C17F42B-CFC8-4900-8CFB-88936311E919}
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{8A9FA587-7EBA-4D43-BE47-38D798B1C74C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
Expand Down Expand Up @@ -542,5 +548,12 @@ Global
{28534545-CB39-446A-9EB9-A5ABBFE0CFD3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{28534545-CB39-446A-9EB9-A5ABBFE0CFD3}.Debug|Any CPU.Build.0 = Debug|Any CPU
{28534545-CB39-446A-9EB9-A5ABBFE0CFD3}.Release|Any CPU.ActiveCfg = Release|Any CPU
{91757AC4-4FE3-40FE-96D6-1DDEDFB4A830}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{91757AC4-4FE3-40FE-96D6-1DDEDFB4A830}.Debug|Any CPU.Build.0 = Debug|Any CPU
{91757AC4-4FE3-40FE-96D6-1DDEDFB4A830}.Release|Any CPU.ActiveCfg = Release|Any CPU
{91757AC4-4FE3-40FE-96D6-1DDEDFB4A830}.Release|Any CPU.Build.0 = Release|Any CPU
{8ADA17CD-B779-4817-B10A-E9D7B019088D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{8ADA17CD-B779-4817-B10A-E9D7B019088D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{8ADA17CD-B779-4817-B10A-E9D7B019088D}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
EndGlobal
1 change: 1 addition & 0 deletions KernelMemory.sln.DotSettings
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ public void It$SOMENAME$()
<s:Boolean x:Key="/Default/UserDictionary/Words/=SVCS/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=syntaxes/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=testsettings/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=tiktoken/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=tldr/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=typeparam/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Untrust/@EntryIndexedValue">True</s:Boolean>
Expand Down
27 changes: 18 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,16 @@ running the service locally with OpenAPI enabled.

[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.MemoryDb.Qdrant)](https://www.nuget.org/packages/Microsoft.KernelMemory.MemoryDb.Qdrant/)

* **Microsoft.KernelMemory.MemoryDb.Redis:** Memory storage using
**[Redis](extensions/Redis)**.

[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.MemoryDb.Redis)](https://www.nuget.org/packages/Microsoft.KernelMemory.MemoryDb.Redis/)

* **Microsoft.KernelMemory.MongoDbAtlas:** Memory and Files storage using
**[MongoDb Atlas](extensions/MongoDbAtlas)**.

[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.MongoDbAtlas)](https://www.nuget.org/packages/Microsoft.KernelMemory.MongoDbAtlas/)

* **Microsoft.KernelMemory.AI.AzureOpenAI:** Integration with **[Azure OpenAI](extensions/OpenAI)** LLMs.

[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.AI.AzureOpenAI)](https://www.nuget.org/packages/Microsoft.KernelMemory.AI.AzureOpenAI/)
Expand All @@ -386,24 +396,23 @@ running the service locally with OpenAPI enabled.

[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.DataFormats.AzureAIDocIntel)](https://www.nuget.org/packages/Microsoft.KernelMemory.DataFormats.AzureAIDocIntel/)

* **Microsoft.KernelMemory.Orchestration.AzureQueues:** Ingestion and synthetic memory
pipelines via [Azure Queue Storage](extensions/AzureQueues).
* **Microsoft.KernelMemory.Orchestration.AzureQueues:** Ingestion and synthetic memory pipelines
via [Azure Queue Storage](extensions/AzureQueues).

[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.Orchestration.AzureQueues)](https://www.nuget.org/packages/Microsoft.KernelMemory.Orchestration.AzureQueues/)

* **Microsoft.KernelMemory.Orchestration.RabbitMQ:** Ingestion and synthetic memory
pipelines via [RabbitMQ](extensions/RabbitMQ).
* **Microsoft.KernelMemory.Orchestration.RabbitMQ:** Ingestion and synthetic memory pipelines
via [RabbitMQ](extensions/RabbitMQ).

[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.Orchestration.RabbitMQ)](https://www.nuget.org/packages/Microsoft.KernelMemory.Orchestration.RabbitMQ/)

* **Microsoft.KernelMemory.ContentStorage.AzureBlobs:** Used to store content on
[Azure Storage Blobs](extensions/AzureBlobs).
* **Microsoft.KernelMemory.ContentStorage.AzureBlobs:** Used to store files
in [Azure Storage Blobs](extensions/AzureBlobs).

[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.ContentStorage.AzureBlobs)](https://www.nuget.org/packages/Microsoft.KernelMemory.ContentStorage.AzureBlobs/)

* **Microsoft.KernelMemory.Core:** The core library, can be used to build custom
pipelines and handlers, and contains a serverless client to use memory in a
synchronous way, without the web service. .NET 6+.
* **Microsoft.KernelMemory.Core:** The core library. It can be used to build custom pipelines and handlers, and it contains
a serverless client to use memory in a synchronous way, without the web service.

[![Nuget package](https://img.shields.io/nuget/vpre/Microsoft.KernelMemory.Core)](https://www.nuget.org/packages/Microsoft.KernelMemory.Core/)
[![Example code](https://img.shields.io/badge/example-code-blue)](examples/001-dotnet-Serverless)
Expand Down
7 changes: 7 additions & 0 deletions extensions/TikToken/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Kernel Memory with TikToken tokenization

[![Discord](https://img.shields.io/discord/1063152441819942922?label=Discord&logo=discord&logoColor=white&color=d82679)](https://aka.ms/KMdiscord)

This project contains
the [TikToken](https://github.com/openai/tiktoken)
tokenizers for Kernel Memory (GPT2, GPT3, and GPT 3.5 / GPT 4), implemented using
the `Microsoft.ML.Tokenizers` package.
16 changes: 16 additions & 0 deletions extensions/TikToken/TikToken.UnitTests/Startup.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.

/* IMPORTANT: the Startup class must be at the root of the namespace and
* the namespace must match exactly (required by Xunit.DependencyInjection) */

using Microsoft.Extensions.Hosting;

namespace TikToken.UnitTests;

/// <summary>
/// Startup class required by Xunit.DependencyInjection for this test project.
/// It must stay at the root of the <c>TikToken.UnitTests</c> namespace.
/// </summary>
public class Startup
{
// ReSharper disable once UnusedMember.Global
public void ConfigureHost(IHostBuilder hostBuilder)
{
// Intentionally empty: the host builder is left untouched.
// NOTE(review): presumably invoked by Xunit.DependencyInjection via convention - confirm.
}
}
37 changes: 37 additions & 0 deletions extensions/TikToken/TikToken.UnitTests/TikToken.UnitTests.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<RollForward>LatestMajor</RollForward>
<AssemblyName>Microsoft.TikToken.UnitTests</AssemblyName>
<IsTestProject>true</IsTestProject>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
<GenerateDocumentationFile>false</GenerateDocumentationFile>
<NoWarn>$(NoWarn);</NoWarn>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\service\tests\TestHelpers\TestHelpers.csproj" />
<ProjectReference Include="..\TikToken\TikToken.csproj" />
</ItemGroup>

<ItemGroup>
<PackageReference Include="coverlet.collector">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="Microsoft.Extensions.DependencyInjection" />
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="Xunit.DependencyInjection" />
<PackageReference Include="Xunit.DependencyInjection.Logging" />
<PackageReference Include="xunit" />
<PackageReference Include="xunit.abstractions" />
<PackageReference Include="xunit.runner.visualstudio">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

</Project>
27 changes: 27 additions & 0 deletions extensions/TikToken/TikToken.UnitTests/TikTokenTokenizersTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI.OpenAI;
using Microsoft.KernelMemory.AI.TikToken;
using Microsoft.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace TikToken.UnitTests;

/// <summary>
/// Verifies the token counts reported by the default GPT tokenizer and by
/// the TikToken GPT2, GPT3 and GPT4 tokenizers for a fixed sample text.
/// </summary>
public class TikTokenTokenizers : BaseUnitTestCase
{
    public TikTokenTokenizers(ITestOutputHelper output) : base(output)
    {
    }

    [Fact]
    public void TheyCountTokens()
    {
        // Sample containing the "<|endoftext|>" marker several times
        const string sample = "{'bos_token': '<|endoftext|>',\n 'eos_token': '<|endoftext|>',\n 'unk_token': '<|endoftext|>'}";

        // Each tokenizer is created and checked in turn, so a failure points at a single tokenizer
        int defaultCount = new DefaultGPTTokenizer().CountTokens(sample);
        Assert.Equal(47, defaultCount);

        int gpt2Count = new TikTokenGPT2Tokenizer().CountTokens(sample);
        Assert.Equal(29, gpt2Count);

        int gpt3Count = new TikTokenGPT3Tokenizer().CountTokens(sample);
        Assert.Equal(29, gpt3Count);

        int gpt4Count = new TikTokenGPT4Tokenizer().CountTokens(sample);
        Assert.Equal(21, gpt4Count);
    }
}
40 changes: 40 additions & 0 deletions extensions/TikToken/TikToken/TikToken.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<RollForward>LatestMajor</RollForward>
<AssemblyName>Microsoft.KernelMemory.AI.TikToken</AssemblyName>
<RootNamespace>Microsoft.KernelMemory.AI.TikToken</RootNamespace>
<NoWarn>$(NoWarn);</NoWarn>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.KernelMemory.Abstractions" Condition="'$(SolutionName)' != 'KernelMemoryDev'" />
<ProjectReference Include="..\..\..\service\Abstractions\Abstractions.csproj" Condition="'$(SolutionName)' == 'KernelMemoryDev'" />
</ItemGroup>

<ItemGroup>
<PackageReference Include="Microsoft.ML.Tokenizers" />
</ItemGroup>

<ItemGroup>
    <InternalsVisibleTo Include="Microsoft.UnitTests" />
    <!-- The TikToken.UnitTests project sets AssemblyName to "Microsoft.TikToken.UnitTests",
         so that is the name that must be listed for its tests to see internal members. -->
    <InternalsVisibleTo Include="Microsoft.TikToken.UnitTests" />
</ItemGroup>

<Import Project="../../../code-analysis.props" />

<Import Project="../../../nuget-package.props" />

<PropertyGroup>
<IsPackable>true</IsPackable>
<PackageId>Microsoft.KernelMemory.AI.TikToken</PackageId>
<Product>TikToken tokenizers for Kernel Memory</Product>
<Description>Provide TikToken tokenizers in Kernel Memory</Description>
<PackageTags>TikToken, Tokenization, BPE, GPT4, GPT, Memory, RAG, Kernel Memory, Semantic Memory, Episodic Memory, Declarative Memory, AI, Artificial Intelligence, Embeddings, Vector DB, Vector Search, Memory DB, ETL</PackageTags>
</PropertyGroup>

<ItemGroup>
<None Include="..\README.md" Link="README.md" Pack="true" PackagePath="." Visible="false" />
</ItemGroup>

</Project>
19 changes: 19 additions & 0 deletions extensions/TikToken/TikToken/TikTokenGPT2Tokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.ML.Tokenizers;

namespace Microsoft.KernelMemory.AI.TikToken;

/// <summary>
/// Token counter for GPT2, using the TikToken encoding (gpt2.tiktoken)
/// </summary>
public class TikTokenGPT2Tokenizer : ITextTokenizer
{
    // Built once by the static initializer and reused by every instance
    private static readonly Tokenizer s_gpt2Encoder = Tokenizer.CreateTiktokenForModel("gpt2");

    /// <inheritdoc />
    public int CountTokens(string text) => s_gpt2Encoder.CountTokens(text);
}
19 changes: 19 additions & 0 deletions extensions/TikToken/TikToken/TikTokenGPT3Tokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.ML.Tokenizers;

namespace Microsoft.KernelMemory.AI.TikToken;

/// <summary>
/// Token counter for GPT3, using the TikToken encoding (p50k_base.tiktoken)
/// </summary>
public class TikTokenGPT3Tokenizer : ITextTokenizer
{
    // Built once by the static initializer and reused by every instance;
    // the encoding is selected through the "text-davinci-003" model name
    private static readonly Tokenizer s_gpt3Encoder = Tokenizer.CreateTiktokenForModel("text-davinci-003");

    /// <inheritdoc />
    public int CountTokens(string text) => s_gpt3Encoder.CountTokens(text);
}
20 changes: 20 additions & 0 deletions extensions/TikToken/TikToken/TikTokenGPT4Tokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

namespace Microsoft.KernelMemory.AI.TikToken;

/// <summary>
/// Token counter for GPT 3.5 and GPT 4+, using the TikToken encoding
/// (cl100k_base.tiktoken) plus two extra special tokens
/// </summary>
public class TikTokenGPT4Tokenizer : ITextTokenizer
{
    // Built once by the static initializer and reused by every instance;
    // the two extra special tokens are registered on top of the "gpt-4" model encoding
    private static readonly Tokenizer s_gpt4Encoder = Tokenizer.CreateTiktokenForModel(
        "gpt-4",
        new Dictionary<string, int>
        {
            ["<|im_start|>"] = 100264,
            ["<|im_end|>"] = 100265
        });

    /// <inheritdoc />
    public int CountTokens(string text) => s_gpt4Encoder.CountTokens(text);
}

0 comments on commit ccbc801

Please sign in to comment.