Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 244 additions & 0 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
---
Language: Cpp
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignArrayOfStructures: None
AlignConsecutiveAssignments:
  Enabled: false
  AcrossEmptyLines: false
  AcrossComments: false
  AlignCompound: false
  AlignFunctionPointers: false
  PadOperators: true
AlignConsecutiveBitFields:
  Enabled: false
  AcrossEmptyLines: false
  AcrossComments: false
  AlignCompound: false
  AlignFunctionPointers: false
  PadOperators: true
AlignConsecutiveDeclarations:
  Enabled: false
  AcrossEmptyLines: false
  AcrossComments: false
  AlignCompound: false
  AlignFunctionPointers: false
  PadOperators: true
AlignConsecutiveMacros:
  Enabled: false
  AcrossEmptyLines: false
  AcrossComments: false
  AlignCompound: false
  AlignFunctionPointers: false
  PadOperators: true
AlignConsecutiveShortCaseStatements:
  Enabled: false
  AcrossEmptyLines: false
  AcrossComments: false
  AlignCaseColons: false
AlignEscapedNewlines: Left
AlignOperands: DontAlign
AlignTrailingComments:
  Kind: Never
  OverEmptyLines: 0
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowBreakBeforeNoexceptSpecifier: Never
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortCompoundRequirementOnASingleLine: true
AllowShortEnumsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: All
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
AttributeMacros:
  - __capability
BinPackArguments: false
BinPackParameters: false
BitFieldColonSpacing: Both
BraceWrapping:
  AfterCaseLabel: false
  AfterClass: false
  AfterControlStatement: Never
  AfterEnum: false
  AfterExternBlock: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: true
  SplitEmptyRecord: true
  SplitEmptyNamespace: true
BreakAdjacentStringLiterals: true
BreakAfterAttributes: Leave
BreakAfterJavaFieldAnnotations: false
BreakArrays: true
BreakBeforeBinaryOperators: None
BreakBeforeConceptDeclarations: Always
BreakBeforeBraces: Attach
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeColon
BreakInheritanceList: BeforeColon
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - FOR_EACH
  - FOR_EACH_R
  - FOR_EACH_RANGE
IfMacros:
  - KJ_IF_MAYBE
IncludeBlocks: Preserve
IncludeCategories:
  - Regex: '^<.*\.h(pp)?>'
    Priority: 1
    SortPriority: 0
    CaseSensitive: false
  - Regex: '^<.*'
    Priority: 2
    SortPriority: 0
    CaseSensitive: false
  - Regex: '.*'
    Priority: 3
    SortPriority: 0
    CaseSensitive: false
IncludeIsMainRegex: '(Test)?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: false
IndentCaseLabels: true
IndentExternBlock: AfterExternBlock
IndentGotoLabels: true
IndentPPDirectives: None
IndentRequiresClause: true
IndentWidth: 2
IndentWrappedFunctionNames: false
InsertBraces: false
InsertNewlineAtEOF: false
InsertTrailingCommas: None
IntegerLiteralSeparator:
  Binary: 0
  BinaryMinDigits: 0
  Decimal: 0
  DecimalMinDigits: 0
  Hex: 0
  HexMinDigits: 0
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
KeepEmptyLinesAtEOF: false
LambdaBodyIndentation: Signature
LineEnding: DeriveLF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 2
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PackConstructorInitializers: NextLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakOpenParenthesis: 0
PenaltyBreakScopeResolution: 500
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyIndentedWhitespace: 0
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
PPIndentWidth: -1
QualifierAlignment: Leave
ReferenceAlignment: Pointer
ReflowComments: true
RemoveBracesLLVM: false
RemoveParentheses: Leave
RemoveSemicolon: false
RequiresClausePosition: OwnLine
RequiresExpressionIndentation: OuterScope
SeparateDefinitionBlocks: Leave
ShortNamespaceLines: 1
SkipMacroDefinitionBody: false
SortIncludes: CaseSensitive
SortJavaStaticImport: Before
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceAroundPointerQualifiers: Default
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeJsonColon: false
SpaceBeforeParens: ControlStatements
SpaceBeforeParensOptions:
  AfterControlStatements: true
  AfterForeachMacros: true
  AfterFunctionDefinitionName: false
  AfterFunctionDeclarationName: false
  AfterIfMacros: true
  AfterOverloadedOperator: false
  AfterPlacementOperator: true
  AfterRequiresInClause: false
  AfterRequiresInExpression: false
  BeforeNonEmptyParentheses: false
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyBlock: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParens: Never
SpacesInParensOptions:
  InCStyleCasts: false
  InConditionalStatements: false
  InEmptyParentheses: false
  Other: false
SpacesInSquareBrackets: false
Standard: Latest
StatementAttributeLikeMacros:
  - Q_EMIT
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
TabWidth: 8
UseTab: Never
VerilogBreakBetweenInstancePorts: true
WhitespaceSensitiveMacros:
  - BOOST_PP_STRINGIZE
  - CF_SWIFT_NAME
  - NS_SWIFT_NAME
  - PP_STRINGIZE
  - STRINGIZE
...
46 changes: 25 additions & 21 deletions src/pre_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// @lint-ignore-every LICENSELINT

// Local
#include <pytorch/tokenizers/pre_tokenizer.h>
Expand Down Expand Up @@ -63,35 +64,37 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
"Missing pretokenizers for PreTokenizer of type Sequence");
}
std::vector<PreTokenizer::Ptr> pretoks;
std::transform(pretokenizers->begin(), pretokenizers->end(),
std::back_inserter(pretoks),
[](const PreTokenizerConfig &cfg) { return cfg.create(); });
std::transform(
pretokenizers->begin(),
pretokenizers->end(),
std::back_inserter(pretoks),
[](const PreTokenizerConfig& cfg) { return cfg.create(); });
return PreTokenizer::Ptr(new SequencePreTokenizer(pretoks));
}
throw std::runtime_error("Unsupported PreTokenizer type: " + type);
}

PreTokenizerConfig &PreTokenizerConfig::parse_json(const json &json_config) {
PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
type = json_config.at("type");
if (type == "Split") {
try {
pattern = json_config.at("pattern");
} catch (json::out_of_range &) {
} catch (json::out_of_range&) {
}
} else if (type == "Digits") {
try {
individual_digits = json_config.at("individual_digits");
} catch (json::out_of_range &) {
} catch (json::out_of_range&) {
}
} else if (type == "ByteLevel") {
try {
add_prefix_space = json_config.at("add_prefix_space");
} catch (json::out_of_range &) {
} catch (json::out_of_range&) {
}
// TODO: trim_offsets, use_regex
} else if (type == "Sequence") {
pretokenizers = std::vector<PreTokenizerConfig>();
for (const auto &entry : json_config.at("pretokenizers")) {
for (const auto& entry : json_config.at("pretokenizers")) {
pretokenizers->push_back(PreTokenizerConfig().parse_json(entry));
}
} else {
Expand All @@ -102,14 +105,14 @@ PreTokenizerConfig &PreTokenizerConfig::parse_json(const json &json_config) {

// RegexPreTokenizer ///////////////////////////////////////////////////////////

RegexPreTokenizer::Re2UPtr
RegexPreTokenizer::create_regex_(const std::string &pattern) {
RegexPreTokenizer::Re2UPtr RegexPreTokenizer::create_regex_(
const std::string& pattern) {
assert(!pattern.empty());
return std::make_unique<re2::RE2>("(" + pattern + ")");
}

std::vector<std::string>
RegexPreTokenizer::pre_tokenize(re2::StringPiece input) const {
std::vector<std::string> RegexPreTokenizer::pre_tokenize(
re2::StringPiece input) const {
std::vector<std::string> result;
std::string piece;
while (RE2::FindAndConsume(&input, *regex_, &piece)) {
Expand All @@ -136,13 +139,14 @@ constexpr char GPT2_EXPR[] =
// Construction //
//////////////////

ByteLevelPreTokenizer::ByteLevelPreTokenizer(bool add_prefix_space,
const std::string &pattern)
ByteLevelPreTokenizer::ByteLevelPreTokenizer(
bool add_prefix_space,
const std::string& pattern)
: pattern_(pattern.empty() ? GPT2_EXPR : pattern),
add_prefix_space_(add_prefix_space) {}

std::vector<std::string>
ByteLevelPreTokenizer::pre_tokenize(re2::StringPiece input) const {
std::vector<std::string> ByteLevelPreTokenizer::pre_tokenize(
re2::StringPiece input) const {
// Add the prefix space if configured to do so
std::string input_str(input);
if (add_prefix_space_ && !input_str.empty() && input_str[0] != ' ') {
Expand All @@ -158,13 +162,13 @@ SequencePreTokenizer::SequencePreTokenizer(
std::vector<PreTokenizer::Ptr> pre_tokenizers)
: pre_tokenizers_(std::move(pre_tokenizers)) {}

std::vector<std::string>
SequencePreTokenizer::pre_tokenize(re2::StringPiece input) const {
std::vector<std::string> SequencePreTokenizer::pre_tokenize(
re2::StringPiece input) const {
std::vector<std::string> pieces{std::string(input)};
for (const auto &pre_tokenizer : pre_tokenizers_) {
for (const auto& pre_tokenizer : pre_tokenizers_) {
std::vector<std::string> new_pieces;
for (const auto &piece : pieces) {
for (const auto &subpiece : pre_tokenizer->pre_tokenize(piece)) {
for (const auto& piece : pieces) {
for (const auto& subpiece : pre_tokenizer->pre_tokenize(piece)) {
new_pieces.push_back(subpiece);
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/token_decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ TokenDecoder::Ptr TokenDecoderConfig::create() const {
throw std::runtime_error("Unsupported TokenDecoder type: " + type);
}

TokenDecoderConfig &TokenDecoderConfig::parse_json(const json &json_config) {
TokenDecoderConfig& TokenDecoderConfig::parse_json(const json& json_config) {
type = json_config.at("type");
if (type == "ByteLevel") {
// No parameters to parse
Expand All @@ -54,7 +54,7 @@ namespace {
// Copied from llama.cpp
// CITE:
// https://github.com/ggerganov/llama.cpp/blob/master/src/llama-vocab.cpp#L20
static std::string format(const char *fmt, ...) {
static std::string format(const char* fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
Expand Down Expand Up @@ -84,7 +84,7 @@ std::string ByteLevelTokenDecoder::decode(re2::StringPiece token) const {
const auto utf8 = unicode_cpt_to_utf8(cpt);
try {
decoded_text += unicode_utf8_to_byte(utf8);
} catch (const std::out_of_range & /*e*/) {
} catch (const std::out_of_range& /*e*/) {
decoded_text += "[UNK_BYTE_0x";
for (const auto c : utf8) {
decoded_text += format("%02x", (uint8_t)c);
Expand Down
2 changes: 1 addition & 1 deletion targets.bzl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime", "get_executorch_supported_platforms")
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_executorch_supported_platforms", "runtime")
load("@fbsource//xplat/executorch/third-party:glob_defs.bzl", "subdir_glob")

PLATFORMS = get_executorch_supported_platforms()
Expand Down