Skip to content

Commit

Permalink
Fix incorrect tokenization offsets for strings containing Unicode quotes.
Browse files Browse the repository at this point in the history
  • Loading branch information
davisking committed May 11, 2021
1 parent b6f454f commit faf3ecf
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions mitielib/include/mitie/conll_tokenizer.h
Expand Up @@ -77,6 +77,8 @@ namespace mitie
(unsigned char)token[1] == 0x80 &&
(unsigned char)token[2] == 0x9C)
{
next_token_offset = token_offset + 3;
next_token_front_padding = 0;
next_token = token.substr(3);
token.resize(3);
return result;
Expand All @@ -86,6 +88,8 @@ namespace mitie
(unsigned char)token[token.size()-2] == 0x80 &&
(unsigned char)token[token.size()-1] == 0x9D)
{
next_token_offset = token_offset + token.size()-3;
next_token_front_padding = 0;
next_token = token.substr(token.size()-3);
token.resize(token.size()-3);
return result;
Expand Down

0 comments on commit faf3ecf

Please sign in to comment.