@@ -39,7 +39,8 @@ function canSetUnknownToken(test)
 tok = bert.tokenizer.internal.WordPieceTokenizer(enc,'UnknownToken',unk);
 test.verifyEqual(tok.Unk,unk)
 str = "blah";
-act_out = tok.tokenize(str);
+ustr = textanalytics.unicode.UTF32(str);
+act_out = tok.tokenize(ustr);
 exp_out = unk;
 test.verifyEqual(act_out,exp_out);
 end
@@ -50,7 +51,8 @@ function canSetMaxTokenLength(test)
 tok = bert.tokenizer.internal.WordPieceTokenizer(enc,'MaxTokenLength',maxLen);
 test.verifyEqual(tok.MaxChar,maxLen);
 str = "foo";
-act_out = tok.tokenize(str);
+ustr = textanalytics.unicode.UTF32(str);
+act_out = tok.tokenize(ustr);
 exp_out = tok.Unk;
 test.verifyEqual(act_out,exp_out);
 end
@@ -59,7 +61,9 @@ function canTokenize(test)
 enc = wordEncoding(["foo","bar","##foo"]);
 tok = bert.tokenizer.internal.WordPieceTokenizer(enc);
 str = "foo bar foobar barba bafoobar barfoo";
-act_out = tok.tokenize(str);
+wsTok = bert.tokenizer.internal.WhitespaceTokenizer;
+ustr = textanalytics.unicode.UTF32(wsTok.tokenize(str));
+act_out = tok.tokenize(ustr);
 exp_out = ["foo","bar",tok.Unk,tok.Unk,tok.Unk,"bar","##foo"];
 test.verifyEqual(act_out,exp_out);
 end