Skip to content

Commit

Permalink
[translation] Reimplement UTF-8 check
Browse files Browse the repository at this point in the history
This is needed because our CPython build doesn't have UnicodeDecodeError.

Also, Python 2 accepts UTF-8-like byte sequences that encode code points in
the surrogate range, which is not valid UTF-8.
  • Loading branch information
Andy C committed Jan 12, 2024
1 parent 8c1192d commit 2e69c64
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 13 deletions.
13 changes: 1 addition & 12 deletions data_lang/pyj8.py
Expand Up @@ -21,18 +21,7 @@ def WriteString(s, options, buf):
buf.write(fastfunc.J8EncodeString(s, j8_fallback))


def PartIsUtf8(s, start, end):
    # type: (str, int, int) -> bool
    """Return True if the slice s[start:end] decodes as UTF-8, else False.

    Used for J8 decoding.  TODO: Could also replace this with fastfunc?

    NOTE: this relies on the interpreter's 'utf-8' codec.  Python 2's codec
    also accepts byte sequences that encode code points in the surrogate
    range, which is not valid UTF-8.
    """
    try:
        # Only the decode can raise UnicodeDecodeError; the result is unused.
        s[start:end].decode('utf-8')
    except UnicodeDecodeError:
        return False
    return True
PartIsUtf8 = fastfunc.PartIsUtf8


# vim: sw=4
34 changes: 33 additions & 1 deletion pyext/fastfunc.c
Expand Up @@ -6,12 +6,13 @@
#include <stdlib.h>

#include "data_lang/j8c.h"
#include "data_lang/utf8_impls/bjoern_dfa.h"

#include <Python.h>

// Log messages to stderr.
static void debug(const char* fmt, ...) {
#if 1
#if 0
va_list args;
va_start(args, fmt);
vfprintf(stderr, fmt, args);
Expand All @@ -36,8 +37,39 @@ func_J8EncodeString(PyObject *self, PyObject *args) {
return ret;
}

// PartIsUtf8(s, start, end) -> bool
//
// Runs the bytes s[start] .. s[end-1] through the decode() state machine and
// reports whether they form complete, valid UTF-8.
//
// Returns:
//   True  if the state is UTF8_ACCEPT after the last byte (an empty range is
//         therefore True).
//   False if any byte drives the state to UTF8_REJECT, or if the range ends
//         in the middle of a multi-byte sequence.
//   NULL with ValueError set if start is negative or end is past the end of
//         the string; NULL with the PyArg_ParseTuple error on bad arguments.
static PyObject *
func_PartIsUtf8(PyObject *self, PyObject *args) {
  j8_buf_t in;
  int start;
  int end;

  if (!PyArg_ParseTuple(args, "s#ii", &(in.data), &(in.len), &start, &end)) {
    return NULL;
  }

  // Bounds check for safety.  This must be a runtime check rather than an
  // assert(): assert() is compiled out under NDEBUG, and a bad index coming
  // from Python would then read outside the string buffer.
  if (start < 0 || end > in.len) {
    PyErr_SetString(PyExc_ValueError, "PartIsUtf8: start or end out of range");
    return NULL;
  }

  // Initialized so decode() never observes an indeterminate value.
  uint32_t codepoint = 0;
  uint32_t state = UTF8_ACCEPT;

  for (int i = start; i < end; ++i) {
    // Go through an unsigned char so bytes >= 0x80 are not passed as negative
    // values if in.data is plain char.  Should really change BigStr* to use
    // an unsigned type.
    unsigned char c = in.data[i];
    decode(&state, &codepoint, c);
    if (state == UTF8_REJECT) {
      // Invalid byte: no need to look at the rest of the range.
      return PyBool_FromLong(0);
    }
  }

  // Anything other than UTF8_ACCEPT here means a truncated sequence.
  return PyBool_FromLong(state == UTF8_ACCEPT);
}


// Table of the Python-callable functions defined in this file.
// Each row is: Python-visible name, C implementation, calling convention,
// docstring (currently empty).
static PyMethodDef methods[] = {
    {"J8EncodeString", func_J8EncodeString, METH_VARARGS, ""},
    {"PartIsUtf8", func_PartIsUtf8, METH_VARARGS, ""},

    // All-NULL sentinel row terminates the table.
    {NULL, NULL, 0, NULL},
};
Expand Down
2 changes: 2 additions & 0 deletions pyext/fastfunc.pyi
@@ -1,2 +1,4 @@

def J8EncodeString(s: str, j8_fallback: int) -> str: ...

def PartIsUtf8(s: str, start: int, end: int) -> bool: ...
4 changes: 4 additions & 0 deletions pyext/fastfunc_test.py
Expand Up @@ -24,6 +24,10 @@ def testEncode(self):
x = fastfunc.J8EncodeString(s, 1)
print(x)

def testUtf8(self):
    """PartIsUtf8 is True for the 'hi ' prefix and False for the 0xff byte."""
    data = 'hi \xff'

    # Bytes 0..2 are plain ASCII.
    prefix_result = fastfunc.PartIsUtf8(data, 0, 3)
    self.assertEqual(True, prefix_result)

    # Byte 3 is a lone 0xff.
    last_byte_result = fastfunc.PartIsUtf8(data, 3, 4)
    self.assertEqual(False, last_byte_result)


if __name__ == '__main__':
Expand Down

0 comments on commit 2e69c64

Please sign in to comment.