Skip to content

Commit

Permalink
utf8: validator
Browse files Browse the repository at this point in the history
For sanity-checking UTF8 without decoding.
  • Loading branch information
markokr committed Jun 19, 2014
1 parent dc00b6f commit 360519f
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 0 deletions.
69 changes: 69 additions & 0 deletions test/test_utf8.c
Expand Up @@ -34,6 +34,24 @@ static int uget4(int a, int b, int c, int d)
return utf8_get_char(&p, buf + 4);
}

/*
 * Test helper: encode code point c as an n-byte UTF-8-style sequence.
 *
 * No validity checks are done on purpose, so callers can build overlong,
 * out-of-range or surrogate sequences.  The lead-byte table has 6 entries,
 * so n is expected to be 1..6.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call; the sequence is not NUL-terminated.
 */
static const char *mkseq(uint32_t c, int n)
{
	static const uint8_t lead[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
	static char buf[8];
	int pos = n;

	/* fill continuation bytes from the last one backwards, 6 bits each */
	while (--pos > 0) {
		buf[pos] = (char)(0x80 | (c & 0x3F));
		c >>= 6;
	}

	/* whatever bits remain go into the lead byte */
	buf[0] = (char)(lead[n - 1] | c);
	return buf;
}

/*
 * Test helper: encode c into n bytes with mkseq() and feed exactly those
 * n bytes to utf8_get_char().  Returns whatever utf8_get_char() returns.
 */
static int readseq(uint32_t c, int n)
{
	const char *cursor = mkseq(c, n);
	const char *limit = cursor + n;

	return utf8_get_char(&cursor, limit);
}

static void test_utf8_char_size(void *p)
{
Expand Down Expand Up @@ -87,6 +105,26 @@ static void test_utf8_get_char(void *p)
int_check(uget1(0xC2), -0xC2);
int_check(uget2(0xE2, 0x82), -0xE2);
int_check(uget3(0xF0, 0xA4, 0xAD), -0xF0);

/* good boundaries */
int_check(readseq(0x7f, 1), 0x7f);
int_check(readseq(0x80, 2), 0x80);
int_check(readseq(0x7ff, 2), 0x7ff);
int_check(readseq(0x800, 3), 0x800);
int_check(readseq(0xffff, 3), 0xffff);
int_check(readseq(0x10000, 4), 0x10000);
int_check(readseq(0x10ffff, 4), 0x10ffff);
int_check(readseq(0xd7ff, 3), 0xd7ff);
int_check(readseq(0xe000, 3), 0xe000);

/* bad boundaries */
int_check(readseq(0x7f, 2), -193);
int_check(readseq(0x7ff, 3), -224);
int_check(readseq(0xffff, 4), -240);
int_check(readseq(0x110000, 4), -244);
int_check(readseq(0x10ffff, 5), -248);
int_check(readseq(0xd800, 3), -237);
int_check(readseq(0xdfff, 3), -237);
end:;
}

Expand Down Expand Up @@ -140,6 +178,36 @@ static void test_utf8_put_char(void *p)
end:;
}

/*
 * Test helper: encode c into n bytes with mkseq() and run
 * utf8_validate_seq() over exactly those n bytes.
 */
static int validseq(uint32_t c, int n)
{
	const char *seq = mkseq(c, n);
	const char *seq_end = seq + n;

	return utf8_validate_seq(seq, seq_end);
}

/*
 * Check utf8_validate_seq() at the edges of each sequence length.
 *
 * Each case encodes a code point into a chosen number of bytes via
 * validseq() and compares the validator's result with the expected
 * sequence length (valid) or 0 (invalid).
 */
static void test_utf8_validate_seq(void *p)
{
/* good boundaries: smallest and largest code point for each length,
 * plus the code points on either side of the surrogate range */
int_check(validseq(0x7f, 1), 1);
int_check(validseq(0x80, 2), 2);
int_check(validseq(0x7ff, 2), 2);
int_check(validseq(0x800, 3), 3);
int_check(validseq(0xffff, 3), 3);
int_check(validseq(0x10000, 4), 4);
int_check(validseq(0x10ffff, 4), 4);
int_check(validseq(0xd7ff, 3), 3);
int_check(validseq(0xe000, 3), 3);

/* bad boundaries: overlong encodings (one byte more than needed),
 * a code point above 0x10ffff, a 5-byte sequence, and both ends of
 * the surrogate range 0xd800..0xdfff */
int_check(validseq(0x7f, 2), 0);
int_check(validseq(0x7ff, 3), 0);
int_check(validseq(0xffff, 4), 0);
int_check(validseq(0x110000, 4), 0);
int_check(validseq(0x10ffff, 5), 0);
int_check(validseq(0xd800, 3), 0);
int_check(validseq(0xdfff, 3), 0);
end:;
}

/*
* Describe
*/
Expand All @@ -149,6 +217,7 @@ struct testcase_t utf8_tests[] = {
{ "utf8_seq_size", test_utf8_seq_size },
{ "utf8_get_char", test_utf8_get_char },
{ "utf8_put_char", test_utf8_put_char },
{ "utf8_validate_seq", test_utf8_validate_seq },
END_OF_TESTCASES
};

80 changes: 80 additions & 0 deletions usual/utf8.c
Expand Up @@ -17,6 +17,7 @@
*/

#include <usual/utf8.h>
#include <usual/err.h>

/*
 * u8head(c, mask): true when the bits of c selected by
 * mask | (mask >> 1) are exactly equal to mask.  For a mask made of
 * leading 1-bits this means "c starts with those 1-bits followed by a 0".
 * Both arguments are fully parenthesized so that expression arguments
 * (e.g. 0x80 | 0x40) expand with the intended precedence.
 */
#define u8head(c, mask) (((c) & ((mask) | ((mask) >> 1))) == (mask))
/* u8tail(c): true when c is a continuation byte (bit pattern 10xxxxxx) */
#define u8tail(c) u8head(c, 0x80)
Expand Down Expand Up @@ -138,3 +139,82 @@ int utf8_seq_size(unsigned char b)
return 0;
}

/*
 * Validate a single UTF-8 sequence without decoding it.
 *
 * Boundary encodings the checks below are built around; "overlong"
 * marks a code point encoded with one byte more than necessary, which
 * must be rejected:
 *
 *       7f: c1bf      (overlong)
 *       80: c280
 *      7ff: dfbf
 *      7ff: e09fbf    (overlong)
 *      800: e0a080
 *     ffff: efbfbf
 *     ffff: f08fbfbf  (overlong)
 *    10000: f0908080
 *   10ffff: f48fbfbf
 *
 * src    - first byte of the sequence
 * srcend - one past the last readable byte
 *
 * Returns the sequence length (1..4) if the sequence starting at src is
 * well-formed and lies completely before srcend.  Returns 0 for an empty
 * range, a NUL byte, a continuation byte or overlong lead byte in first
 * position, a truncated sequence, a bad continuation byte, an overlong
 * encoding, a surrogate (d800..dfff), or anything above 10ffff.
 */
int utf8_validate_seq(const char *src, const char *srcend)
{
	const unsigned char *u = (const unsigned char *)src;
	const unsigned char *uend = (const unsigned char *)srcend;

	/* empty range: there is no lead byte that may be read */
	if (u >= uend)
		goto invalid;

	/*
	 * Length checks compare the remaining byte count (uend - u) instead
	 * of forming u + N, which could point past the end of the buffer.
	 */
	if (u[0] < 0x80) { /* ascii */
		if (u[0] == 0)
			goto invalid;
		return 1;
	} else if (u[0] < 0xC2) { /* tail byte or overlong lead (c0, c1) as first byte */
		goto invalid;
	} else if (u[0] < 0xE0) { /* 1 tail byte */
		if (uend - u < 2)
			goto invalid;

		if ((u[1] & 0xC0) != 0x80)
			goto invalid;
		return 2;
	} else if (u[0] < 0xF0) { /* 2 tail bytes */
		if (uend - u < 3)
			goto invalid;
		/* e0 80..9f would be an overlong encoding of < 800 */
		if (u[0] == 0xE0 && u[1] < 0xA0)
			goto invalid;
		/* ed a0..bf encodes the surrogate range d800..dfff */
		if (u[0] == 0xED && u[1] >= 0xA0)
			goto invalid;
		if ((u[1] & 0xC0) != 0x80)
			goto invalid;
		if ((u[2] & 0xC0) != 0x80)
			goto invalid;
		return 3;
	} else if (u[0] < 0xF5) { /* 3 tail bytes */
		if (uend - u < 4)
			goto invalid;
		/* f0 80..8f would be an overlong encoding of < 10000 */
		if (u[0] == 0xF0 && u[1] < 0x90)
			goto invalid;
		/* f4 90.. would encode a code point above 10ffff */
		if (u[0] == 0xF4 && u[1] > 0x8F)
			goto invalid;

		if ((u[1] & 0xC0) != 0x80)
			goto invalid;
		if ((u[2] & 0xC0) != 0x80)
			goto invalid;
		if ((u[3] & 0xC0) != 0x80)
			goto invalid;
		return 4;
	}
	/* lead bytes f5..ff never start a valid sequence */
invalid:
	return 0;
}

/*
 * Validate every byte in [src, end) as UTF-8.
 *
 * Plain ASCII bytes are stepped over directly; bytes with the high bit
 * set are handed to utf8_validate_seq().  A NUL byte anywhere in the
 * range is treated as invalid.
 *
 * Returns true when the whole range is valid (an empty range is valid),
 * false at the first NUL byte or invalid sequence.
 */
bool utf8_validate_string(const char *src, const char *end)
{
	const char *pos = src;

	while (pos < end) {
		unsigned int len;

		if (*pos == '\0')
			return false;

		/* fast path: 7-bit ASCII needs no further checks */
		if (!(*pos & 0x80)) {
			pos++;
			continue;
		}

		len = utf8_validate_seq(pos, end);
		if (len == 0)
			return false;
		pos += len;
	}
	return true;
}

5 changes: 5 additions & 0 deletions usual/utf8.h
Expand Up @@ -53,5 +53,10 @@ int utf8_char_size(unsigned int c);
/** Return UTF8 seq length based on first byte */
int utf8_seq_size(unsigned char c);

/**
 * Validate one UTF-8 sequence without decoding it.
 *
 * Return sequence length if all bytes are valid, 0 otherwise.
 * A NUL byte, a continuation byte in first position, a sequence that
 * does not fit before srcend, an overlong encoding, a surrogate and a
 * code point above 0x10FFFF all count as invalid.
 *
 * @param src     first byte of the sequence
 * @param srcend  one past the last byte that may be examined
 */
int utf8_validate_seq(const char *src, const char *srcend);

/**
 * Validate a whole buffer as UTF-8.
 *
 * @param src  start of the buffer
 * @param end  one past the last byte of the buffer
 * @return true if every sequence in [src, end) is valid (also for an
 *         empty buffer), false on the first invalid sequence or NUL byte
 */
bool utf8_validate_string(const char *src, const char *end);

#endif

0 comments on commit 360519f

Please sign in to comment.