Skip to content

Commit

Permalink
Output illegal character marker for 4-byte illegal characters > 0x7FFFFFFF
Browse files Browse the repository at this point in the history

Some text encodings supported by mbstring (such as UCS-4) accept 4-byte
characters. When mbstring encounters an illegal byte sequence for the
encoding it is using, it should emit an 'illegal character' marker,
which can either be a single character like '?', an HTML hexadecimal
entity, or a marker string like 'BAD+XXXX'.

Because signed integers are used to hold 4-byte characters, illegal
4-byte sequences with a 'negative' value (i.e. those with the high
bit set) were not handled correctly when emitting the illegal character
marker. As a result, such illegal sequences were silently skipped
(and no marker was emitted to the output). This commit fixes that.
  • Loading branch information
alexdowad committed Aug 30, 2021
1 parent a57b713 commit 97b7fc8
Showing 1 changed file with 69 additions and 78 deletions.
147 changes: 69 additions & 78 deletions ext/mbstring/libmbfl/mbfl/mbfl_convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,34 @@ int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char
return 0;
}

/* Write `w` as hexadecimal digits to `filter`, most significant nibble first.
 *
 * The eight nibbles at bit offsets 28..0 are examined in order. Leading zero
 * nibbles are suppressed; every other nibble is translated through
 * mbfl_hexchar_table and passed to filter->filter_function. If `w` is zero,
 * a single '0' is written instead.
 *
 * Returns the first negative value reported by filter->filter_function (and
 * stops writing at that point); otherwise returns the result of the last
 * filter_function call made. */
static int mbfl_filt_conv_output_hex(unsigned int w, mbfl_convert_filter *filter)
{
	int started = 0; /* becomes 1 once the first digit has been written */
	int status = 0;
	int bit;

	for (bit = 28; bit >= 0; bit -= 4) {
		int digit = (w >> bit) & 0xF;

		if (digit == 0 && !started) {
			/* still inside the run of leading zeros */
			continue;
		}

		started = 1;
		status = (*filter->filter_function)(mbfl_hexchar_table[digit], filter);
		if (status < 0) {
			return status;
		}
	}

	if (!started) {
		/* value was zero, so the loop above produced no digits at all */
		return (*filter->filter_function)('0', filter);
	}

	return status;
}

/* illegal character output function for conv-filter */
int mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
{
int n, m, r;

unsigned int w = c;
int ret = 0;
int mode_backup = filter->illegal_mode;
int substchar_backup = filter->illegal_substchar;
Expand All @@ -237,89 +260,57 @@ int mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
ret = (*filter->filter_function)(substchar_backup, filter);
break;

case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
if (c >= 0) {
if (c < MBFL_WCSGROUP_UCS4MAX) { /* unicode */
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
} else {
if (c < MBFL_WCSGROUP_WCHARMAX) {
m = c & ~MBFL_WCSPLANE_MASK;
switch (m) {
case MBFL_WCSPLANE_JIS0208:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS+");
break;
case MBFL_WCSPLANE_JIS0212:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS2+");
break;
case MBFL_WCSPLANE_JIS0213:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS3+");
break;
case MBFL_WCSPLANE_WINCP932:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"W932+");
break;
case MBFL_WCSPLANE_GB18030:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"GB+");
break;
default:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"?+");
break;
}
c &= MBFL_WCSPLANE_MASK;
} else {
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"BAD+");
c &= MBFL_WCSGROUP_MASK;
}
}
if (ret >= 0) {
m = 0;
r = 28;
while (r >= 0) {
n = (c >> r) & 0xf;
if (n || m) {
m = 1;
ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
if (ret < 0) {
break;
}
}
r -= 4;
}
if (m == 0) {
ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
}
if (w < MBFL_WCSGROUP_UCS4MAX) { /* Unicode */
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
} else if (w < MBFL_WCSGROUP_WCHARMAX) {
int m = w & ~MBFL_WCSPLANE_MASK;
switch (m) {
case MBFL_WCSPLANE_JIS0208:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS+");
break;
case MBFL_WCSPLANE_JIS0212:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS2+");
break;
case MBFL_WCSPLANE_JIS0213:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS3+");
break;
case MBFL_WCSPLANE_WINCP932:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"W932+");
break;
case MBFL_WCSPLANE_GB18030:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"GB+");
break;
default:
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"?+");
break;
}
w &= MBFL_WCSPLANE_MASK;
} else {
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"BAD+");
w &= MBFL_WCSGROUP_MASK;
}

if (ret >= 0) {
ret = mbfl_filt_conv_output_hex(w, filter);
}
break;

case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
if (c >= 0) {
if (c < MBFL_WCSGROUP_UCS4MAX) { /* unicode */
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
if (ret < 0)
break;

m = 0;
r = 28;
while (r >= 0) {
n = (c >> r) & 0xf;
if (n || m) {
m = 1;
ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
if (ret < 0) {
break;
}
}
r -= 4;
}
if (m == 0) {
/* illegal character was zero; no hex digits were output by above loop */
ret = (*filter->filter_function)('0', filter);
}
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
} else {
ret = (*filter->filter_function)(substchar_backup, filter);
}
if (w < MBFL_WCSGROUP_UCS4MAX) { /* unicode */
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
if (ret < 0)
break;
ret = mbfl_filt_conv_output_hex(w, filter);
if (ret < 0)
break;
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
} else {
ret = (*filter->filter_function)(substchar_backup, filter);
}
break;

case MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE:
default:
break;
Expand Down

0 comments on commit 97b7fc8

Please sign in to comment.