Skip to content

Commit

Permalink
Fix conversion of HZ text (and add test suite)
Browse files Browse the repository at this point in the history
- Treat truncated multi-byte characters as an error.
- Don't allow ASCII control characters to appear in the middle of a
  multi-byte character.
- Handle ~ escapes according to the HZ standard (RFC 1843).
- Treat unrecognized ~ escapes as an error.
- Multi-byte characters (between ~{ and ~} escapes) are GB2312, not CP936.
  (CP936 is an extended version from Microsoft, but the RFC does not
  state that this extended version of GB should be used.)
  • Loading branch information
alexdowad committed Jun 29, 2021
1 parent aff3658 commit 1e5c3c1
Show file tree
Hide file tree
Showing 4 changed files with 10,300 additions and 72 deletions.
179 changes: 107 additions & 72 deletions ext/mbstring/libmbfl/filters/mbfilter_hz.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
#include "mbfilter_hz.h"

#include "unicode_table_cp936.h"
#include "unicode_table_gb2312.h"

static int mbfl_filt_conv_hz_wchar_flush(mbfl_convert_filter *filter);

const mbfl_encoding mbfl_encoding_hz = {
mbfl_no_encoding_hz,
Expand All @@ -49,7 +52,7 @@ const struct mbfl_convert_vtbl vtbl_hz_wchar = {
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_hz_wchar,
mbfl_filt_conv_common_flush,
mbfl_filt_conv_hz_wchar_flush,
NULL,
};

Expand All @@ -65,68 +68,73 @@ const struct mbfl_convert_vtbl vtbl_wchar_hz = {

/* Evaluate `statement`; if its value is negative, return -1 from the enclosing function. */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)

/*
* HZ => wchar
*/
int
mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter)
int mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter)
{
int c1, s, w;

switch (filter->status & 0xf) {
/* case 0x00: ASCII */
/* case 0x10: GB2312 */
/* case 0x00: ASCII */
/* case 0x10: GB2312 */
case 0:
if (c == 0x7e) {
if (c == '~') {
filter->status += 2;
} else if (filter->status == 0x10 && c > 0x20 && c < 0x7f) { /* DBCS first char */
} else if (filter->status == 0x10 && ((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77))) {
/* DBCS first char */
filter->cache = c;
filter->status += 1;
} else if (c >= 0 && c < 0x80) { /* latin, CTLs */
} else if (filter->status == 0 && c >= 0 && c < 0x80) { /* latin, CTLs */
CK((*filter->output_function)(c, filter->data));
} else {
w = c & MBFL_WCSGROUP_MASK;
w |= MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(w, filter->data));
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
}
break;

/* case 0x11: GB2312 second char */
/* case 0x11: GB2312 second char */
case 1:
filter->status &= ~0xf;
c1 = filter->cache;
if (c1 > 0x20 && c1 < 0x7f && c > 0x20 && c < 0x7f) {
if (c1 > 0x20 && c1 < 0x7F && c > 0x20 && c < 0x7F) {
s = (c1 - 1)*192 + c + 0x40; /* GB2312 */
if (s >= 0 && s < cp936_ucs_table_size) {
w = cp936_ucs_table[s];
if (s == 0x1864) {
w = 0x30FB;
} else if (s == 0x186A) {
w = 0x2015;
} else if (s == 0x186C) {
w = 0x2225;
} else if ((s >= 0x1920 && s <= 0x192A) || s == 0x1963 || (s >= 0x1C60 && s <= 0x1C7F) || (s >= 0x1DBB && s <= 0x1DC4)) {
w = 0;
} else {
w = cp936_ucs_table[s];
}
} else {
w = 0;
}
if (w <= 0) {
w = (c1 << 8) | c;
w &= MBFL_WCSPLANE_MASK;
w |= MBFL_WCSPLANE_GB2312;
w = (c1 << 8) | c | MBFL_WCSPLANE_GB2312;
}
CK((*filter->output_function)(w, filter->data));
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
CK((*filter->output_function)(c, filter->data));
} else {
w = (c1 << 8) | c;
w &= MBFL_WCSGROUP_MASK;
w |= MBFL_WCSGROUP_THROUGH;
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(w, filter->data));
}
break;

/* '~' */
case 2:
if (c == 0x7d) { /* '}' */
filter->status = 0x0;
} else if (c == 0x7b) { /* '{' */
if (c == '}' && filter->status == 0x12) {
filter->status = 0;
} else if (c == '{' && filter->status == 2) {
filter->status = 0x10;
} else if (c == 0x7e) { /* '~' */
filter->status = 0x0;
CK((*filter->output_function)(0x007e, filter->data));
} else if (c == '~' && filter->status == 2) {
CK((*filter->output_function)('~', filter->data));
} else if (c == '\n') {
/* "~\n" is a line continuation; no output is needed, nor should we shift modes */
filter->status -= 2;
} else {
/* Invalid character after ~ */
filter->status -= 2;
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
}
break;

Expand All @@ -138,66 +146,94 @@ mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter)
return c;
}

/*
* wchar => HZ
*/
int
mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter)
/*
 * Flush handler for the HZ => wchar decoder.
 *
 * A status of 0x11 means the first byte of a 2-byte character is held in
 * filter->cache and its second byte never arrived. In that case the cached
 * byte is passed on with MBFL_WCSGROUP_THROUGH set, the same marker this
 * decoder uses for other invalid input.
 *
 * The decoder status is cleared before anything is emitted, so that a
 * repeated flush does not report the same truncated character twice and a
 * later byte is not mistaken for the trail byte of the stale lead byte.
 * Finally the downstream flush function, if one is set, is invoked.
 *
 * Returns 0 on success, or -1 if the output function reports an error.
 */
static int mbfl_filt_conv_hz_wchar_flush(mbfl_convert_filter *filter)
{
	int truncated = (filter->status == 0x11);

	/* Reset first: CK() may return early, and the state must not stay at 0x11 */
	filter->status = 0;

	if (truncated) {
		/* 2-byte character was truncated */
		CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}

int mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter)
{
int s;
int s = 0;

s = 0;
if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
if (c == 0xB7 || c == 0x144 || c == 0x148 || c == 0x251 || c == 0x261 || c == 0x2CA || c == 0x2CB || c == 0x2D9) {
s = 0;
} else {
s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
}
} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
if (c == 0x2015) {
s = 0xA1AA;
} else if (c == 0x2010 || c == 0x2013 || c == 0x2014 || c == 0x2016 || c == 0x2025 || c == 0x2035 ||
c == 0x2105 || c == 0x2109 || c == 0x2121 || (c >= 0x2170 && c <= 0x2179) || (c >= 0x2196 && c <= 0x2199) ||
c == 0x2215 || c == 0x221F || c == 0x2223 || c == 0x2252 || c == 0x2266 || c == 0x2267 || c == 0x2295 ||
(c >= 0x2550 && c <= 0x2573) || c == 0x22BF || c == 0x2609 || (c >= 0x2581 && c <= 0x258F) ||
(c >= 0x2593 && c <= 0x2595) || c == 0x25BC || c == 0x25BD || (c >= 0x25E2 && c <= 0x25E5)) {
s = 0;
} else {
s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
}
} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
if (c == 0x30FB) {
s = 0xA1A4;
} else if (c == 0x3006 || c == 0x3007 || c == 0x3012 || c == 0x3231 || c == 0x32A3 || c >= 0x3300 ||
(c >= 0x3018 && c <= 0x3040) || (c >= 0x309B && c <= 0x309E) || (c >= 0x30FC && c <= 0x30FE)) {
s = 0;
} else {
s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
}
} else if (c >= ucs_i_gb2312_table_min && c < ucs_i_gb2312_table_max) {
s = ucs_i_gb2312_table[c - ucs_i_gb2312_table_min];
} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
if (c == 0xff04) {
s = 0xa1e7;
} else if (c == 0xff5e) {
s = 0xa1ab;
} else if (c >= 0xff01 && c <= 0xff5d) {
s = c - 0xff01 + 0xa3a1;
} else if (c >= 0xffe0 && c <= 0xffe5) {
s = ucs_hff_s_cp936_table[c-0xffe0];
if (c == 0xFF04) {
s = 0xA1E7;
} else if (c == 0xFF5E) {
s = 0xA1AB;
} else if (c >= 0xFF01 && c <= 0xFF5D) {
s = c - 0xFF01 + 0xA3A1;
} else if (c == 0xFFE0 || c == 0xFFE1 || c == 0xFFE3 || c == 0xFFE5) {
s = ucs_hff_s_cp936_table[c - 0xFFE0];
}
}

if (s & 0x8000) {
s -= 0x8080;
}

if (s <= 0) {
if (c == 0) {
s = 0;
} else if (s <= 0) {
s = -1;
}
} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
s = (c == 0) ? 0 : -1;
} else if ((s >= 0x80 && s < 0x2121) || s > 0x8080) {
s = -1;
}

if (s >= 0) {
if (s < 0x80) { /* ASCII */
if ((filter->status & 0xff00) != 0) {
CK((*filter->output_function)(0x7e, filter->data)); /* '~' */
CK((*filter->output_function)(0x7d, filter->data)); /* '}' */
CK((*filter->output_function)('~', filter->data));
CK((*filter->output_function)('}', filter->data));
}
filter->status = 0;
if (s == 0x7e){
CK((*filter->output_function)(0x7e, filter->data));
if (s == 0x7E) {
CK((*filter->output_function)('~', filter->data));
}
CK((*filter->output_function)(s, filter->data));
} else { /* GB 2312-80 */
if ((filter->status & 0xff00) != 0x200) {
CK((*filter->output_function)(0x7e, filter->data)); /* '~' */
CK((*filter->output_function)(0x7b, filter->data)); /* '{' */
if ((filter->status & 0xFF00) != 0x200) {
CK((*filter->output_function)('~', filter->data));
CK((*filter->output_function)('{', filter->data));
}
filter->status = 0x200;
CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
CK((*filter->output_function)(s & 0x7f, filter->data));
CK((*filter->output_function)((s >> 8) & 0x7F, filter->data));
CK((*filter->output_function)(s & 0x7F, filter->data));
}
} else {
CK(mbfl_filt_conv_illegal_output(c, filter));
Expand All @@ -206,14 +242,13 @@ mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter)
return c;
}

/*
 * Flush handler for conversions that produce HZ output.
 *
 * mbfl_filt_conv_wchar_hz sets filter->status to 0x200 after emitting "~{"
 * (GB2312 mode). If any bit of that high byte is still set when the stream
 * ends, the closing "~}" escape is written so the output finishes in ASCII
 * mode. The status is then reset to 0.
 *
 * Returns 0 on success, or -1 if the output function reports an error.
 */
int mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter)
{
	/* back to latin */
	if (filter->status & 0xFF00) {
		CK((*filter->output_function)('~', filter->data));
		CK((*filter->output_function)('}', filter->data));
	}
	filter->status = 0;
	return 0;
}

0 comments on commit 1e5c3c1

Please sign in to comment.