Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use fast text conversion filters to implement mb_convert_variables #9966

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
51 changes: 15 additions & 36 deletions ext/mbstring/mbstring.c
Expand Up @@ -3180,7 +3180,7 @@ PHP_FUNCTION(mb_convert_kana)
RETVAL_STR(jp_kana_convert(str, enc, opt));
}

static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, bool *recursion_error) /* {{{ */
{
mbfl_string string;
HashTable *ht;
Expand All @@ -3196,7 +3196,7 @@ static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zv
} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
if (Z_REFCOUNTED_P(var)) {
if (Z_IS_RECURSIVE_P(var)) {
*recursion_error = 1;
*recursion_error = true;
return 0;
}
Z_PROTECT_RECURSION_P(var);
Expand Down Expand Up @@ -3226,43 +3226,37 @@ static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zv
return 0;
} /* }}} */

static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var) /* {{{ */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
mbfl_string string, result, *ret;
HashTable *ht;
zval *entry, *orig_var;

orig_var = var;
ZVAL_DEREF(var);

if (Z_TYPE_P(var) == IS_STRING) {
string.val = (unsigned char *)Z_STRVAL_P(var);
string.len = Z_STRLEN_P(var);
ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
if (ret != NULL) {
zval_ptr_dtor(orig_var);
// TODO: avoid reallocation ???
ZVAL_STRINGL(orig_var, (char *)ret->val, ret->len);
efree(ret->val);
}
zend_string *ret = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
zval_ptr_dtor(orig_var);
ZVAL_STR(orig_var, ret);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is where I would like to ask someone who knows PHP internals well... am I handling the refcounts correctly?

mb_convert_variables replaces strings in an array or object. After converting the old zend_string to the new one, I am using zval_ptr_dtor to drop the reference to the old one, then ZVAL_STR to make the zval point to the new string.

I don't think ZVAL_STR modifies the refcount, and when the new zend_string is returned from php_mb_convert_encoding_ex, it should start with a refcount of 1, so hopefully we are OK here... 😕

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to make sense, but you can always run some tests with Valgrind or ASAN to see if there is any issue. But been staring at this for a bit and it seems to check out

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that's a good idea.

} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
if (Z_TYPE_P(var) == IS_ARRAY) {
SEPARATE_ARRAY(var);
}
if (Z_REFCOUNTED_P(var)) {
if (Z_IS_RECURSIVE_P(var)) {
return 1;
return true;
}
Z_PROTECT_RECURSION_P(var);
}

ht = HASH_OF(var);
if (ht != NULL) {
ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
if (mb_recursive_convert_variable(convd, entry)) {
if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
if (Z_REFCOUNTED_P(var)) {
Z_UNPROTECT_RECURSION_P(var);
}
return 1;
return true;
}
} ZEND_HASH_FOREACH_END();
}
Expand All @@ -3271,8 +3265,9 @@ static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var
Z_UNPROTECT_RECURSION_P(var);
}
}
return 0;
} /* }}} */

return false;
}

/* {{{ Converts the string resource in variables to desired encoding */
PHP_FUNCTION(mb_convert_variables)
Expand All @@ -3281,14 +3276,12 @@ PHP_FUNCTION(mb_convert_variables)
zend_string *to_enc_str;
zend_string *from_enc_str;
HashTable *from_enc_ht;
mbfl_string string, result;
const mbfl_encoding *from_encoding, *to_encoding;
mbfl_encoding_detector *identd;
mbfl_buffer_converter *convd;
int n, argc;
size_t elistsz;
const mbfl_encoding **elist;
int recursion_error = 0;
bool recursion_error = false;

ZEND_PARSE_PARAMETERS_START(3, -1)
Z_PARAM_STR(to_enc_str)
Expand All @@ -3302,10 +3295,7 @@ PHP_FUNCTION(mb_convert_variables)
RETURN_THROWS();
}

/* initialize string */
from_encoding = MBSTRG(current_internal_encoding);
mbfl_string_init_set(&string, from_encoding);
mbfl_string_init(&result);

/* pre-conversion encoding */
if (from_enc_ht) {
Expand Down Expand Up @@ -3356,29 +3346,18 @@ PHP_FUNCTION(mb_convert_variables)

efree(ZEND_VOIDP(elist));

convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0);
/* If this assertion fails this means some memory allocation failure which is a bug */
ZEND_ASSERT(convd != NULL);

mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));

/* convert */
n = 0;
while (n < argc) {
zval *zv = &args[n];

ZVAL_DEREF(zv);
recursion_error = mb_recursive_convert_variable(convd, zv);
recursion_error = mb_recursive_convert_variable(zv, from_encoding, to_encoding);
if (recursion_error) {
break;
}
n++;
}

MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
mbfl_buffer_converter_delete(convd);

if (recursion_error) {
php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
RETURN_FALSE;
Expand Down
91 changes: 64 additions & 27 deletions ext/mbstring/tests/mb_convert_variables.phpt
Expand Up @@ -17,19 +17,19 @@ $sjis = base64_decode('k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==');
// JIS string (BASE64 encoded)
$jis = base64_decode('GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==');
// EUC-JP string
$euc_jp = '���ܸ�ƥ����ȤǤ���01234������������';
Copy link
Contributor Author

@alexdowad alexdowad Nov 17, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, interesting to see that GitHub is able to recognize this EUC-JP encoded string and display it correctly here...

On the other side, I've converted it to UTF-8 so that it will (hopefully) appear correctly in most text editors.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that it is preferable to have *.phpt files encoded as UTF-8 whenever possible.

$euc_jp = mb_convert_encoding("日本語テキストです。0123456789。", 'EUC-JP', 'UTF-8');

// Test for single scalar
echo "== SCALAR TEST ==\n";
$s = $sjis;
$encoding = mb_convert_variables('EUC-JP', 'SJIS', $s);
print("$encoding\n"); // SJIS
print("$s\n"); // Converted to EUC-JP
echo bin2hex($s), "\n"; // Converted to EUC-JP

$s = $jis;
$encoding = mb_convert_variables('EUC-JP', 'JIS', $s);
print("$encoding\n"); // JIS
print("$s\n"); // Converted to EUC-JP
echo bin2hex($s), "\n"; // Converted to EUC-JP

$s = $euc_jp;
$encoding = mb_convert_variables('SJIS', 'EUC-JP', $s);
Expand All @@ -47,9 +47,7 @@ $s2 = $euc_jp;
$s3 = $euc_jp;
$encoding = mb_convert_variables('EUC-JP', 'auto', $s1, $s2, $s3);
print("$encoding\n"); // EUC-JP
print("$s1$s2$s3\n"); // Converted to EUC-JP


echo bin2hex("$s1$s2$s3"), "\n"; // Converted to EUC-JP

// Note: Mixing encoding in array/object is not supported?
// Test for array
Expand All @@ -58,15 +56,13 @@ $a = array($s3, $s2, $s1);
$aa = $a;
$encoding = mb_convert_variables('EUC-JP', 'auto', $aa);
print("$encoding\n"); // EUC-JP
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP

$a = array($s1, $s2, $s3);
$aa = $a;
$encoding = mb_convert_variables('EUC-JP', 'auto', $aa);
print("$encoding\n"); // EUC-JP
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP


echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP

// Test for object
echo "== OBJECT TEST ==\n";
Expand Down Expand Up @@ -102,19 +98,17 @@ class bar
}
}


$o = new foo;
$oo = $o;
$encoding = mb_convert_variables('EUC-JP', 'auto', $oo);
print("$encoding\n"); // EUC-JP
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP

$o = new bar;
$oo = $o;
$encoding = mb_convert_variables('EUC-JP', 'auto', $oo);
print("$encoding\n"); // EUC-JP
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP

echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP

// Test for scalar, array and object
echo "== SCALAR, ARRAY AND OBJECT TEST ==\n";
Expand All @@ -127,36 +121,79 @@ $oo = $o;

$encoding = mb_convert_variables('EUC-JP', 'auto', $s1, $s2, $s3, $aa, $oo);
print("$encoding\n"); // EUC-JP
print("$s1$s2$s3\n"); // Converted to EUC-JP
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP
echo bin2hex("$s1$s2$s3"), "\n"; // Converted to EUC-JP
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP

echo "== DEEPLY NESTED OBJECT/ARRAY TEST ==\n";

class Nested
{
public $inner;

function __construct($value)
{
$this->inner = $value;
}
}

$deeplyNested = array(new Nested(array(new Nested(array(new Nested("BLAH"))))));

$encoding = mb_convert_variables('UTF-16LE', 'UTF-8', $deeplyNested);
echo $encoding, "\n";
echo bin2hex($deeplyNested[0]->inner[0]->inner[0]->inner), "\n";

echo "== INVALID STRING ENCODING TEST ==\n";
// Make sure both that the correct invalid encoding marker is used,
// and that the count of illegal characters is incremented

$illegalCount = mb_get_info('illegal_chars');
$nested = array(new Nested("\xFF"));
mb_substitute_character(0x25);
mb_convert_variables('UTF-16LE', 'UTF-8', $nested);
echo bin2hex($nested[0]->inner), "\n";
echo "# of illegal characters detected: ", mb_get_info('illegal_chars') - $illegalCount, "\n";

$illegalCount = mb_get_info('illegal_chars');
$nested = array(new Nested("\xFF"));
mb_substitute_character(0x26);
mb_convert_variables('UTF-16LE', 'UTF-8', $nested);
echo bin2hex($nested[0]->inner), "\n";
echo "# of illegal characters detected: ", mb_get_info('illegal_chars') - $illegalCount, "\n";

?>
--EXPECT--
== SCALAR TEST ==
SJIS
���ܸ�ƥ����ȤǤ���01234������������
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
JIS
���ܸ�ƥ����ȤǤ���01234������������
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
EUC-JP
k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==
EUC-JP
GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
EUC-JP
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
== ARRAY TEST ==
EUC-JP
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
EUC-JP
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
== OBJECT TEST ==
EUC-JP
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
EUC-JP
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
== SCALAR, ARRAY AND OBJECT TEST ==
EUC-JP
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
== DEEPLY NESTED OBJECT/ARRAY TEST ==
UTF-8
42004c0041004800
== INVALID STRING ENCODING TEST ==
2500
# of illegal characters detected: 1
2600
# of illegal characters detected: 1