New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use fast text conversion filters to implement mb_convert_variables #9966
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,19 +17,19 @@ $sjis = base64_decode('k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg=='); | |
// JIS string (BASE64 encoded) | ||
$jis = base64_decode('GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg=='); | ||
// EUC-JP string | ||
$euc_jp = '���ܸ�ƥ����ȤǤ���01234������������'; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, interesting to see that GitHub is able to recognize this EUC-JP encoded string and display it correctly here... On the other hand, I've converted it to UTF-8 so that it will (hopefully) appear correctly in most text editors. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I agree that it is preferable to have *.phpt files encoded as UTF-8 whenever possible. |
||
$euc_jp = mb_convert_encoding("日本語テキストです。0123456789。", 'EUC-JP', 'UTF-8'); | ||
|
||
// Test for single scalar | ||
echo "== SCALAR TEST ==\n"; | ||
$s = $sjis; | ||
$encoding = mb_convert_variables('EUC-JP', 'SJIS', $s); | ||
print("$encoding\n"); // SJIS | ||
print("$s\n"); // Converted to EUC-JP | ||
echo bin2hex($s), "\n"; // Converted to EUC-JP | ||
|
||
$s = $jis; | ||
$encoding = mb_convert_variables('EUC-JP', 'JIS', $s); | ||
print("$encoding\n"); // JIS | ||
print("$s\n"); // Converted to EUC-JP | ||
echo bin2hex($s), "\n"; // Converted to EUC-JP | ||
|
||
$s = $euc_jp; | ||
$encoding = mb_convert_variables('SJIS', 'EUC-JP', $s); | ||
|
@@ -47,9 +47,7 @@ $s2 = $euc_jp; | |
$s3 = $euc_jp; | ||
$encoding = mb_convert_variables('EUC-JP', 'auto', $s1, $s2, $s3); | ||
print("$encoding\n"); // EUC-JP | ||
print("$s1$s2$s3\n"); // Converted to EUC-JP | ||
|
||
|
||
echo bin2hex("$s1$s2$s3"), "\n"; // Converted to EUC-JP | ||
|
||
// Note: Mixing encoding in array/object is not supported? | ||
// Test for array | ||
|
@@ -58,15 +56,13 @@ $a = array($s3, $s2, $s1); | |
$aa = $a; | ||
$encoding = mb_convert_variables('EUC-JP', 'auto', $aa); | ||
print("$encoding\n"); // EUC-JP | ||
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP | ||
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP | ||
|
||
$a = array($s1, $s2, $s3); | ||
$aa = $a; | ||
$encoding = mb_convert_variables('EUC-JP', 'auto', $aa); | ||
print("$encoding\n"); // EUC-JP | ||
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP | ||
|
||
|
||
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP | ||
|
||
// Test for object | ||
echo "== OBJECT TEST ==\n"; | ||
|
@@ -102,19 +98,17 @@ class bar | |
} | ||
} | ||
|
||
|
||
$o = new foo; | ||
$oo = $o; | ||
$encoding = mb_convert_variables('EUC-JP', 'auto', $oo); | ||
print("$encoding\n"); // EUC-JP | ||
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP | ||
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP | ||
|
||
$o = new bar; | ||
$oo = $o; | ||
$encoding = mb_convert_variables('EUC-JP', 'auto', $oo); | ||
print("$encoding\n"); // EUC-JP | ||
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP | ||
|
||
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP | ||
|
||
// Test for scalar, array and object | ||
echo "== SCALAR, ARRAY AND OBJECT TEST ==\n"; | ||
|
@@ -127,36 +121,79 @@ $oo = $o; | |
|
||
$encoding = mb_convert_variables('EUC-JP', 'auto', $s1, $s2, $s3, $aa, $oo); | ||
print("$encoding\n"); // EUC-JP | ||
print("$s1$s2$s3\n"); // Converted to EUC-JP | ||
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP | ||
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP | ||
echo bin2hex("$s1$s2$s3"), "\n"; // Converted to EUC-JP | ||
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP | ||
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP | ||
|
||
echo "== DEEPLY NESTED OBJECT/ARRAY TEST ==\n"; | ||
|
||
class Nested | ||
{ | ||
public $inner; | ||
|
||
function __construct($value) | ||
{ | ||
$this->inner = $value; | ||
} | ||
} | ||
|
||
$deeplyNested = array(new Nested(array(new Nested(array(new Nested("BLAH")))))); | ||
|
||
$encoding = mb_convert_variables('UTF-16LE', 'UTF-8', $deeplyNested); | ||
echo $encoding, "\n"; | ||
echo bin2hex($deeplyNested[0]->inner[0]->inner[0]->inner), "\n"; | ||
|
||
echo "== INVALID STRING ENCODING TEST ==\n"; | ||
// Make sure both that the correct invalid encoding marker is used, | ||
// and that the count of illegal characters is incremented | ||
|
||
$illegalCount = mb_get_info('illegal_chars'); | ||
$nested = array(new Nested("\xFF")); | ||
mb_substitute_character(0x25); | ||
mb_convert_variables('UTF-16LE', 'UTF-8', $nested); | ||
echo bin2hex($nested[0]->inner), "\n"; | ||
echo "# of illegal characters detected: ", mb_get_info('illegal_chars') - $illegalCount, "\n"; | ||
|
||
$illegalCount = mb_get_info('illegal_chars'); | ||
$nested = array(new Nested("\xFF")); | ||
mb_substitute_character(0x26); | ||
mb_convert_variables('UTF-16LE', 'UTF-8', $nested); | ||
echo bin2hex($nested[0]->inner), "\n"; | ||
echo "# of illegal characters detected: ", mb_get_info('illegal_chars') - $illegalCount, "\n"; | ||
|
||
?> | ||
--EXPECT-- | ||
== SCALAR TEST == | ||
SJIS | ||
���ܸ�ƥ����ȤǤ���01234������������ | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
JIS | ||
���ܸ�ƥ����ȤǤ���01234������������ | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
EUC-JP | ||
k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg== | ||
EUC-JP | ||
GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg== | ||
EUC-JP | ||
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������ | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
== ARRAY TEST == | ||
EUC-JP | ||
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������ | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
EUC-JP | ||
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������ | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
== OBJECT TEST == | ||
EUC-JP | ||
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������ | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
EUC-JP | ||
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������ | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
== SCALAR, ARRAY AND OBJECT TEST == | ||
EUC-JP | ||
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������ | ||
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������ | ||
���ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234���������������ܸ�ƥ����ȤǤ���01234������������ | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3 | ||
== DEEPLY NESTED OBJECT/ARRAY TEST == | ||
UTF-8 | ||
42004c0041004800 | ||
== INVALID STRING ENCODING TEST == | ||
2500 | ||
# of illegal characters detected: 1 | ||
2600 | ||
# of illegal characters detected: 1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is where I would like to ask someone who knows PHP internals well... am I handling the refcounts correctly?
`mb_convert_variables` replaces strings in an array or object. After converting the old `zend_string` to the new one, I am using `zval_ptr_dtor` to drop the reference to the old one, then `ZVAL_STR` to make the `zval` point to the new string.
I don't think `ZVAL_STR` modifies the refcount, and when the new `zend_string` is returned from `php_mb_convert_encoding_ex`, it should start with a refcount of 1, so hopefully we are OK here... 😕
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems to make sense, but you can always run some tests with Valgrind or ASAN to see if there is any issue. I've been staring at this for a bit, though, and it seems to check out.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, that's a good idea.