Skip to content

Commit

Permalink
Add mb_trim function
Browse files Browse the repository at this point in the history
Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Co-authored-by: Gina Peter Banyard <girgias@php.net>
  • Loading branch information
3 people authored and alexdowad committed Nov 24, 2023
1 parent 3665e90 commit a80b6d7
Show file tree
Hide file tree
Showing 5 changed files with 289 additions and 3 deletions.
139 changes: 139 additions & 0 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -2945,6 +2945,145 @@ PHP_FUNCTION(mb_strtolower)
RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
}

/* Selects which end(s) of a string are trimmed. The values are bit flags,
 * so a mode can be tested with `mode & MB_LTRIM` and `mode & MB_RTRIM`;
 * MB_BOTH_TRIM has both bits set. */
typedef enum {
	MB_LTRIM     = 1 << 0,
	MB_RTRIM     = 1 << 1,
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM
} mb_trim_mode;

/* Returns true when `w` exists as an integer key in `ht`, the table holding
 * the characters that should be trimmed. */
static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
{
	const bool is_listed = zend_hash_index_exists(ht, w);

	return is_listed;
}

/* Walks `str` through enc->to_wchar() in chunks of up to 128 values and
 * counts how many values at the start and/or end (depending on `mode`) are
 * keys of `what_ht`. The remaining middle part is extracted with
 * mb_get_substr() and returned.
 *
 * str      string to trim
 * what_ht  table whose integer keys are the characters to strip
 * mode     MB_LTRIM, MB_RTRIM or MB_BOTH_TRIM
 * enc      encoding used both to decode `str` and to cut the substring */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	uint32_t wchar_buf[128];
	const size_t wchar_buf_len = sizeof(wchar_buf) / sizeof(wchar_buf[0]);
	size_t in_len = ZSTR_LEN(str);
	size_t out_len = 0;
	unsigned int state = 0;
	size_t left = 0;      /* values to drop from the start */
	size_t right = 0;     /* length of the current run of trim values at the end */
	size_t total_len = 0; /* values decoded so far */

	while (in_len) {
		out_len = enc->to_wchar(&in, &in_len, wchar_buf, wchar_buf_len, &state);
		ZEND_ASSERT(out_len <= wchar_buf_len);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First value that must be kept: stop counting on the left
				 * and restart the trailing run. */
				mode &= ~MB_LTRIM;
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	/* When both ends are trimmed and every value is a trim character, each
	 * value is counted in `left` AND in `right`, so `left + right` is larger
	 * than `total_len`. Subtracting would wrap around (size_t is unsigned)
	 * and hand a huge length to mb_get_substr(); clamp to 0 instead. A zero
	 * length is what the one-sided modes already pass for such input. */
	size_t trimmed = left + right;
	size_t keep = (trimmed < total_len) ? total_len - trimmed : 0;

	return mb_get_substr(str, left, keep, enc);
}

/* Trims `str` with the fixed default character set listed below. The set is
 * loaded into a temporary hash table (code point => true), handed to
 * trim_each_wchar(), and the table is destroyed before returning. */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	const uint32_t defaults[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t defaults_count = sizeof(defaults) / sizeof(uint32_t);
	HashTable charset;
	zval marker;

	ZVAL_TRUE(&marker);
	zend_hash_init(&charset, defaults_count, NULL, NULL, false);

	/* Every entry is unique, so the "_new" variant is used for insertion. */
	const uint32_t *stop = defaults + defaults_count;
	for (const uint32_t *cp = defaults; cp != stop; cp++) {
		zend_hash_index_add_new(&charset, *cp, &marker);
	}

	zend_string *result = trim_each_wchar(str, &charset, mode, enc);
	zend_hash_destroy(&charset);

	return result;
}

/* Trims `str` with a caller-supplied character list. `what` is run through
 * enc->to_wchar() in chunks of up to 128 values; each value becomes an
 * integer key of a temporary hash table, which is passed to
 * trim_each_wchar() and destroyed afterwards. */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(what);
	size_t remaining = ZSTR_LEN(what);
	unsigned int state = 0;
	uint32_t decoded[128];
	HashTable charset;
	zval marker;

	ZVAL_TRUE(&marker);
	/* The byte length of `what` is used as the initial size of the table. */
	zend_hash_init(&charset, remaining, NULL, NULL, false);

	while (remaining != 0) {
		size_t count = enc->to_wchar(&cursor, &remaining, decoded, 128, &state);
		ZEND_ASSERT(count <= 128);

		/* A value may occur more than once in `what`, hence the plain "add"
		 * rather than the "_new" variant. */
		const uint32_t *stop = decoded + count;
		for (const uint32_t *cp = decoded; cp != stop; cp++) {
			zend_hash_index_add(&charset, *cp, &marker);
		}
	}

	zend_string *result = trim_each_wchar(str, &charset, mode, enc);
	zend_hash_destroy(&charset);

	return result;
}

/* Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim().
 * Parses (string, optional character list, optional nullable encoding name),
 * resolves the encoding (argument #3) and returns the trimmed string. When no
 * character list was passed, the built-in default set is used. */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *input;
	zend_string *charlist = NULL;
	zend_string *enc_name = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(input)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(charlist)
		Z_PARAM_STR_OR_NULL(enc_name)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed = (charlist != NULL)
		? mb_trim_what_chars(input, charlist, mode, enc)
		: mb_trim_default_chars(input, mode, enc);

	RETURN_STR(trimmed);
}

/* mb_trim(): forwards to php_do_mb_trim() with MB_BOTH_TRIM, i.e. trim
 * characters are removed from both the start and the end of the string. */
PHP_FUNCTION(mb_trim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
}

/* mb_ltrim(): forwards to php_do_mb_trim() with MB_LTRIM, i.e. trim
 * characters are removed from the start of the string only. */
PHP_FUNCTION(mb_ltrim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
}

/* mb_rtrim(): forwards to php_do_mb_trim() with MB_RTRIM, i.e. trim
 * characters are removed from the end of the string only. */
PHP_FUNCTION(mb_rtrim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
}

static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{
const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
Expand Down
6 changes: 6 additions & 0 deletions ext/mbstring/mbstring.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ function mb_strtoupper(string $string, ?string $encoding = null): string {}
/** @refcount 1 */
function mb_strtolower(string $string, ?string $encoding = null): string {}

function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}

function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}

function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}

/** @refcount 1 */
function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}

Expand Down
18 changes: 17 additions & 1 deletion ext/mbstring/mbstring_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

125 changes: 125 additions & 0 deletions ext/mbstring/tests/mb_trim.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
--TEST--
mb_trim() function tests
--EXTENSIONS--
mbstring
--FILE--
<?php
mb_internal_encoding("UTF-8");

echo "== Copy from trim ==\n";
var_dump('ABC' === mb_trim('ABC'));
var_dump('ABC' === mb_ltrim('ABC'));
var_dump('ABC' === mb_rtrim('ABC'));
var_dump('ABC' === mb_trim(" \0\t\nABC \0\t\n"));
var_dump("ABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n"));
var_dump(" \0\t\nABC" === mb_rtrim(" \0\t\nABC \0\t\n"));
var_dump(" \0\t\nABC \0\t\n" === mb_trim(" \0\t\nABC \0\t\n",''));
var_dump(" \0\t\nABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n",''));
var_dump(" \0\t\nABC \0\t\n" === mb_rtrim(" \0\t\nABC \0\t\n",''));
echo "== Empty string ==\n";
var_dump(mb_trim(""));
var_dump(mb_ltrim(""));
var_dump(mb_rtrim(""));

echo "== Single string ==\n";
var_dump(mb_ltrim(' test ', ''));
var_dump(mb_trim(" あいうえおあお ", " ", "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'ß', "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'f', "UTF-8"));

echo "== Multi strings ==\n";
var_dump(mb_trim('foo BAR Spaß', 'ßf', "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'fß', "UTF-8"));
var_dump(mb_trim(" あいうおえお  あ", " あ", "UTF-8"));
var_dump(mb_trim(" あいうおえお  あ", "あ ", "UTF-8"));
var_dump(mb_trim(" あいうおえお  a", "あa", "UTF-8"));
var_dump(mb_trim(" あいうおえお  a", "\xe3", "UTF-8"));

echo "== Many strings ==\n";
var_dump(mb_trim(str_repeat(" ", 129)));
var_dump(mb_trim(str_repeat(" ", 129) . "a"));
var_dump(mb_rtrim(str_repeat(" ", 129) . "a"));

echo "== mb_ltrim ==\n";
var_dump(mb_ltrim("あああああああああああああああああああああああああああああああああいああああ", ""));
echo "== mb_rtrim ==\n";
var_dump(mb_rtrim("あああああああああああああああああああああああああああああああああいああああ", ""));

echo "== default params ==\n";
// The input holds every character of the default trim set, including "\t",
// so the whole string must be stripped.
var_dump(mb_trim(" \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}"));

echo "== Byte Order Mark ==\n";
var_dump(mb_ltrim("\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"));
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE")));
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8"), "UTF-16BE")));

echo "== Empty string ==\n";
var_dump(mb_trim(" abcd ", ""));
var_dump(mb_ltrim(" abcd ", ""));
var_dump(mb_rtrim(" abcd ", ""));

echo "== SJIS ==\n";
var_dump(mb_convert_encoding(mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS"), "UTF-8", "SJIS"));

echo "== Same strings ==\n";
var_dump(mb_trim("foo", "oo"));

echo "== \$encoding throws ValueError ==\n";
try {
var_dump(mb_trim( "\u{180F}", "", "NULL"));
} catch (ValueError $e) {
var_dump($e->getMessage());
}

?>
--EXPECT--
== Copy from trim ==
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
== Empty string ==
string(0) ""
string(0) ""
string(0) ""
== Single string ==
string(6) " test "
string(21) "あいうえおあお"
string(11) "foo BAR Spa"
string(12) "oo BAR Spaß"
== Multi strings ==
string(10) "oo BAR Spa"
string(10) "oo BAR Spa"
string(16) "いうおえお "
string(16) "いうおえお "
string(25) " あいうおえお  "
string(26) " あいうおえお  a"
== Many strings ==
string(0) ""
string(1) "a"
string(388) "                                                                                                                                 a"
== mb_ltrim ==
string(15) "いああああ"
== mb_rtrim ==
string(102) "あああああああああああああああああああああああああああああああああい"
== default params ==
string(0) ""
== Byte Order Mark ==
string(6) "漢字"
string(8) "226f575b"
string(8) "6f225b57"
== Empty string ==
string(6) " abcd "
string(6) " abcd "
string(6) " abcd "
== SJIS ==
string(3) "あ"
== Same strings ==
string(1) "f"
== $encoding throws ValueError ==
string(73) "mb_trim(): Argument #3 ($encoding) must be a valid encoding, "NULL" given"
4 changes: 2 additions & 2 deletions ext/mbstring/tests/mbregex_stack_limit2.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ if (version_compare(MB_ONIGURUMA_VERSION, '6.9.3') < 0) {
?>
--FILE--
<?php
function mb_trim( $string, $chars = "", $chars_array = array() )
function mb_trim_regex( $string, $chars = "", $chars_array = array() )
{
for( $x=0; $x<iconv_strlen( $chars ); $x++ ) $chars_array[] = preg_quote( iconv_substr( $chars, $x, 1 ) );
$encoded_char_list = implode( "|", array_merge( array( "\s","\t","\n","\r", "\0", "\x0B" ), $chars_array ) );
Expand All @@ -23,7 +23,7 @@ function mb_trim( $string, $chars = "", $chars_array = array() )
}

ini_set('mbstring.regex_stack_limit', 10000);
var_dump(mb_trim(str_repeat(' ', 10000)));
var_dump(mb_trim_regex(str_repeat(' ', 10000)));

echo 'OK';
?>
Expand Down

3 comments on commit a80b6d7

@nielsdos
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs an upgrading and news entry.

@youkidearitai
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, #12459 is included NEWS and UPGRADING.

@nielsdos
Copy link
Member

@nielsdos nielsdos commented on a80b6d7 Nov 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@youkidearitai OK, thanks for pointing me to where it is. I'll commit them manually.
EDIT: added in d3c2673

Please sign in to comment.