Skip to content

Commit

Permalink
[RFC] Implement mb_str_pad() (#11284)
Browse files Browse the repository at this point in the history
Closes GH-10203.
  • Loading branch information
nielsdos committed Jun 20, 2023
1 parent d9e2da3 commit 6859163
Show file tree
Hide file tree
Showing 6 changed files with 283 additions and 1 deletion.
2 changes: 2 additions & 0 deletions NEWS
Expand Up @@ -2,6 +2,8 @@ PHP NEWS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
?? ??? ????, PHP 8.3.0alpha3

- MBString:
. Implement mb_str_pad() RFC. (nielsdos)

22 Jun 2023, PHP 8.3.0alpha2

Expand Down
4 changes: 4 additions & 0 deletions UPGRADING
Expand Up @@ -208,6 +208,10 @@ PHP 8.3 UPGRADE NOTES
the given $depth and $options.
RFC: https://wiki.php.net/rfc/json_validate

- MBString:
. Added mb_str_pad(), which is the mbstring equivalent of str_pad().
RFC: https://wiki.php.net/rfc/mb_str_pad

- Posix:
. Added posix_sysconf call to get runtime information.
. Added posix_pathconf call to get configuration value from a directory/file.
Expand Down
126 changes: 126 additions & 0 deletions ext/mbstring/mbstring.c
Expand Up @@ -5522,6 +5522,132 @@ PHP_FUNCTION(mb_chr)
}
/* }}} */

PHP_FUNCTION(mb_str_pad)
{
zend_string *input, *encoding_str = NULL, *pad = NULL;
zend_long pad_to_length;
zend_long pad_type_val = PHP_STR_PAD_RIGHT;

ZEND_PARSE_PARAMETERS_START(2, 5)
Z_PARAM_STR(input)
Z_PARAM_LONG(pad_to_length)
Z_PARAM_OPTIONAL
Z_PARAM_STR(pad)
Z_PARAM_LONG(pad_type_val)
Z_PARAM_STR_OR_NULL(encoding_str)
ZEND_PARSE_PARAMETERS_END();

const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
if (!encoding) {
RETURN_THROWS();
}

size_t input_length = mb_get_strlen(input, encoding);

/* If resulting string turns out to be shorter than input string,
we simply copy the input and return. */
if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
RETURN_STR_COPY(input);
}

if (ZSTR_LEN(pad) == 0) {
zend_argument_value_error(3, "must be a non-empty string");
RETURN_THROWS();
}

if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
RETURN_THROWS();
}

size_t pad_length = mb_get_strlen(pad, encoding);

size_t num_mb_pad_chars = pad_to_length - input_length;

/* We need to figure out the left/right padding lengths. */
size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
switch (pad_type_val) {
case PHP_STR_PAD_RIGHT:
right_pad = num_mb_pad_chars;
break;

case PHP_STR_PAD_LEFT:
left_pad = num_mb_pad_chars;
break;

case PHP_STR_PAD_BOTH:
left_pad = num_mb_pad_chars / 2;
right_pad = num_mb_pad_chars - left_pad;
break;
}

/* How many full block copies need to happen, and how many characters are then left over? */
size_t full_left_pad_copies = left_pad / pad_length;
size_t full_right_pad_copies = right_pad / pad_length;
size_t remaining_left_pad_chars = left_pad % pad_length;
size_t remaining_right_pad_chars = right_pad % pad_length;

if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
goto overflow_no_release;
}

/* Compute the number of bytes required for the padding */
size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);

/* No special fast-path handling necessary for zero-length pads because these functions will not
* allocate memory in case a zero-length pad is required. */
zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);

if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
goto overflow;
}

size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);

if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
goto overflow;
}

zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
char *buffer = ZSTR_VAL(result);

/* First we pad the left. */
for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
}
memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
buffer += ZSTR_LEN(remaining_left_pad_str);

/* Then we copy the input string. */
memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
buffer += ZSTR_LEN(input);

/* Finally, we pad on the right. */
for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
}
memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));

ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';

zend_string_release_ex(remaining_left_pad_str, false);
zend_string_release_ex(remaining_right_pad_str, false);

RETURN_NEW_STR(result);

overflow:
zend_string_release_ex(remaining_left_pad_str, false);
zend_string_release_ex(remaining_right_pad_str, false);
overflow_no_release:
zend_throw_error(NULL, "String size overflow");
RETURN_THROWS();
}

/* {{{ */
PHP_FUNCTION(mb_scrub)
{
Expand Down
2 changes: 2 additions & 0 deletions ext/mbstring/mbstring.stub.php
Expand Up @@ -183,6 +183,8 @@ function mb_ord(string $string, ?string $encoding = null): int|false {}

function mb_chr(int $codepoint, ?string $encoding = null): string|false {}

// Multibyte equivalent of str_pad(): pads $string with $pad_string up to $length,
// where lengths are counted in characters of $encoding rather than in bytes.
// $string is returned as-is when $length is negative or does not exceed its
// current character count.
function mb_str_pad(string $string, int $length, string $pad_string = " ", int $pad_type = STR_PAD_RIGHT, ?string $encoding = null): string {}

#ifdef HAVE_MBREGEX
/** @refcount 1 */
function mb_regex_encoding(?string $encoding = null): string|bool {}
Expand Down
12 changes: 11 additions & 1 deletion ext/mbstring/mbstring_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

138 changes: 138 additions & 0 deletions ext/mbstring/tests/mb_str_pad.phpt
@@ -0,0 +1,138 @@
--TEST--
mb_str_pad()
--EXTENSIONS--
mbstring
--FILE--
<?php

// Each call below prints its result with var_dump(); the output is compared
// against the --EXPECT-- section, so the order of the calls matters.

echo "--- Error conditions ---\n";
// An empty pad string must raise a ValueError, whichever pad type is used.
try {
var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_RIGHT));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
try {
var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_LEFT));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
try {
var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_BOTH));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
// A pad type that is none of the STR_PAD_* constants must raise a ValueError.
try {
var_dump(mb_str_pad('▶▶', 6, ' ', 123456));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
// An unknown encoding name must raise a ValueError.
try {
var_dump(mb_str_pad('▶▶', 6, ' ', STR_PAD_BOTH, 'unexisting'));
} catch (ValueError $e) {
var_dump($e->getMessage());
}

echo "--- Simple ASCII strings ---\n";
// Two-character pad string with odd and even amounts of padding, for every pad type.
var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_BOTH));
var_dump(mb_str_pad('World', 10, '+-', STR_PAD_BOTH));
var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_LEFT));
var_dump(mb_str_pad('World', 10, '+-', STR_PAD_LEFT));
var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_RIGHT));
var_dump(mb_str_pad('World', 10, '+-', STR_PAD_RIGHT));

echo "--- Edge cases pad length ---\n";
// Target lengths equal to, smaller than, zero and negative: the input is returned unchanged.
var_dump(mb_str_pad('▶▶', 2, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('▶▶', 1, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('▶▶', 0, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('▶▶', -1, ' ', STR_PAD_BOTH));

echo "--- Empty input string ---\n";
// With an empty input the result consists of padding only (or stays empty).
var_dump(mb_str_pad('', 2, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('', 1, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('', 0, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('', -1, ' ', STR_PAD_BOTH));

echo "--- No default argument ---\n";
// $pad_string is skipped via a named argument, so its default (a single space) is used.
var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_RIGHT));
var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_LEFT));
var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_BOTH));

echo "--- UTF-8 emojis ---\n";
// Multibyte input and pad string; the target length shrinks from 6 down to 1 so that
// both truncated pad copies and the "no padding needed" case are covered.
for ($i = 6; $i > 0; $i--) {
var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_RIGHT));
var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_LEFT));
var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_BOTH));
}

echo "--- UTF-8, 32, 7 test ---\n";

// Taken from mb_substr.phpt
$utf8 = "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь";
$utf32 = mb_convert_encoding($utf8, 'UTF-32', 'UTF-8');
$utf7 = mb_convert_encoding($utf8, 'UTF-7', 'UTF-8');
$tests = ["UTF-8" => $utf8, "UTF-32" => $utf32, "UTF-7" => $utf7];

// Pad in each encoding, then convert the result back to UTF-8 so that the expected
// output is the same for all three encodings.
foreach ($tests as $encoding => $test) {
$pad_str = mb_convert_encoding('▶▶', $encoding, 'UTF-8');
var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_RIGHT, $encoding), 'UTF-8', $encoding));
var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_LEFT, $encoding), 'UTF-8', $encoding));
var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_BOTH, $encoding), 'UTF-8', $encoding));
}
?>
--EXPECT--
--- Error conditions ---
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
string(90) "mb_str_pad(): Argument #4 ($pad_type) must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH"
string(82) "mb_str_pad(): Argument #5 ($encoding) must be a valid encoding, "unexisting" given"
--- Simple ASCII strings ---
string(7) "+Hello+"
string(10) "+-World+-+"
string(7) "+-Hello"
string(10) "+-+-+World"
string(7) "Hello+-"
string(10) "World+-+-+"
--- Edge cases pad length ---
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
--- Empty input string ---
string(2) " "
string(1) " "
string(0) ""
string(0) ""
--- No default argument ---
string(10) "▶▶ "
string(10) " ▶▶"
string(10) " ▶▶ "
--- UTF-8 emojis ---
string(18) "▶▶❤❓❇❤"
string(18) "❤❓❇❤▶▶"
string(18) "❤❓▶▶❤❓"
string(15) "▶▶❤❓❇"
string(15) "❤❓❇▶▶"
string(15) "❤▶▶❤❓"
string(12) "▶▶❤❓"
string(12) "❤❓▶▶"
string(12) "❤▶▶❤"
string(9) "▶▶❤"
string(9) "❤▶▶"
string(9) "▶▶❤"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
--- UTF-8, 32, 7 test ---
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"

0 comments on commit 6859163

Please sign in to comment.