From c4fc6a91274a4de30f2128a0cc0c9851ef9e9f09 Mon Sep 17 00:00:00 2001 From: Jason Marble Date: Thu, 23 Oct 2025 15:53:36 -0700 Subject: [PATCH 1/4] Fix GH-20262: array_unique() SORT_REGULAR fails to deduplicate with mixed strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit array_unique() with SORT_REGULAR was failing to remove duplicate numeric strings when mixed with alphanumeric strings due to non-transitive comparison issues in the sort-based algorithm. Implemented hash-bucketing optimization for SORT_REGULAR that preserves full type coercion semantics while improving performance from O(n²) to O(n). Closes GH-20262 --- ext/standard/array.c | 128 +++++++++++ .../array_unique_variation_sort_regular.phpt | 214 ++++++++++++++++++ ext/standard/tests/array/gh20262.phpt | 19 ++ 3 files changed, 361 insertions(+) create mode 100644 ext/standard/tests/array/array_unique_variation_sort_regular.phpt create mode 100644 ext/standard/tests/array/gh20262.phpt diff --git a/ext/standard/array.c b/ext/standard/array.c index 4097d7189901..e778451fa59f 100644 --- a/ext/standard/array.c +++ b/ext/standard/array.c @@ -4964,6 +4964,9 @@ PHP_FUNCTION(array_unique) bucket_compare_func_t cmp; struct bucketindex *arTmp, *cmpdata, *lastkept; uint32_t i, idx; + zend_long num_key; + zend_string *str_key; + zval *val; ZEND_PARSE_PARAMETERS_START(1, 2) Z_PARAM_ARRAY(array) @@ -4976,6 +4979,131 @@ PHP_FUNCTION(array_unique) return; } + if (sort_type == PHP_SORT_REGULAR) { + /* Hash-bucketing solution for SORT_REGULAR */ + #define UNIQUE_HASH_BUCKETS 256 + + typedef struct { + zval **values; + uint32_t count; + uint32_t capacity; + } value_bucket; + + value_bucket *buckets = ecalloc(UNIQUE_HASH_BUCKETS, sizeof(value_bucket)); + cmp = php_get_data_compare_func_unstable(sort_type, 0); + array_init(return_value); + + ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(array), num_key, str_key, val) { + /* Dereference if this is a reference */ + zval *deref_val = val; + ZVAL_DEREF(deref_val); + + /* Compute hash for this value */ + zend_ulong hash = 0; + + if (Z_TYPE_P(deref_val) == IS_LONG) { + hash = (zend_ulong)Z_LVAL_P(deref_val); + } else if (Z_TYPE_P(deref_val) == IS_DOUBLE) { + double dval = Z_DVAL_P(deref_val); + if (zend_isnan(dval) || zend_isinf(dval)) { + hash = 0; /* NaN and Inf hash to 0 */ + } else { + hash = (zend_ulong)(zend_long)dval; + } + } else if (Z_TYPE_P(deref_val) == IS_TRUE) { + hash = 1; /* true hashes like integer 1 */ + } else if (Z_TYPE_P(deref_val) == IS_FALSE) { + hash = 0; /* false hashes like integer 0 */ + } else if (Z_TYPE_P(deref_val) == IS_NULL) { + hash = 0; /* null hashes like integer 0 */ + } else if (Z_TYPE_P(deref_val) == IS_STRING) { + /* Check if numeric string */ + zend_long lval; + double dval; + zend_uchar type = is_numeric_string(Z_STRVAL_P(deref_val), Z_STRLEN_P(deref_val), &lval, &dval, 0); + + if (type == IS_LONG) { + hash = (zend_ulong)lval; /* '5' and '05' hash the same */ + } else if (type == IS_DOUBLE) { + hash = (zend_ulong)dval; + } else { + /* Non-numeric string */ + if (Z_STRLEN_P(deref_val) == 0) { + hash = 0; /* Empty string might equal false/null */ + } else { + hash = zend_string_hash_val(Z_STR_P(deref_val)); + } + } + } else if (Z_TYPE_P(deref_val) == IS_OBJECT) { + /* Hash objects by class name */ + zend_class_entry *ce = Z_OBJCE_P(deref_val); + hash = zend_string_hash_val(ce->name); + } else if (Z_TYPE_P(deref_val) == IS_ARRAY) { + /* Hash arrays by size and first value */ + hash = zend_hash_num_elements(Z_ARRVAL_P(deref_val)); + + /* XOR with hash of first element if it's a simple type */ + zval *first_elem = zend_hash_get_current_data(Z_ARRVAL_P(deref_val)); + if (first_elem) { + if (Z_TYPE_P(first_elem) == IS_LONG) { + hash ^= Z_LVAL_P(first_elem); + } else if (Z_TYPE_P(first_elem) == IS_STRING) { + hash ^= zend_string_hash_val(Z_STR_P(first_elem)); + } + } + } else { + /* Other types */ + hash = Z_TYPE_P(deref_val); + } + + uint32_t bucket_idx = hash % UNIQUE_HASH_BUCKETS; + value_bucket *bucket = &buckets[bucket_idx]; + + /* Check if duplicate exists in this bucket */ + bool is_duplicate = false; + for (uint32_t i = 0; i < bucket->count; i++) { + zval *existing_deref = bucket->values[i]; + ZVAL_DEREF(existing_deref); + Bucket b1 = {.val = *deref_val}, b2 = {.val = *existing_deref}; + if (cmp(&b1, &b2) == 0) { + is_duplicate = true; + break; + } + } + + if (!is_duplicate) { + /* Add to bucket */ + if (bucket->count >= bucket->capacity) { + bucket->capacity = bucket->capacity ? bucket->capacity * 2 : 4; + bucket->values = erealloc(bucket->values, bucket->capacity * sizeof(zval*)); + } + bucket->values[bucket->count++] = val; + + /* Add to result */ + if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) { + ZVAL_DEREF(val); + } + Z_TRY_ADDREF_P(val); + + if (str_key) { + zend_hash_add_new(Z_ARRVAL_P(return_value), str_key, val); + } else { + zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val); + } + } + } ZEND_HASH_FOREACH_END(); + + /* Cleanup buckets */ + for (uint32_t i = 0; i < UNIQUE_HASH_BUCKETS; i++) { + if (buckets[i].values) { + efree(buckets[i].values); + } + } + efree(buckets); + + return; + } + if (sort_type == PHP_SORT_STRING) { HashTable seen; zend_long num_key; diff --git a/ext/standard/tests/array/array_unique_variation_sort_regular.phpt b/ext/standard/tests/array/array_unique_variation_sort_regular.phpt new file mode 100644 index 000000000000..a1b067bd5cd2 --- /dev/null +++ b/ext/standard/tests/array/array_unique_variation_sort_regular.phpt @@ -0,0 +1,214 @@ +--TEST-- +Test array_unique() function : SORT_REGULAR type coercion behavior +--FILE-- + +--EXPECT-- +*** Testing array_unique() with SORT_REGULAR *** + +-- Integer and string coercion -- +array(2) { + [0]=> + int(1) + [2]=> + int(2) +} + +-- Boolean coercion -- +array(2) { + [0]=> + bool(true) + [2]=> + bool(false) +} + +-- NULL coercion -- +array(2) { + [0]=> + NULL + [4]=> + string(1) "0" +} + +-- Float coercion -- +array(1) { + [0]=> + int(1) +} + +-- Numeric strings -- +array(1) { + [0]=> + string(2) "10" +} + +-- Leading zeros -- +array(1) { + [0]=> + string(2) "05" +} + +-- Partial numeric strings -- +array(2) { + [0]=> + string(4) "5abc" + [1]=> + string(1) "5" +} + +-- Whitespace in numeric strings -- +array(1) { + [0]=> + string(1) "5" +} + +-- Case sensitivity -- +array(3) { + [0]=> + string(3) "abc" + [1]=> + string(3) "ABC" + [2]=> + string(3) "Abc" +} + +-- Exponential notation -- +array(1) { + [0]=> + int(1000) +} + +-- Negative numbers -- +array(1) { + [0]=> + int(-5) +} + +-- Arrays -- +array(2) { + [0]=> + array(2) { + [0]=> + int(1) + [1]=> + int(2) + } + [2]=> + array(2) { + [0]=> + int(1) + [1]=> + int(3) + } +} + +-- NaN handling -- +array(3) { + [0]=> + float(NAN) + [1]=> + float(NAN) + [2]=> + int(1) +} + +-- INF handling -- +array(2) { + [0]=> + float(INF) + [2]=> + float(-INF) +} + +-- Bug GH-20262 case -- +array(3) { + [0]=> + string(1) "5" + [1]=> + string(2) "10" + [2]=> + string(2) "3A" +} + +-- SORT_REGULAR vs SORT_STRING -- +SORT_REGULAR: array(1) { + [0]=> + bool(true) +} +SORT_STRING: array(1) { + [0]=> + bool(true) +} + +Done diff --git a/ext/standard/tests/array/gh20262.phpt b/ext/standard/tests/array/gh20262.phpt new file mode 100644 index 000000000000..0eb98c0dc2fd --- /dev/null +++ b/ext/standard/tests/array/gh20262.phpt @@ -0,0 +1,19 @@ +--TEST-- +Bug GH-20262 (array_unique() with SORT_REGULAR fails to remove duplicates with mixed strings) +--FILE-- + +--EXPECT-- +array(3) { + [0]=> + string(1) "5" + [1]=> + string(2) "10" + [2]=> + string(2) "3A" +} From 603989786ca28e2835724970ecd4c753d5a5951e Mon Sep 17 00:00:00 2001 From: Jason Marble Date: Fri, 24 Oct 2025 01:46:33 -0700 Subject: [PATCH 2/4] array_unique: Security and performance improvements - Eliminate Hash-DoS and integer overflow vulnerabilities - Add exception handling to prevent memory leaks - Implement type-specific optimizations (hash/sort/bucket) - Dynamic bucket sizing for memory efficiency - Fix resource handling --- ext/standard/array.c | 348 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 281 insertions(+), 67 deletions(-) diff --git a/ext/standard/array.c b/ext/standard/array.c index e778451fa59f..bd48b79b3060 100644 --- a/ext/standard/array.c +++ b/ext/standard/array.c @@ -4980,104 +4980,299 @@ PHP_FUNCTION(array_unique) } if (sort_type == PHP_SORT_REGULAR) { - /* Hash-bucketing solution for SORT_REGULAR */ - #define UNIQUE_HASH_BUCKETS 256 + /* Detect data types in array to choose optimal algorithm */ + bool all_integers = true; + bool has_complex_types = false; /* arrays, objects only (NOT resources) */ + zval *check_val; + + ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(array), check_val) { + ZVAL_DEREF(check_val); + uint8_t type = Z_TYPE_P(check_val); + + if (type != IS_LONG) { + all_integers = false; + } + + /* Arrays and objects need sorting (they have deep comparison semantics). + * Resources use identity comparison, so they can stay in scalar path. */ + if (type == IS_ARRAY || type == IS_OBJECT) { + has_complex_types = true; + break; /* No point continuing - we'll use sort path */ + } + } ZEND_HASH_FOREACH_END(); + + /* For integer-only arrays, we can use a real hash table for O(N) performance */ + if (all_integers) { + HashTable seen; + zend_hash_init(&seen, zend_hash_num_elements(Z_ARRVAL_P(array)), NULL, NULL, 0); + array_init(return_value); + + ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(array), num_key, str_key, val) { + if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) { + val = Z_REFVAL_P(val); + } + zend_long int_val = Z_LVAL_P(val); + + /* Use integer value as hash key for O(1) lookup */ + if (!zend_hash_index_exists(&seen, int_val)) { + zend_hash_index_add_empty_element(&seen, int_val); + + /* Add to result */ + Z_TRY_ADDREF_P(val); + + if (str_key) { + zend_hash_add_new(Z_ARRVAL_P(return_value), str_key, val); + } else { + zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val); + } + } + + if (UNEXPECTED(EG(exception))) { + zend_hash_destroy(&seen); + return; + } + } ZEND_HASH_FOREACH_END(); + + zend_hash_destroy(&seen); + return; + } + + if (has_complex_types) { + /* Use the standard sort-based deduplication (same as other sort types) */ + cmp = php_get_data_compare_func_unstable(sort_type, 0); + + bool in_place = zend_may_modify_arg_in_place(array); + if (in_place) { + RETVAL_ARR(Z_ARRVAL_P(array)); + } else { + RETVAL_ARR(zend_array_dup(Z_ARRVAL_P(array))); + } + + /* Guard against integer overflow in allocation */ + uint32_t num_elements = Z_ARRVAL_P(array)->nNumOfElements; + if (UNEXPECTED(num_elements >= UINT32_MAX - 1)) { + zend_throw_error(NULL, "Array is too large for array_unique()"); + RETURN_THROWS(); + } + size_t alloc_size = (num_elements + 1) * sizeof(struct bucketindex); + if (UNEXPECTED(alloc_size / sizeof(struct bucketindex) != (num_elements + 1))) { + zend_throw_error(NULL, "Array is too large for array_unique()"); + RETURN_THROWS(); + } + + arTmp = pemalloc(alloc_size, GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); + + /* Copy array buckets to arTmp */ + if (HT_IS_PACKED(Z_ARRVAL_P(array))) { + zval *zv = Z_ARRVAL_P(array)->arPacked; + for (i = 0, idx = 0; idx < Z_ARRVAL_P(array)->nNumUsed; idx++, zv++) { + if (Z_TYPE_P(zv) == IS_UNDEF) continue; + ZVAL_COPY_VALUE(&arTmp[i].b.val, zv); + arTmp[i].b.h = idx; + arTmp[i].b.key = NULL; + arTmp[i].i = i; + i++; + } + } else { + p = Z_ARRVAL_P(array)->arData; + for (i = 0, idx = 0; idx < Z_ARRVAL_P(array)->nNumUsed; idx++, p++) { + if (Z_TYPE(p->val) == IS_UNDEF) continue; + arTmp[i].b = *p; + arTmp[i].i = i; + i++; + } + } + ZVAL_UNDEF(&arTmp[i].b.val); + + /* Sort array */ + zend_sort((void *) arTmp, i, sizeof(struct bucketindex), + (compare_func_t) cmp, (swap_func_t) array_bucketindex_swap); + + if (UNEXPECTED(EG(exception))) { + pefree(arTmp, GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); + return; + } + + /* Remove adjacent duplicates from sorted array */ + lastkept = arTmp; + for (cmpdata = arTmp + 1; Z_TYPE(cmpdata->b.val) != IS_UNDEF; cmpdata++) { + if (cmp(&lastkept->b, &cmpdata->b)) { + lastkept = cmpdata; + } else { + if (lastkept->i > cmpdata->i) { + p = &lastkept->b; + lastkept = cmpdata; + } else { + p = &cmpdata->b; + } + if (p->key == NULL) { + zend_hash_index_del(Z_ARRVAL_P(return_value), p->h); + } else { + zend_hash_del(Z_ARRVAL_P(return_value), p->key); + } + } + } + + pefree(arTmp, GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); + + if (in_place) { + Z_ADDREF_P(return_value); + } + + return; + } + + uint32_t num_elements = Z_ARRVAL_P(array)->nNumOfElements; + uint32_t bucket_count; + + if (num_elements < 64) { + bucket_count = 64; + } else if (num_elements < 256) { + bucket_count = 256; + } else if (num_elements < 1024) { + bucket_count = 1024; + } else if (num_elements < 4096) { + bucket_count = 4096; + } else { + bucket_count = 16384; + } + + #define SAFE_UNIQUE_HASH_BUCKETS bucket_count typedef struct { - zval **values; + zval *values; uint32_t count; uint32_t capacity; - } value_bucket; + } safe_value_bucket; - value_bucket *buckets = ecalloc(UNIQUE_HASH_BUCKETS, sizeof(value_bucket)); + safe_value_bucket *buckets = ecalloc(SAFE_UNIQUE_HASH_BUCKETS, sizeof(safe_value_bucket)); cmp = php_get_data_compare_func_unstable(sort_type, 0); array_init(return_value); + #define CLEANUP_BUCKETS() do { \ + for (uint32_t _i = 0; _i < SAFE_UNIQUE_HASH_BUCKETS; _i++) { \ + if (buckets[_i].values) { \ + for (uint32_t _j = 0; _j < buckets[_i].count; _j++) { \ + zval_ptr_dtor(&buckets[_i].values[_j]); \ + } \ + efree(buckets[_i].values); \ + } \ + } \ + efree(buckets); \ + } while (0) + ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(array), num_key, str_key, val) { - /* Dereference if this is a reference */ zval *deref_val = val; ZVAL_DEREF(deref_val); - /* Compute hash for this value */ - zend_ulong hash = 0; + zend_ulong hash; if (Z_TYPE_P(deref_val) == IS_LONG) { + /* Hash integer value directly */ hash = (zend_ulong)Z_LVAL_P(deref_val); } else if (Z_TYPE_P(deref_val) == IS_DOUBLE) { + /* Hash double as integer if it's a whole number */ double dval = Z_DVAL_P(deref_val); - if (zend_isnan(dval) || zend_isinf(dval)) { - hash = 0; /* NaN and Inf hash to 0 */ + if (zend_isnan(dval)) { + hash = 0xDEADBEEF; /* All NaNs to same bucket, cmp() ensures correctness */ + } else if (zend_isinf(dval)) { + /* +INF and -INF get distinct hashes */ + hash = dval > 0 ? ZEND_ULONG_MAX : ZEND_ULONG_MAX - 1; + } else if (dval == (double)(zend_long)dval) { + hash = (zend_ulong)(zend_long)dval; /* 5.0 hashes like 5 */ } else { - hash = (zend_ulong)(zend_long)dval; + /* Non-integer double - use bit pattern as hash to avoid string conversion */ + union { double d; zend_ulong ul; } u; + u.d = dval; + hash = u.ul; } - } else if (Z_TYPE_P(deref_val) == IS_TRUE) { - hash = 1; /* true hashes like integer 1 */ - } else if (Z_TYPE_P(deref_val) == IS_FALSE) { - hash = 0; /* false hashes like integer 0 */ - } else if (Z_TYPE_P(deref_val) == IS_NULL) { - hash = 0; /* null hashes like integer 0 */ } else if (Z_TYPE_P(deref_val) == IS_STRING) { - /* Check if numeric string */ + /* Check if it's a numeric string */ zend_long lval; double dval; - zend_uchar type = is_numeric_string(Z_STRVAL_P(deref_val), Z_STRLEN_P(deref_val), &lval, &dval, 0); - - if (type == IS_LONG) { - hash = (zend_ulong)lval; /* '5' and '05' hash the same */ - } else if (type == IS_DOUBLE) { - hash = (zend_ulong)dval; - } else { - /* Non-numeric string */ - if (Z_STRLEN_P(deref_val) == 0) { - hash = 0; /* Empty string might equal false/null */ + zend_uchar str_type = is_numeric_string(Z_STRVAL_P(deref_val), Z_STRLEN_P(deref_val), &lval, &dval, 0); + + if (str_type == IS_LONG) { + /* Numeric string - hash as integer so "5" hashes like 5 */ + hash = (zend_ulong)lval; + } else if (str_type == IS_DOUBLE) { + /* Numeric string with decimal */ + if (dval == (double)(zend_long)dval) { + hash = (zend_ulong)(zend_long)dval; } else { hash = zend_string_hash_val(Z_STR_P(deref_val)); } + } else if (Z_STRLEN_P(deref_val) == 0) { + /* Empty string "" compares equal to false/null/0 in SORT_REGULAR */ + hash = 0; + } else { + /* Non-numeric, non-empty string - use string hash */ + hash = zend_string_hash_val(Z_STR_P(deref_val)); } - } else if (Z_TYPE_P(deref_val) == IS_OBJECT) { - /* Hash objects by class name */ - zend_class_entry *ce = Z_OBJCE_P(deref_val); - hash = zend_string_hash_val(ce->name); - } else if (Z_TYPE_P(deref_val) == IS_ARRAY) { - /* Hash arrays by size and first value */ - hash = zend_hash_num_elements(Z_ARRVAL_P(deref_val)); - - /* XOR with hash of first element if it's a simple type */ - zval *first_elem = zend_hash_get_current_data(Z_ARRVAL_P(deref_val)); - if (first_elem) { - if (Z_TYPE_P(first_elem) == IS_LONG) { - hash ^= Z_LVAL_P(first_elem); - } else if (Z_TYPE_P(first_elem) == IS_STRING) { - hash ^= zend_string_hash_val(Z_STR_P(first_elem)); - } - } + } else if (Z_TYPE_P(deref_val) == IS_TRUE) { + hash = 1; /* true hashes like integer 1 */ + } else if (Z_TYPE_P(deref_val) == IS_FALSE || Z_TYPE_P(deref_val) == IS_NULL) { + hash = 0; /* false/null hash like integer 0 */ + } else if (Z_TYPE_P(deref_val) == IS_RESOURCE) { + /* Resources use identity comparison (like ===), hash by handle and type + * Include type to prevent collisions between different resource types */ + hash = (zend_ulong)Z_RES_HANDLE_P(deref_val) ^ (zend_ulong)Z_RES_TYPE_P(deref_val); } else { - /* Other types */ - hash = Z_TYPE_P(deref_val); + /* Note: Arrays and objects should never reach here as they trigger + * has_complex_types and use the sort path instead. This is just + * a fallback for any unexpected types. */ + hash = (zend_ulong)Z_TYPE_P(deref_val); } - uint32_t bucket_idx = hash % UNIQUE_HASH_BUCKETS; - value_bucket *bucket = &buckets[bucket_idx]; + uint32_t bucket_idx = hash % SAFE_UNIQUE_HASH_BUCKETS; + safe_value_bucket *bucket = &buckets[bucket_idx]; - /* Check if duplicate exists in this bucket */ + /* Check if duplicate exists in this bucket only */ bool is_duplicate = false; for (uint32_t i = 0; i < bucket->count; i++) { - zval *existing_deref = bucket->values[i]; - ZVAL_DEREF(existing_deref); - Bucket b1 = {.val = *deref_val}, b2 = {.val = *existing_deref}; + Bucket b1 = {.val = *deref_val}, b2 = {.val = bucket->values[i]}; if (cmp(&b1, &b2) == 0) { is_duplicate = true; break; } + + if (UNEXPECTED(EG(exception))) { + CLEANUP_BUCKETS(); + return; + } } if (!is_duplicate) { - /* Add to bucket */ - if (bucket->count >= bucket->capacity) { - bucket->capacity = bucket->capacity ? bucket->capacity * 2 : 4; - bucket->values = erealloc(bucket->values, bucket->capacity * sizeof(zval*)); + /* Grow bucket if needed - with overflow protection */ + if (UNEXPECTED(bucket->count >= bucket->capacity)) { + uint32_t new_capacity = bucket->capacity ? bucket->capacity * 2 : 4; + /* Check for overflow in capacity doubling */ + if (UNEXPECTED(new_capacity < bucket->capacity || new_capacity > UINT32_MAX / sizeof(zval))) { + /* Bucket too large - free all buckets and throw error */ + for (uint32_t j = 0; j < SAFE_UNIQUE_HASH_BUCKETS; j++) { + if (buckets[j].values) { + for (uint32_t k = 0; k < buckets[j].count; k++) { + zval_ptr_dtor(&buckets[j].values[k]); + } + efree(buckets[j].values); + } + } + efree(buckets); + zend_throw_error(NULL, "Array too large for array_unique()"); + RETURN_THROWS(); + } + bucket->values = safe_erealloc(bucket->values, new_capacity, sizeof(zval), 0); + bucket->capacity = new_capacity; + } + + /* Store value in bucket */ + ZVAL_COPY(&bucket->values[bucket->count], deref_val); + bucket->count++; + + if (UNEXPECTED(EG(exception))) { + CLEANUP_BUCKETS(); + return; } - bucket->values[bucket->count++] = val; /* Add to result */ if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) { @@ -5090,16 +5285,16 @@ PHP_FUNCTION(array_unique) } else { zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val); } + + if (UNEXPECTED(EG(exception))) { + CLEANUP_BUCKETS(); + return; + } } } ZEND_HASH_FOREACH_END(); - /* Cleanup buckets */ - for (uint32_t i = 0; i < UNIQUE_HASH_BUCKETS; i++) { - if (buckets[i].values) { - efree(buckets[i].values); - } - } - efree(buckets); + CLEANUP_BUCKETS(); + #undef CLEANUP_BUCKETS return; } @@ -5124,8 +5319,12 @@ PHP_FUNCTION(array_unique) zend_tmp_string_release(tmp_str_val); } + if (UNEXPECTED(EG(exception))) { + zend_hash_destroy(&seen); + return; + } + if (retval) { - /* First occurrence of the value */ if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) { ZVAL_DEREF(val); } @@ -5136,6 +5335,11 @@ PHP_FUNCTION(array_unique) } else { zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val); } + + if (UNEXPECTED(EG(exception))) { + zend_hash_destroy(&seen); + return; + } } } ZEND_HASH_FOREACH_END(); @@ -5153,7 +5357,17 @@ PHP_FUNCTION(array_unique) } /* create and sort array with pointers to the target_hash buckets */ - arTmp = pemalloc((Z_ARRVAL_P(array)->nNumOfElements + 1) * sizeof(struct bucketindex), GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); + uint32_t num_elements = Z_ARRVAL_P(array)->nNumOfElements; + if (UNEXPECTED(num_elements >= UINT32_MAX - 1)) { + zend_throw_error(NULL, "Array is too large for array_unique()"); + RETURN_THROWS(); + } + size_t alloc_size = (num_elements + 1) * sizeof(struct bucketindex); + if (UNEXPECTED(alloc_size / sizeof(struct bucketindex) != (num_elements + 1))) { + zend_throw_error(NULL, "Array is too large for array_unique()"); + RETURN_THROWS(); + } + arTmp = pemalloc(alloc_size, GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); if (HT_IS_PACKED(Z_ARRVAL_P(array))) { zval *zv = Z_ARRVAL_P(array)->arPacked; for (i = 0, idx = 0; idx < Z_ARRVAL_P(array)->nNumUsed; idx++, zv++) { From 4482f7d10570ca7626298e4b9cc5beadb68430d1 Mon Sep 17 00:00:00 2001 From: Jason Marble Date: Fri, 24 Oct 2025 18:27:54 -0700 Subject: [PATCH 3/4] Fix array_unique to dereference values Use ZVAL_DEREF macro to dereference values before comparing in the array_unique function. Also add a new test case to verify the behavior. --- ext/standard/array.c | 8 +- .../tests/array/array_unique_references.phpt | 163 ++++++++++++++++++ 2 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 ext/standard/tests/array/array_unique_references.phpt diff --git a/ext/standard/array.c b/ext/standard/array.c index bd48b79b3060..0199ec661792 100644 --- a/ext/standard/array.c +++ b/ext/standard/array.c @@ -5008,10 +5008,10 @@ PHP_FUNCTION(array_unique) array_init(return_value); ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(array), num_key, str_key, val) { - if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) { - val = Z_REFVAL_P(val); - } - zend_long int_val = Z_LVAL_P(val); + /* Dereference if this is a reference */ + zval *deref_val = val; + ZVAL_DEREF(deref_val); + zend_long int_val = Z_LVAL_P(deref_val); /* Use integer value as hash key for O(1) lookup */ if (!zend_hash_index_exists(&seen, int_val)) { diff --git a/ext/standard/tests/array/array_unique_references.phpt b/ext/standard/tests/array/array_unique_references.phpt new file mode 100644 index 000000000000..252f18171427 --- /dev/null +++ b/ext/standard/tests/array/array_unique_references.phpt @@ -0,0 +1,163 @@ +--TEST-- +Test array_unique() function : proper handling of references (ZVAL_DEREF) +--FILE-- + &$v1, "b" => &$v2, "c" => "test"]; +var_dump(array_unique($arr, SORT_REGULAR)); + +// Test 10: Type coercion with references in SORT_REGULAR +echo "\n-- Type coercion with references --\n"; +$int = 1; +$str = "1"; +$arr = [&$int, &$str, 1, "1"]; +var_dump(array_unique($arr, SORT_REGULAR)); + +echo "\nDone\n"; +?> +--EXPECT-- +*** Testing array_unique() with references (ZVAL_DEREF behavior) *** + +-- Integer references (integer-only path) -- +array(2) { + [0]=> + &int(5) + [2]=> + int(10) +} + +-- Reference to same variable -- +array(1) { + [0]=> + &int(42) +} + +-- String references (bucket path) -- +array(1) { + [0]=> + &string(5) "hello" +} + +-- Mixed types with references -- +array(2) { + [0]=> + &int(5) + [1]=> + &string(4) "test" +} + +-- References with SORT_NUMERIC -- +array(2) { + [0]=> + &int(15) + [1]=> + int(20) +} + +-- References with SORT_STRING -- +array(2) { + [0]=> + &string(3) "abc" + [1]=> + string(3) "def" +} + +-- Array references (complex types path) -- +array(2) { + [0]=> + &array(2) { + [0]=> + int(1) + [1]=> + int(2) + } + [2]=> + array(2) { + [0]=> + int(1) + [1]=> + int(3) + } +} + +-- Float references -- +array(2) { + [0]=> + &float(3.14) + [2]=> + float(2.71) +} + +-- References preserve keys -- +array(1) { + ["a"]=> + &string(4) "test" +} + +-- Type coercion with references -- +array(1) { + [0]=> + &int(1) +} + +Done From 76d2f626b8e3667e62e60d2b8aaaa757401db553 Mon Sep 17 00:00:00 2001 From: Jason Marble Date: Sat, 25 Oct 2025 08:25:02 -0700 Subject: [PATCH 4/4] Remove duplicate code for complex type deduplication --- ext/standard/array.c | 86 +++----------------------------------------- 1 file changed, 4 insertions(+), 82 deletions(-) diff --git a/ext/standard/array.c b/ext/standard/array.c index 0199ec661792..2b2b5dc2bf03 100644 --- a/ext/standard/array.c +++ b/ext/standard/array.c @@ -5038,88 +5038,9 @@ PHP_FUNCTION(array_unique) } if (has_complex_types) { - /* Use the standard sort-based deduplication (same as other sort types) */ - cmp = php_get_data_compare_func_unstable(sort_type, 0); - - bool in_place = zend_may_modify_arg_in_place(array); - if (in_place) { - RETVAL_ARR(Z_ARRVAL_P(array)); - } else { - RETVAL_ARR(zend_array_dup(Z_ARRVAL_P(array))); - } - - /* Guard against integer overflow in allocation */ - uint32_t num_elements = Z_ARRVAL_P(array)->nNumOfElements; - if (UNEXPECTED(num_elements >= UINT32_MAX - 1)) { - zend_throw_error(NULL, "Array is too large for array_unique()"); - RETURN_THROWS(); - } - size_t alloc_size = (num_elements + 1) * sizeof(struct bucketindex); - if (UNEXPECTED(alloc_size / sizeof(struct bucketindex) != (num_elements + 1))) { - zend_throw_error(NULL, "Array is too large for array_unique()"); - RETURN_THROWS(); - } - - arTmp = pemalloc(alloc_size, GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); - - /* Copy array buckets to arTmp */ - if (HT_IS_PACKED(Z_ARRVAL_P(array))) { - zval *zv = Z_ARRVAL_P(array)->arPacked; - for (i = 0, idx = 0; idx < Z_ARRVAL_P(array)->nNumUsed; idx++, zv++) { - if (Z_TYPE_P(zv) == IS_UNDEF) continue; - ZVAL_COPY_VALUE(&arTmp[i].b.val, zv); - arTmp[i].b.h = idx; - arTmp[i].b.key = NULL; - arTmp[i].i = i; - i++; - } - } else { - p = Z_ARRVAL_P(array)->arData; - for (i = 0, idx = 0; idx < Z_ARRVAL_P(array)->nNumUsed; idx++, p++) { - if (Z_TYPE(p->val) == IS_UNDEF) continue; - arTmp[i].b = *p; - arTmp[i].i = i; - i++; - } - } - ZVAL_UNDEF(&arTmp[i].b.val); - - /* Sort array */ - zend_sort((void *) arTmp, i, sizeof(struct bucketindex), - (compare_func_t) cmp, (swap_func_t) array_bucketindex_swap); - - if (UNEXPECTED(EG(exception))) { - pefree(arTmp, GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); - return; - } - - /* Remove adjacent duplicates from sorted array */ - lastkept = arTmp; - for (cmpdata = arTmp + 1; Z_TYPE(cmpdata->b.val) != IS_UNDEF; cmpdata++) { - if (cmp(&lastkept->b, &cmpdata->b)) { - lastkept = cmpdata; - } else { - if (lastkept->i > cmpdata->i) { - p = &lastkept->b; - lastkept = cmpdata; - } else { - p = &cmpdata->b; - } - if (p->key == NULL) { - zend_hash_index_del(Z_ARRVAL_P(return_value), p->h); - } else { - zend_hash_del(Z_ARRVAL_P(return_value), p->key); - } - } - } - - pefree(arTmp, GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); - - if (in_place) { - Z_ADDREF_P(return_value); - } - - return; + /* Arrays and objects need sort-based deduplication. + * Fall through to the standard sort path below. */ + goto sort_based_dedup; } uint32_t num_elements = Z_ARRVAL_P(array)->nNumOfElements; @@ -5347,6 +5268,7 @@ PHP_FUNCTION(array_unique) return; } +sort_based_dedup: cmp = php_get_data_compare_func_unstable(sort_type, 0); bool in_place = zend_may_modify_arg_in_place(array);