diff --git a/ext/standard/array.c b/ext/standard/array.c index 4097d71899011..2b2b5dc2bf031 100644 --- a/ext/standard/array.c +++ b/ext/standard/array.c @@ -4964,6 +4964,9 @@ PHP_FUNCTION(array_unique) bucket_compare_func_t cmp; struct bucketindex *arTmp, *cmpdata, *lastkept; uint32_t i, idx; + zend_long num_key; + zend_string *str_key; + zval *val; ZEND_PARSE_PARAMETERS_START(1, 2) Z_PARAM_ARRAY(array) @@ -4976,6 +4979,247 @@ PHP_FUNCTION(array_unique) return; } + if (sort_type == PHP_SORT_REGULAR) { + /* Detect data types in array to choose optimal algorithm */ + bool all_integers = true; + bool has_complex_types = false; /* arrays, objects only (NOT resources) */ + zval *check_val; + + ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(array), check_val) { + ZVAL_DEREF(check_val); + uint8_t type = Z_TYPE_P(check_val); + + if (type != IS_LONG) { + all_integers = false; + } + + /* Arrays and objects need sorting (they have deep comparison semantics). + * Resources use identity comparison, so they can stay in scalar path. */ + if (type == IS_ARRAY || type == IS_OBJECT) { + has_complex_types = true; + break; /* No point continuing - we'll use sort path */ + } + } ZEND_HASH_FOREACH_END(); + + /* For integer-only arrays, we can use a real hash table for O(N) performance */ + if (all_integers) { + HashTable seen; + zend_hash_init(&seen, zend_hash_num_elements(Z_ARRVAL_P(array)), NULL, NULL, 0); + array_init(return_value); + + ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(array), num_key, str_key, val) { + /* Dereference if this is a reference */ + zval *deref_val = val; + ZVAL_DEREF(deref_val); + zend_long int_val = Z_LVAL_P(deref_val); + + /* Use integer value as hash key for O(1) lookup */ + if (!zend_hash_index_exists(&seen, int_val)) { + zend_hash_index_add_empty_element(&seen, int_val); + + /* Add to result */ + Z_TRY_ADDREF_P(val); + + if (str_key) { + zend_hash_add_new(Z_ARRVAL_P(return_value), str_key, val); + } else { + zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val); + } + } + + if (UNEXPECTED(EG(exception))) { + zend_hash_destroy(&seen); + return; + } + } ZEND_HASH_FOREACH_END(); + + zend_hash_destroy(&seen); + return; + } + + if (has_complex_types) { + /* Arrays and objects need sort-based deduplication. + * Fall through to the standard sort path below. */ + goto sort_based_dedup; + } + + uint32_t num_elements = Z_ARRVAL_P(array)->nNumOfElements; + uint32_t bucket_count; + + if (num_elements < 64) { + bucket_count = 64; + } else if (num_elements < 256) { + bucket_count = 256; + } else if (num_elements < 1024) { + bucket_count = 1024; + } else if (num_elements < 4096) { + bucket_count = 4096; + } else { + bucket_count = 16384; + } + + #define SAFE_UNIQUE_HASH_BUCKETS bucket_count + + typedef struct { + zval *values; + uint32_t count; + uint32_t capacity; + } safe_value_bucket; + + safe_value_bucket *buckets = ecalloc(SAFE_UNIQUE_HASH_BUCKETS, sizeof(safe_value_bucket)); + cmp = php_get_data_compare_func_unstable(sort_type, 0); + array_init(return_value); + + #define CLEANUP_BUCKETS() do { \ + for (uint32_t _i = 0; _i < SAFE_UNIQUE_HASH_BUCKETS; _i++) { \ + if (buckets[_i].values) { \ + for (uint32_t _j = 0; _j < buckets[_i].count; _j++) { \ + zval_ptr_dtor(&buckets[_i].values[_j]); \ + } \ + efree(buckets[_i].values); \ + } \ + } \ + efree(buckets); \ + } while (0) + + ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(array), num_key, str_key, val) { + zval *deref_val = val; + ZVAL_DEREF(deref_val); + + zend_ulong hash; + + if (Z_TYPE_P(deref_val) == IS_LONG) { + /* Hash integer value directly */ + hash = (zend_ulong)Z_LVAL_P(deref_val); + } else if (Z_TYPE_P(deref_val) == IS_DOUBLE) { + /* Hash double as integer if it's a whole number */ + double dval = Z_DVAL_P(deref_val); + if (zend_isnan(dval)) { + hash = 0xDEADBEEF; /* All NaNs to same bucket, cmp() ensures correctness */ + } else if (zend_isinf(dval)) { + /* +INF and -INF get distinct hashes */ + hash = dval > 0 ? ZEND_ULONG_MAX : ZEND_ULONG_MAX - 1; + } else if (dval == (double)(zend_long)dval) { + hash = (zend_ulong)(zend_long)dval; /* 5.0 hashes like 5 */ + } else { + /* Non-integer double - use bit pattern as hash to avoid string conversion */ + union { double d; zend_ulong ul; } u; + u.d = dval; + hash = u.ul; + } + } else if (Z_TYPE_P(deref_val) == IS_STRING) { + /* Check if it's a numeric string */ + zend_long lval; + double dval; + zend_uchar str_type = is_numeric_string(Z_STRVAL_P(deref_val), Z_STRLEN_P(deref_val), &lval, &dval, 0); + + if (str_type == IS_LONG) { + /* Numeric string - hash as integer so "5" hashes like 5 */ + hash = (zend_ulong)lval; + } else if (str_type == IS_DOUBLE) { + /* Numeric string with decimal */ + if (dval == (double)(zend_long)dval) { + hash = (zend_ulong)(zend_long)dval; + } else { + hash = zend_string_hash_val(Z_STR_P(deref_val)); + } + } else if (Z_STRLEN_P(deref_val) == 0) { + /* Empty string "" compares equal to false/null/0 in SORT_REGULAR */ + hash = 0; + } else { + /* Non-numeric, non-empty string - use string hash */ + hash = zend_string_hash_val(Z_STR_P(deref_val)); + } + } else if (Z_TYPE_P(deref_val) == IS_TRUE) { + hash = 1; /* true hashes like integer 1 */ + } else if (Z_TYPE_P(deref_val) == IS_FALSE || Z_TYPE_P(deref_val) == IS_NULL) { + hash = 0; /* false/null hash like integer 0 */ + } else if (Z_TYPE_P(deref_val) == IS_RESOURCE) { + /* Resources use identity comparison (like ===), hash by handle and type + * Include type to prevent collisions between different resource types */ + hash = (zend_ulong)Z_RES_HANDLE_P(deref_val) ^ (zend_ulong)Z_RES_TYPE_P(deref_val); + } else { + /* Note: Arrays and objects should never reach here as they trigger + * has_complex_types and use the sort path instead. This is just + * a fallback for any unexpected types. */ + hash = (zend_ulong)Z_TYPE_P(deref_val); + } + + uint32_t bucket_idx = hash % SAFE_UNIQUE_HASH_BUCKETS; + safe_value_bucket *bucket = &buckets[bucket_idx]; + + /* Check if duplicate exists in this bucket only */ + bool is_duplicate = false; + for (uint32_t i = 0; i < bucket->count; i++) { + Bucket b1 = {.val = *deref_val}, b2 = {.val = bucket->values[i]}; + if (cmp(&b1, &b2) == 0) { + is_duplicate = true; + break; + } + + if (UNEXPECTED(EG(exception))) { + CLEANUP_BUCKETS(); + return; + } + } + + if (!is_duplicate) { + /* Grow bucket if needed - with overflow protection */ + if (UNEXPECTED(bucket->count >= bucket->capacity)) { + uint32_t new_capacity = bucket->capacity ? bucket->capacity * 2 : 4; + /* Check for overflow in capacity doubling */ + if (UNEXPECTED(new_capacity < bucket->capacity || new_capacity > UINT32_MAX / sizeof(zval))) { + /* Bucket too large - free all buckets and throw error */ + for (uint32_t j = 0; j < SAFE_UNIQUE_HASH_BUCKETS; j++) { + if (buckets[j].values) { + for (uint32_t k = 0; k < buckets[j].count; k++) { + zval_ptr_dtor(&buckets[j].values[k]); + } + efree(buckets[j].values); + } + } + efree(buckets); + zend_throw_error(NULL, "Array too large for array_unique()"); + RETURN_THROWS(); + } + bucket->values = safe_erealloc(bucket->values, new_capacity, sizeof(zval), 0); + bucket->capacity = new_capacity; + } + + /* Store value in bucket */ + ZVAL_COPY(&bucket->values[bucket->count], deref_val); + bucket->count++; + + if (UNEXPECTED(EG(exception))) { + CLEANUP_BUCKETS(); + return; + } + + /* Add to result */ + if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) { + ZVAL_DEREF(val); + } + Z_TRY_ADDREF_P(val); + + if (str_key) { + zend_hash_add_new(Z_ARRVAL_P(return_value), str_key, val); + } else { + zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val); + } + + if (UNEXPECTED(EG(exception))) { + CLEANUP_BUCKETS(); + return; + } + } + } ZEND_HASH_FOREACH_END(); + + CLEANUP_BUCKETS(); + #undef CLEANUP_BUCKETS + + return; + } + if (sort_type == PHP_SORT_STRING) { HashTable seen; zend_long num_key; @@ -4996,8 +5240,12 @@ PHP_FUNCTION(array_unique) zend_tmp_string_release(tmp_str_val); } + if (UNEXPECTED(EG(exception))) { + zend_hash_destroy(&seen); + return; + } + if (retval) { - /* First occurrence of the value */ if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) { ZVAL_DEREF(val); } @@ -5008,6 +5256,11 @@ PHP_FUNCTION(array_unique) } else { zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val); } + + if (UNEXPECTED(EG(exception))) { + zend_hash_destroy(&seen); + return; + } } } ZEND_HASH_FOREACH_END(); @@ -5015,6 +5268,7 @@ PHP_FUNCTION(array_unique) return; } +sort_based_dedup: cmp = php_get_data_compare_func_unstable(sort_type, 0); bool in_place = zend_may_modify_arg_in_place(array); @@ -5025,7 +5279,17 @@ PHP_FUNCTION(array_unique) } /* create and sort array with pointers to the target_hash buckets */ - arTmp = pemalloc((Z_ARRVAL_P(array)->nNumOfElements + 1) * sizeof(struct bucketindex), GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); + uint32_t num_elements = Z_ARRVAL_P(array)->nNumOfElements; + if (UNEXPECTED(num_elements >= UINT32_MAX - 1)) { + zend_throw_error(NULL, "Array is too large for array_unique()"); + RETURN_THROWS(); + } + size_t alloc_size = (num_elements + 1) * sizeof(struct bucketindex); + if (UNEXPECTED(alloc_size / sizeof(struct bucketindex) != (num_elements + 1))) { + zend_throw_error(NULL, "Array is too large for array_unique()"); + RETURN_THROWS(); + } + arTmp = pemalloc(alloc_size, GC_FLAGS(Z_ARRVAL_P(array)) & IS_ARRAY_PERSISTENT); if (HT_IS_PACKED(Z_ARRVAL_P(array))) { zval *zv = Z_ARRVAL_P(array)->arPacked; for (i = 0, idx = 0; idx < Z_ARRVAL_P(array)->nNumUsed; idx++, zv++) { diff --git a/ext/standard/tests/array/array_unique_references.phpt b/ext/standard/tests/array/array_unique_references.phpt new file mode 100644 index 0000000000000..252f181714277 --- /dev/null +++ b/ext/standard/tests/array/array_unique_references.phpt @@ -0,0 +1,163 @@ +--TEST-- +Test array_unique() function : proper handling of references (ZVAL_DEREF) +--FILE-- + &$v1, "b" => &$v2, "c" => "test"]; +var_dump(array_unique($arr, SORT_REGULAR)); + +// Test 10: Type coercion with references in SORT_REGULAR +echo "\n-- Type coercion with references --\n"; +$int = 1; +$str = "1"; +$arr = [&$int, &$str, 1, "1"]; +var_dump(array_unique($arr, SORT_REGULAR)); + +echo "\nDone\n"; +?> +--EXPECT-- +*** Testing array_unique() with references (ZVAL_DEREF behavior) *** + +-- Integer references (integer-only path) -- +array(2) { + [0]=> + &int(5) + [2]=> + int(10) +} + +-- Reference to same variable -- +array(1) { + [0]=> + &int(42) +} + +-- String references (bucket path) -- +array(1) { + [0]=> + &string(5) "hello" +} + +-- Mixed types with references -- +array(2) { + [0]=> + &int(5) + [1]=> + &string(4) "test" +} + +-- References with SORT_NUMERIC -- +array(2) { + [0]=> + &int(15) + [1]=> + int(20) +} + +-- References with SORT_STRING -- +array(2) { + [0]=> + &string(3) "abc" + [1]=> + string(3) "def" +} + +-- Array references (complex types path) -- +array(2) { + [0]=> + &array(2) { + [0]=> + int(1) + [1]=> + int(2) + } + [2]=> + array(2) { + [0]=> + int(1) + [1]=> + int(3) + } +} + +-- Float references -- +array(2) { + [0]=> + &float(3.14) + [2]=> + float(2.71) +} + +-- References preserve keys -- +array(1) { + ["a"]=> + &string(4) "test" +} + +-- Type coercion with references -- +array(1) { + [0]=> + &int(1) +} + +Done diff --git a/ext/standard/tests/array/array_unique_variation_sort_regular.phpt b/ext/standard/tests/array/array_unique_variation_sort_regular.phpt new file mode 100644 index 0000000000000..a1b067bd5cd27 --- /dev/null +++ b/ext/standard/tests/array/array_unique_variation_sort_regular.phpt @@ -0,0 +1,214 @@ +--TEST-- +Test array_unique() function : SORT_REGULAR type coercion behavior +--FILE-- + +--EXPECT-- +*** Testing array_unique() with SORT_REGULAR *** + +-- Integer and string coercion -- +array(2) { + [0]=> + int(1) + [2]=> + int(2) +} + +-- Boolean coercion -- +array(2) { + [0]=> + bool(true) + [2]=> + bool(false) +} + +-- NULL coercion -- +array(2) { + [0]=> + NULL + [4]=> + string(1) "0" +} + +-- Float coercion -- +array(1) { + [0]=> + int(1) +} + +-- Numeric strings -- +array(1) { + [0]=> + string(2) "10" +} + +-- Leading zeros -- +array(1) { + [0]=> + string(2) "05" +} + +-- Partial numeric strings -- +array(2) { + [0]=> + string(4) "5abc" + [1]=> + string(1) "5" +} + +-- Whitespace in numeric strings -- +array(1) { + [0]=> + string(1) "5" +} + +-- Case sensitivity -- +array(3) { + [0]=> + string(3) "abc" + [1]=> + string(3) "ABC" + [2]=> + string(3) "Abc" +} + +-- Exponential notation -- +array(1) { + [0]=> + int(1000) +} + +-- Negative numbers -- +array(1) { + [0]=> + int(-5) +} + +-- Arrays -- +array(2) { + [0]=> + array(2) { + [0]=> + int(1) + [1]=> + int(2) + } + [2]=> + array(2) { + [0]=> + int(1) + [1]=> + int(3) + } +} + +-- NaN handling -- +array(3) { + [0]=> + float(NAN) + [1]=> + float(NAN) + [2]=> + int(1) +} + +-- INF handling -- +array(2) { + [0]=> + float(INF) + [2]=> + float(-INF) +} + +-- Bug GH-20262 case -- +array(3) { + [0]=> + string(1) "5" + [1]=> + string(2) "10" + [2]=> + string(2) "3A" +} + +-- SORT_REGULAR vs SORT_STRING -- +SORT_REGULAR: array(1) { + [0]=> + bool(true) +} +SORT_STRING: array(1) { + [0]=> + bool(true) +} + +Done diff --git a/ext/standard/tests/array/gh20262.phpt b/ext/standard/tests/array/gh20262.phpt new file mode 100644 index 0000000000000..0eb98c0dc2fd3 --- /dev/null +++ b/ext/standard/tests/array/gh20262.phpt @@ -0,0 +1,19 @@ +--TEST-- +Bug GH-20262 (array_unique() with SORT_REGULAR fails to remove duplicates with mixed strings) +--FILE-- + +--EXPECT-- +array(3) { + [0]=> + string(1) "5" + [1]=> + string(2) "10" + [2]=> + string(2) "3A" +}