Skip to content

Commit

Permalink
Update to PCRE2 10.39
Browse files Browse the repository at this point in the history
We also apply a respective upstream fix[1].

[1] <PCRE2Project/pcre2@d144199>

Closes GH-7678.
  • Loading branch information
cmb69 committed Nov 25, 2021
1 parent df5e95b commit 6008a75
Show file tree
Hide file tree
Showing 27 changed files with 4,633 additions and 4,146 deletions.
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ PHP NEWS
. Fixed bug #81649 (imap_(un)delete accept sequences, not single numbers).
(cmb)

- PCRE:
. Update bundled PCRE2 to 10.39 (cmb)

25 Nov 2021, PHP 8.1.0

- Core:
Expand Down
12 changes: 7 additions & 5 deletions ext/pcre/pcre2lib/pcre2.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, second API, to be
#included by applications that call PCRE2 functions.
Copyright (c) 2016-2020 University of Cambridge
Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */

#define PCRE2_MAJOR 10
#define PCRE2_MINOR 37
#define PCRE2_MINOR 39
#define PCRE2_PRERELEASE
#define PCRE2_DATE 2021-05-26
#define PCRE2_DATE 2021-10-29

/* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE2, the appropriate
Expand Down Expand Up @@ -84,8 +84,8 @@ set, we ensure here that it has no effect. */
/* Have to include limits.h, stdlib.h, and inttypes.h to ensure that size_t and
uint8_t, UCHAR_MAX, etc are defined. Some systems that do have inttypes.h do
not have stdint.h, which is why we use inttypes.h, which according to the C
standard is a superset of stdint.h. If none of these headers are available,
the relevant values must be provided by some other means. */
standard is a superset of stdint.h. If inttypes.h is not available the build
will break and the relevant values must be provided by some other means. */

#include <limits.h>
#include <stdlib.h>
Expand Down Expand Up @@ -152,6 +152,7 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */

/* These are for pcre2_jit_compile(). */

Expand Down Expand Up @@ -311,6 +312,7 @@ pcre2_pattern_convert(). */
#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196
#define PCRE2_ERROR_TOO_MANY_CAPTURES 197
#define PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED 198
#define PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND 199


/* "Expected" matching error codes: no match and partial match. */
Expand Down
34 changes: 24 additions & 10 deletions ext/pcre/pcre2lib/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -137,7 +137,7 @@ static BOOL

static int
check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
compile_block *);
compile_block *, int *);


/*************************************************
Expand Down Expand Up @@ -782,12 +782,15 @@ are allowed. */
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX)
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)

/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. */
added to compile_error_texts in pcre2_error.c. Also, the error codes in
pcre2.h.in must be updated - their values are exactly 100 greater than these
values. */

enum { ERR0 = COMPILE_ERROR_BASE,
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
Expand All @@ -799,7 +802,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98 };
ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };

/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
Expand Down Expand Up @@ -7799,6 +7802,16 @@ for (;; pptr++)
}
#endif

/* \K is forbidden in lookarounds since 10.38 because that's what Perl has
done. However, there's an option, in case anyone was relying on it. */

if (cb->assert_depth > 0 && meta_arg == ESC_K &&
(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
{
*errorcodeptr = ERR99;
return 0;
}

/* For the rest (including \X when Unicode is supported - if not it's
faulted at parse time), the OP value is the escape value when PCRE2_UCP is
not set; if it is set, these escapes do not show up here because they are
Expand Down Expand Up @@ -9148,7 +9161,7 @@ for (;; pptr++)
case META_LOOKAHEAD:
case META_LOOKAHEADNOT:
case META_LOOKAHEAD_NA:
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb);
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
if (*errcodeptr != 0) return -1;

/* Ignore any qualifiers that follow a lookahead assertion. */
Expand Down Expand Up @@ -9488,16 +9501,16 @@ Arguments
retptr if not NULL, return the ket pointer here
recurses chain of recurse_check to catch mutual recursion
cb points to the compile block
lcptr points to loop counter
Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
*/

static int
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
parsed_recurse_check *recurses, compile_block *cb)
parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
{
int errorcode = 0;
int loopcount = 0;
int nestlevel = 0;

cb->erroroffset = PCRE2_UNSET;
Expand Down Expand Up @@ -9623,7 +9636,7 @@ for (; *pptr != META_END; pptr++)
case META_LOOKBEHIND:
case META_LOOKBEHINDNOT:
case META_LOOKBEHIND_NA:
if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, recurses, cb))
if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
return errorcode;
break;
}
Expand Down Expand Up @@ -10078,7 +10091,8 @@ lengths. */

if (has_lookbehind)
{
errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb);
int loopcount = 0;
errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
if (errorcode != 0) goto HAD_CB_ERROR;
}

Expand Down
69 changes: 44 additions & 25 deletions ext/pcre/pcre2lib/pcre2_dfa_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -3256,8 +3256,8 @@ BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;

#if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE;
BOOL memchr_not_found_first_cu2 = FALSE;
PCRE2_SPTR memchr_found_first_cu = NULL;
PCRE2_SPTR memchr_found_first_cu2 = NULL;
#endif

PCRE2_UCHAR first_cu = 0;
Expand Down Expand Up @@ -3648,57 +3648,76 @@ for (;;)
}
}

/* Not anchored. Advance to a unique first code unit if there is one. In
8-bit mode, the use of memchr() gives a big speed up, even though we have
to call it twice in caseless mode, in order to find the earliest occurrence
of the character in either of its cases. If a call to memchr() that
searches the rest of the subject fails to find one case, remember that in
order not to keep on repeating the search. This can make a huge difference
when the strings are very long and only one case is present. */
/* Not anchored. Advance to a unique first code unit if there is one. */

else
{
if (has_first_cu)
{
if (first_cu != first_cu2) /* Caseless */
{
/* In 16-bit and 32-bit modes we have to do our own search, so can
look for both cases at once. */

#if PCRE2_CODE_UNIT_WIDTH != 8
PCRE2_UCHAR smc;
while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu &&
smc != first_cu2)
smc != first_cu2)
start_match++;
#else
/* In 8-bit mode, the use of memchr() gives a big speed up, even
though we have to call it twice in order to find the earliest
occurrence of the code unit in either of its cases. Caching is used
to remember the positions of previously found code units. This can
make a huge difference when the strings are very long and only one
case is actually present. */

#else /* 8-bit code units */
PCRE2_SPTR pp1 = NULL;
PCRE2_SPTR pp2 = NULL;
PCRE2_SIZE cu2size = end_subject - start_match;
PCRE2_SIZE searchlength = end_subject - start_match;

if (!memchr_not_found_first_cu)
/* If we haven't got a previously found position for first_cu, or if
the current starting position is later, we need to do a search. If
the code unit is not found, set it to the end. */

if (memchr_found_first_cu == NULL ||
start_match > memchr_found_first_cu)
{
pp1 = memchr(start_match, first_cu, end_subject - start_match);
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
else cu2size = pp1 - start_match;
pp1 = memchr(start_match, first_cu, searchlength);
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
}

/* If pp1 is not NULL, we have arranged to search only as far as pp1,
to see if the other case is earlier, so we can set "not found" only
when both searches have returned NULL. */
/* If the start is before a previously found position, use the
previous position, or NULL if a previous search failed. */

else pp1 = (memchr_found_first_cu == end_subject)? NULL :
memchr_found_first_cu;

if (!memchr_not_found_first_cu2)
/* Do the same thing for the other case. */

if (memchr_found_first_cu2 == NULL ||
start_match > memchr_found_first_cu2)
{
pp2 = memchr(start_match, first_cu2, cu2size);
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
pp2 = memchr(start_match, first_cu2, searchlength);
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
}

else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
memchr_found_first_cu2;

/* Set the start to the end of the subject if neither case was found.
Otherwise, use the earlier found point. */

if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2;
else
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
#endif

#endif /* 8-bit handling */
}

/* The caseful case */
/* The caseful case is much simpler. */

else
{
Expand Down
3 changes: 2 additions & 1 deletion ext/pcre/pcre2lib/pcre2_error.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -186,6 +186,7 @@ static const unsigned char compile_error_texts[] =
"script runs require Unicode support, which this version of PCRE2 does not have\0"
"too many capturing groups (maximum 65535)\0"
"atomic assertion expected after (?( or (?(?C)\0"
"\\K is not allowed in lookarounds (but see PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)\0"
;

/* Match-time and UTF error texts are in the same format. */
Expand Down

0 comments on commit 6008a75

Please sign in to comment.