Skip to content
This repository has been archived by the owner on Jun 1, 2023. It is now read-only.

Commit

Permalink
use utf8 Script - add is_{LATIN,COMMON}_SCRIPT_utf8
Browse files Browse the repository at this point in the history
let them be generated by regen/regcharclass.pl

change API from UV cp to U8*s. Spares a costly utf8n_to_uvchr
conversion in hot code. Do this only when throwing the error.

Slowdown <1%
See GH #229.
  • Loading branch information
Reini Urban committed Dec 4, 2016
1 parent 011d777 commit f986a23
Show file tree
Hide file tree
Showing 9 changed files with 1,073 additions and 180 deletions.
2 changes: 1 addition & 1 deletion charclass_invlists.h
Original file line number Diff line number Diff line change
Expand Up @@ -87862,7 +87862,7 @@ static const U8 WB_table[19][19] = {
#endif /* defined(PERL_IN_REGEXEC_C) */

/* Generated from:
* cb3170dd603ad12ba0299440e99e8f50a8afde60ade2ffcbf1ff4a8a53854b90 lib/Unicode/UCD.pm
* 6a03471439e81d73cc6b982218c18f6eecff96e5a0c263d58bea2683ec67e1e4 lib/Unicode/UCD.pm
* ae98bec7e4f0564758eed81eca5015481ba32581f8a735a825b71b3bba714450 lib/unicore/ArabicShaping.txt
* 1687fe5994eb7e5c0dab8503fc2a1b3b479d91af9d3b8055941c9bd791f7d0b5 lib/unicore/BidiBrackets.txt
* 350d1302116194b0b21def287434b55c5088098fbc726e879f7420a391965643 lib/unicore/BidiMirroring.txt
Expand Down
3 changes: 2 additions & 1 deletion embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -1601,7 +1601,7 @@ EXMpn |void |_invlist_populate_swatch |NN SV* const invlist|const UV start \
EXp |SV* |_core_swash_init|NN const char* pkg|NN const char* name \
|NN SV* listsv|I32 minbits|I32 none \
|NULLOK SV* invlist|NULLOK U8* const flags_p
EXp |void |utf8_check_script|const UV uv
EXMp |void |utf8_check_script|NN const U8 *s
#endif
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_UTF8_C)
EiMRn |UV* |invlist_array |NN SV* const invlist
Expand Down Expand Up @@ -2653,6 +2653,7 @@ sR |SV* |swatch_get |NN SV* swash|UV start|UV span
sRM |U8* |swash_scan_list_line|NN U8* l|NN U8* const lend|NN UV* min \
|NN UV* max|NN UV* val|const bool wants_value \
|NN const U8* const typestr
sR |char* |utf8_get_script|NN const U8 *s
#endif

#ifndef PERL_NO_INLINE_FUNCTIONS
Expand Down
1 change: 1 addition & 0 deletions embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -1838,6 +1838,7 @@
#define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
#define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c)
#define to_lower_latin1 S_to_lower_latin1
#define utf8_get_script(a) S_utf8_get_script(aTHX_ a)
# endif
# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
#define _to_upper_title_latin1(a,b,c,d) Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
Expand Down
2 changes: 1 addition & 1 deletion pod/perldiag.pod
Original file line number Diff line number Diff line change
Expand Up @@ -3060,7 +3060,7 @@ than a colon or whitespace was seen between the elements of a layer list.
If the previous attribute had a parenthesised parameter list, perhaps that
list was terminated too soon.

=item script %s in identifier for U+%04
=item Invalid script %s in identifier %s for U+%04

(F) An invalid unicode character was found in an identifier, with a
Script property outside of Latin or Common, which was not declared by
Expand Down
13 changes: 11 additions & 2 deletions proto.h
Original file line number Diff line number Diff line change
Expand Up @@ -8804,8 +8804,11 @@ PERL_CALLCONV SV* Perl__core_swash_init(pTHX_ const char* pkg, const char* name,
#define PERL_ARGS_ASSERT__CORE_SWASH_INIT \
assert(pkg); assert(name); assert(listsv)

PERL_CALLCONV void Perl_utf8_check_script(pTHX_ UV uv)
__attribute__global__;
PERL_CALLCONV void Perl_utf8_check_script(pTHX_ const U8 *s)
__attribute__global__
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_UTF8_CHECK_SCRIPT \
assert(s)

#endif
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
Expand Down Expand Up @@ -9544,6 +9547,12 @@ STATIC SV* S_swatch_get(pTHX_ SV* swash, UV start, UV span)
STATIC U8 S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp)
__attribute__warn_unused_result__;

STATIC char* S_utf8_get_script(pTHX_ const U8 *s)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_UTF8_GET_SCRIPT \
assert(s)

#endif
#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
Expand Down
1,017 changes: 1,015 additions & 2 deletions regcharclass.h

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions regen/regcharclass.pl
Original file line number Diff line number Diff line change
Expand Up @@ -1698,3 +1698,12 @@ sub make_macro {
PATWS: pattern white space
=> generic cp : safe
\p{_Perl_PatWS}
COMMON_SCRIPT: Common Script
=> UTF8 :fast
\p{Script=Common}
LATIN_SCRIPT: Latin Script
=> UTF8 :fast
\p{Script=Latin}
172 changes: 7 additions & 165 deletions toke.c
Original file line number Diff line number Diff line change
Expand Up @@ -9197,162 +9197,6 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
return res;
}

/* Latin or Common Script character: unicore/To/Sc.pl */
/* perl -nle'
/^([0-9A-F]+)\t([0-9A-F]+)\t(Common|Latin)\s/ &&
print $2-$1==1 ? " (p == 0x$1 || p == 0x$2) ||"
: " (p >= 0x$1 && p <= 0x$2) ||";
/^([0-9A-F]+)\t\t(Common|Latin)/ &&
print " (p == 0x$1) ||";' \
lib/unicore/To/Sc.pl */
PERL_STATIC_INLINE bool
isLATIN_or_COMMON_uni(UV p) {
return (p <= 0x2E9) ||
(p >= 0x2EC && p <= 0x2FF) ||
(p >= 0x2000 && p <= 0x27FF && !(p >= 0x20D0 && p <= 0x20F0) && p != 0x2126) ||
(p >= 0x2C60 && p <= 0x2C7F) ||
(p == 0x374) ||
(p == 0x37E) ||
(p == 0x385) ||
(p == 0x387) ||
(p == 0x589) ||
(p == 0x605) ||
(p == 0x61B) ||
(p == 0x61C) ||
(p == 0x60C) ||
(p == 0x61F) ||
(p == 0x640) ||
(p == 0x6DD) ||
(p == 0x61F) ||
(p == 0x964) || (p == 0x965) ||
(p == 0xE3F) ||
(p >= 0xFD5 && p <= 0xFD8) ||
(p == 0x10FB) ||
(p >= 0x16EB && p <= 0x16ED) ||
(p == 0x1735) || (p == 0x1736) ||
(p == 0x1802) || (p == 0x1803) ||
(p == 0x1805) ||
(p == 0x1CD3) ||
(p == 0x1CE1) ||
(p >= 0x1CE9 && p <= 0x1CEC) ||
(p >= 0x1CEE && p <= 0x1CF3) ||
(p == 0x1CF5) || (p == 0x1CF6) ||
(p >= 0x2C60 && p <= 0x2C7F) ||
(p >= 0x2E00 && p <= 0x2E42) ||
(p >= 0x2FF0 && p <= 0x3004) ||
(p == 0x3006) ||
(p >= 0x3008 && p <= 0x3020) ||
(p >= 0x3030 && p <= 0x3037) ||
(p >= 0x303C && p <= 0x303F) ||
(p >= 0x309B && p <= 0x309C) ||
(p == 0x30A0) ||
(p >= 0x30FB && p <= 0x30FC) ||
(p >= 0x3190 && p <= 0x319F) ||
(p >= 0x31C0 && p <= 0x31E3) ||
(p >= 0x3220 && p <= 0x325F) ||
(p >= 0x327F && p <= 0x32CF) ||
(p >= 0x3358 && p <= 0x33FF) ||
(p >= 0x4DC0 && p <= 0x4DFF) ||
(p >= 0xA700 && p <= 0xA721) ||
(p >= 0xA722 && p <= 0xA787) ||
(p >= 0xA788 && p <= 0xA78A) ||
(p >= 0xA78B && p <= 0xA7AD) ||
(p >= 0xA7B0 && p <= 0xA7B7) ||
(p >= 0xA7F7 && p <= 0xA7FF) ||
(p >= 0xA830 && p <= 0xA839) ||
(p == 0xA92E) ||
(p == 0xA9CF) ||
(p >= 0xAB30 && p <= 0xAB5A) ||
(p == 0xAB5B) ||
(p >= 0xAB5C && p <= 0xAB64) ||
(p >= 0xFB00 && p <= 0xFB06) ||
(p >= 0xFD3E && p <= 0xFD3F) ||
(p >= 0xFE10 && p <= 0xFE19) ||
(p >= 0xFE30 && p <= 0xFE52) ||
(p >= 0xFE54 && p <= 0xFE66) ||
(p >= 0xFE68 && p <= 0xFE6B) ||
(p == 0xFEFF) ||
(p >= 0xFF01 && p <= 0xFF20) ||
(p >= 0xFF21 && p <= 0xFF3A) ||
(p >= 0xFF3B && p <= 0xFF40) ||
(p >= 0xFF41 && p <= 0xFF5A) ||
(p >= 0xFF5B && p <= 0xFF65) ||
(p == 0xFF70) ||
(p >= 0xFF9E && p <= 0xFF9F) ||
(p >= 0xFFE0 && p <= 0xFFE6) ||
(p >= 0xFFE8 && p <= 0xFFEE) ||
(p >= 0xFFF9 && p <= 0xFFFD) ||
(p >= 0x10100 && p <= 0x10102) ||
(p >= 0x10107 && p <= 0x10133) ||
(p >= 0x10137 && p <= 0x1013F) ||
(p >= 0x10190 && p <= 0x1019B) ||
(p >= 0x101D0 && p <= 0x101FC) ||
(p >= 0x102E1 && p <= 0x102FB) ||
(p >= 0x1BCA0 && p <= 0x1BCA3) ||
(p >= 0x1D000 && p <= 0x1D0F5) ||
(p >= 0x1D100 && p <= 0x1D126) ||
(p >= 0x1D129 && p <= 0x1D166) ||
(p >= 0x1D16A && p <= 0x1D17A) ||
(p >= 0x1D183 && p <= 0x1D184) ||
(p >= 0x1D18C && p <= 0x1D1A9) ||
(p >= 0x1D1AE && p <= 0x1D1E8) ||
(p >= 0x1D300 && p <= 0x1D356) ||
(p >= 0x1D360 && p <= 0x1D371) ||
(p >= 0x1D400 && p <= 0x1D454) ||
(p >= 0x1D456 && p <= 0x1D49C) ||
(p >= 0x1D49E && p <= 0x1D49F) ||
(p == 0x1D4A2) ||
(p >= 0x1D4A5 && p <= 0x1D4A6) ||
(p >= 0x1D4A9 && p <= 0x1D4AC) ||
(p >= 0x1D4AE && p <= 0x1D4B9) ||
(p == 0x1D4BB) ||
(p >= 0x1D4BD && p <= 0x1D4C3) ||
(p >= 0x1D4C5 && p <= 0x1D505) ||
(p >= 0x1D507 && p <= 0x1D50A) ||
(p >= 0x1D50D && p <= 0x1D514) ||
(p >= 0x1D516 && p <= 0x1D51C) ||
(p >= 0x1D51E && p <= 0x1D539) ||
(p >= 0x1D53B && p <= 0x1D53E) ||
(p >= 0x1D540 && p <= 0x1D544) ||
(p == 0x1D546) ||
(p >= 0x1D54A && p <= 0x1D550) ||
(p >= 0x1D552 && p <= 0x1D6A5) ||
(p >= 0x1D6A8 && p <= 0x1D7CB) ||
(p >= 0x1D7CE && p <= 0x1D7FF) ||
(p >= 0x1F000 && p <= 0x1F02B) ||
(p >= 0x1F030 && p <= 0x1F093) ||
(p >= 0x1F0A0 && p <= 0x1F0AE) ||
(p >= 0x1F0B1 && p <= 0x1F0BF) ||
(p >= 0x1F0C1 && p <= 0x1F0CF) ||
(p >= 0x1F0D1 && p <= 0x1F0F5) ||
(p >= 0x1F100 && p <= 0x1F10C) ||
(p >= 0x1F110 && p <= 0x1F12E) ||
(p >= 0x1F130 && p <= 0x1F16B) ||
(p >= 0x1F170 && p <= 0x1F19A) ||
(p >= 0x1F1E6 && p <= 0x1F1FF) ||
(p >= 0x1F201 && p <= 0x1F202) ||
(p >= 0x1F210 && p <= 0x1F23A) ||
(p >= 0x1F240 && p <= 0x1F248) ||
(p == 0x1F250 || p == 0x1F251) ||
(p >= 0x1F300 && p <= 0x1F579) ||
(p >= 0x1F57B && p <= 0x1F5A3) ||
(p >= 0x1F5A5 && p <= 0x1F6D0) ||
(p >= 0x1F6E0 && p <= 0x1F6EC) ||
(p >= 0x1F6F0 && p <= 0x1F6F3) ||
(p >= 0x1F700 && p <= 0x1F773) ||
(p >= 0x1F780 && p <= 0x1F7D4) ||
(p >= 0x1F800 && p <= 0x1F80B) ||
(p >= 0x1F810 && p <= 0x1F847) ||
(p >= 0x1F850 && p <= 0x1F859) ||
(p >= 0x1F860 && p <= 0x1F887) ||
(p >= 0x1F890 && p <= 0x1F8AD) ||
(p >= 0x1F910 && p <= 0x1F918) ||
(p >= 0x1F980 && p <= 0x1F984) ||
(p == 0x1F9C0) ||
(p == 0xE0001) ||
(p >= 0xE0020 && p <= 0xE007F);
}

PERL_STATIC_INLINE void
S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
bool is_utf8, bool check_dollar) {
Expand All @@ -9366,20 +9210,18 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
* like c\N{COMBINING TILDE} would start failing, as the
* isWORDCHAR_A case below would gobble the 'c' up.
*/
STRLEN len = UTF8SKIP(*s);
char *t = *s + UTF8SKIP(*s);
const U8 *p = (U8*)*s;
STRLEN len = UTF8SKIP(p);
char *t = p + UTF8SKIP(p);
if (len > 1) {
const UV uv = utf8n_to_uvchr((U8*)*s, len, &len, UTF8_ALLOW_ANYUV);
if (UNLIKELY(!isLATIN_or_COMMON_uni(uv)))
/* get script. check if scripts>2 exist */
utf8_check_script(uv);
if (UNLIKELY(!(is_LATIN_SCRIPT_utf8(p) || is_COMMON_SCRIPT_utf8(p))))
utf8_check_script(p);
}
while (isIDCONT_utf8((U8*)t)) {
const int l = UTF8SKIP(t);
if (l>1) {
const UV uv = utf8n_to_uvchr((U8*)t, l, &len, UTF8_ALLOW_ANYUV);
if (UNLIKELY(!isLATIN_or_COMMON_uni(uv)))
utf8_check_script(uv);
if (UNLIKELY(!(is_LATIN_SCRIPT_utf8(t) || is_COMMON_SCRIPT_utf8(t))))
utf8_check_script((U8*)t);
}
t += l;
}
Expand Down
34 changes: 26 additions & 8 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -4536,9 +4536,17 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
return uvoffuni_to_utf8_flags(d, uv, flags);
}

/*
=for apidoc utf8_get_script
Returns the script property as string of the unicode character.
=cut
*/

STATIC char*
S_uvuni_get_script(UV uv) {
S_utf8_get_script(const U8 *s) {
PERL_ARGS_ASSERT_UTF8_GET_SCRIPT;
return "<unknown script>";
}

Expand All @@ -4555,16 +4563,26 @@ Common or Latin script property.
*/

void
Perl_utf8_check_script(const UV uv)
Perl_utf8_check_script(pTHX_ const U8 *s)
{
const GV* gv = gv_fetchpvs("utf8::scripts", GV_NOTQUAL, SVt_PVHV);
const HV* allowed = gv ? GvHV(gv) : NULL;
const char* script = S_uvuni_get_script(uv);
if (!allowed || HvKEYS(allowed) <= 2) { /* Common and Latin always present */
Perl_croak(aTHX_ "Invalid script %s in identifier for U+%04" UVXf, script, uv);
} else {
if (!hv_exists(allowed, script, strlen(script)))
Perl_croak(aTHX_ "Invalid script %s in identifier for U+%04" UVXf, script, uv);
const char* script = S_utf8_get_script(s);

PERL_ARGS_ASSERT_UTF8_CHECK_SCRIPT;

if (!allowed /* utf8_heavy never loaded, use utf8 'Script' never imported */
|| HvKEYS(allowed) <= 2 /* Common and Latin are always present */
|| !hv_exists(allowed, script, strlen(script)))
{
STRLEN len = strlen((char*)s);
const UV uv = utf8n_to_uvchr(s, len, &len, UTF8_ALLOW_ANYUV);
SV* tmp = newSVpvs("");
Perl_croak(aTHX_ "Invalid script %s in identifier %s for U+%04" UVXf,
script,
pv_display(tmp, (char*)s, len, 0, 60),
uv);
SvREFCNT_dec(tmp);
}
return;
}
Expand Down

0 comments on commit f986a23

Please sign in to comment.