From 6f46c8b944dec7cef9b28ff152f5853fa77f627f Mon Sep 17 00:00:00 2001 From: Ivan Mikhailov Date: Thu, 28 Mar 2019 16:23:12 +0100 Subject: [PATCH] Fixed backslash escaped characters in QNames of SPARQL queries. (Bug16599) --- libsrc/Wi/json.l | 8 ++++-- libsrc/Wi/scn3.l | 36 +++++++++++++++--------- libsrc/Wi/sparql.h | 5 +++- libsrc/Wi/sparql_core.c | 25 +++++++++-------- libsrc/Wi/sparql_l.l | 62 +++++++++++++++++++++++------------------ 5 files changed, 82 insertions(+), 54 deletions(-) diff --git a/libsrc/Wi/json.l b/libsrc/Wi/json.l index f162977450..54354a0e9c 100644 --- a/libsrc/Wi/json.l +++ b/libsrc/Wi/json.l @@ -23,6 +23,7 @@ %option 8bit %option never-interactive +%option noyywrap %option nounput %{ #include @@ -39,7 +40,10 @@ int jsonyy_string_input (char *buf, int max); #define jsonyyerror(str) jsonyyerror_impl(str) struct sparp_s; /* forward */ -extern caddr_t spar_strliteral (struct sparp_s *sparp, const char *strg, int strg_is_long, int is_json); +#define SPAR_STRLITERAL_SPARQL_STRING 0 +#define SPAR_STRLITERAL_JSON_STRING 1 +#define SPAR_STRLITERAL_SPARQL_QNAME 2 +extern caddr_t spar_unescape_strliteral (struct sparp_s *sparp, const char *strg, int count_of_quotes, int mode); extern int json_line; %} @@ -72,7 +76,7 @@ HEX ([0-9A-Fa-f]) [^\\\"\n\r\t]*"\"" { BEGIN(INITIAL); - jsonyylval.box = spar_strliteral (NULL /* no sparp for JSON_LITERAL */, jsonyytext, 0, 1); + jsonyylval.box = spar_unescape_strliteral (NULL /* no sparp for JSON_LITERAL */, jsonyytext, 1, SPAR_STRLITERAL_JSON_STRING); return STRING; } diff --git a/libsrc/Wi/scn3.l b/libsrc/Wi/scn3.l index 8e5e253e6b..30f6c2d4d9 100644 --- a/libsrc/Wi/scn3.l +++ b/libsrc/Wi/scn3.l @@ -391,16 +391,21 @@ extern int yylex (YYSTYPE *yylval, yyscan_t yyscanner); S_NL (\r\n|\n|\r) SPAR_SQ_PLAIN ([^\\''\r\n]) SPAR_DQ_PLAIN ([^\\""\r\n]) -SPAR_ECHAR ([\\]([atbvnrf\\""'']|("u"{HEX}{HEX}{HEX}{HEX})|("U"{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}))) -HEX ([0-9A-Fa-f]) - -SPAR_NCCHAR1p ([A-Za-z]) -SPAR_NCCHAR1 ([A-Za-z_]) -SPAR_VARNAME ([A-Za-z0-9_]+) -SPAR_NCCHAR ([A-Za-z0-9_-]) -SPAR_NCNAME_PREFIX ({SPAR_NCCHAR1p}([A-Za-z0-9_.-]*{SPAR_NCCHAR})?) -SPAR_NCNAME ({SPAR_NCCHAR1}([A-Za-z0-9_.-]*{SPAR_NCCHAR})?) - +SPAR_UCHAR ([\\](("u"{HEX}{HEX}{HEX}{HEX})|("U"{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}))) +SPAR_ECHAR (([\\][atbvnrf\\""''])|{SPAR_UCHAR}) +HEX ([0-9A-Fa-f]) + +SPAR_VARNAME ([A-Za-z0-9_][A-Za-z0-9_\x7f-\xfe]*) + +PN_LOCAL_ESC ([\\][_~.!$&''()*+,;=/?#@%-]) +PN_LOCAL_ESC_X ({PN_LOCAL_ESC}|([%]{HEX}{HEX})) +PN_CHARS_BASE ([A-Za-z\x7f-\xfe]|{SPAR_UCHAR}) +PN_CHARS_U_09 ([A-Za-z0-9_\x7f-\xfe]|{SPAR_UCHAR}) +PN_CHARS_U_09_C_PLX ([A-Za-z0-9_\x7f-\xfe:]|{SPAR_UCHAR}|{PN_LOCAL_ESC_X}) +PN_CHARS ([A-Za-z0-9_\x7f-\xfe-]|{SPAR_UCHAR}) +PN_CHARS_C_PLX ([A-Za-z0-9_\x7f-\xfe:-]|{SPAR_UCHAR}|{PN_LOCAL_ESC_X}) +PN_PREFIX ({PN_CHARS_BASE}(([.]*{PN_CHARS})*)) +PN_LOCAL ({PN_CHARS_U_09_C_PLX}(([.]*{PN_CHARS_C_PLX})*)) %% @@ -873,7 +878,7 @@ EXEC[ \t]+SQL { sqlp_bin_op_serial = 0; BEGIN SQL; } t_set_push (&global_scs->scs_scn3c.namespaces, NULL); RETURN_WS(WS_PRAGMA_PREFIX_1); } -({SPAR_NCNAME_PREFIX}?)":" { +({PN_PREFIX}?)":" { BEGIN(PRAGMA_PREFIX_2); global_scs->scs_scn3c.namespaces->data = t_box_dv_uname_nchars (yytext, strlen (yytext) - 1); RETURN_WS(WS_PRAGMA_PREFIX_2); } @@ -882,7 +887,7 @@ EXEC[ \t]+SQL { sqlp_bin_op_serial = 0; BEGIN SQL; } const char *langl = strchr (yytext, '<'); BEGIN(SQL); global_scs->scs_scn3c.namespaces->next->data = t_box_dv_uname_nchars (langl + 1, (yytext + yyleng - 2) - langl); - RETURN_WS(WS_PRAGMA_PREFIX_3); } + RETURN_WS(WS_PRAGMA_PREFIX_3); } . { scn3yyerror ("Ill formed namespace prefix in #pragma prefix"); } <> { scn3yyerror ("Unexpected end of text in #pragma prefix"); } @@ -1016,6 +1021,7 @@ EXEC[ \t]+SQL { sqlp_bin_op_serial = 0; BEGIN SQL; } "}" TOKCLOSE_SKIP ('}','}'); "[" TOKOPEN_SKIP ('[',']'); "]" TOKCLOSE_SKIP (']',']'); + "<"([^<>"{}|^`\001-\040\\])*">" { TOK_SKIP; } ([""][^""\\\n]*[""])|([''][^''\\\n]*['']) { TOK_SKIP; } @@ -1048,7 +1054,11 @@ EXEC[ \t]+SQL { sqlp_bin_op_serial = 0; BEGIN SQL; } <> { scn3yyerror ("Unterminated SPARQL short single-quoted string"); } <> { scn3yyerror ("Unterminated SPARQL short double-quoted string"); } -[^#''""\\\n\r(){}\[\];<>]+ TOK_SKIP; +({PN_PREFIX}?)":"{PN_LOCAL} TOK_SKIP; +({PN_PREFIX}?)":" TOK_SKIP; +"_:"{PN_LOCAL} TOK_SKIP; +[^#''""\\\n\r(){}\[\];:<>]+ TOK_SKIP; + <> { #ifndef SCN3SPLIT scn3_include_fragment_t *outer = global_scs->scs_scn3c.include_stack + global_scs->scs_scn3c.include_depth; diff --git a/libsrc/Wi/sparql.h b/libsrc/Wi/sparql.h index 1d6c90502b..dbc6491779 100644 --- a/libsrc/Wi/sparql.h +++ b/libsrc/Wi/sparql.h @@ -931,7 +931,10 @@ extern caddr_t sparp_graph_sec_id_to_iri_nosignal (sparp_t *sparp, iri_id_t iid) extern caddr_t sparp_iri_to_id_nosignal (sparp_t *sparp, ccaddr_t qname); /*!< returns t_boxed IRI_ID or plain NULL pointer */ extern ccaddr_t sparp_id_to_iri (sparp_t *sparp, iri_id_t iid); /*!< returns t_boxed string or plain NULL pointer */ -extern caddr_t spar_strliteral (sparp_t *sparp, const char *sparyytext, int strg_is_long, int is_json); +#define SPAR_STRLITERAL_SPARQL_STRING 0 +#define SPAR_STRLITERAL_JSON_STRING 1 +#define SPAR_STRLITERAL_SPARQL_QNAME 2 +extern caddr_t spar_unescape_strliteral (sparp_t *sparp, const char *sparyytext, int count_of_quotes, int mode); extern caddr_t spar_mkid (sparp_t * sparp, const char *prefix); extern void spar_change_sign (caddr_t *lit_ptr); diff --git a/libsrc/Wi/sparql_core.c b/libsrc/Wi/sparql_core.c index 7b00003e52..90abfba6ba 100644 --- a/libsrc/Wi/sparql_core.c +++ b/libsrc/Wi/sparql_core.c @@ -827,15 +827,15 @@ sparp_id_to_iri (sparp_t *sparp, iri_id_t iid) return NULL; /* to keep compiler happy */ } -caddr_t spar_strliteral (sparp_t *sparp, const char *strg, int strg_is_long, int is_json) +caddr_t spar_unescape_strliteral (sparp_t *sparp, const char *strg, int count_of_quotes, int mode) { caddr_t tmp_buf; caddr_t res; const char *err_msg; const char *src_tail, *src_end; char *tgt_tail; - src_tail = strg + (strg_is_long ? 3 : 1); - src_end = strg + strlen (strg) - (strg_is_long ? 3 : 1); + src_tail = strg + count_of_quotes; + src_end = strg + strlen (strg) - count_of_quotes; tgt_tail = tmp_buf = dk_alloc_box ((src_end - src_tail) + 1, DV_SHORT_STRING); while (src_tail < src_end) { @@ -843,16 +843,16 @@ caddr_t spar_strliteral (sparp_t *sparp, const char *strg, int strg_is_long, int { case '\\': { - const char *bs_src = "abfnrtv/\\\'\"uU"; - const char *bs_trans = "\a\b\f\n\r\t\v/\\\'\"\0\0"; - const char *bs_lengths = "\2\2\2\2\2\2\2\2\2\2\2\6\012"; + const char *bs_src = ((SPAR_STRLITERAL_SPARQL_QNAME == mode) ? "_~.-!$&()*+,:=/?#@%\'uU" : "abfnrtv/\\\'\"uU" ); + const char *bs_trans = ((SPAR_STRLITERAL_SPARQL_QNAME == mode) ? "_~.-!$&()*+,:=/?#@%\'\0\0" : "\a\b\f\n\r\t\v/\\\'\"\0\0" ); + const char *bs_lengths = ((SPAR_STRLITERAL_SPARQL_QNAME == mode) ? "\2\2\2\2\2\2\2\2\2\2\2\2\2\2\2\2\2\2\2\2\6\012" : "\2\2\2\2\2\2\2\2\2\2\2\6\012" ); const char *hit = strchr (bs_src, src_tail[1]); char bs_len, bs_tran; const char *nextchr; if (NULL == hit) { - err_msg = "Unsupported escape sequence after '\'"; - goto err; + err_msg = "Unsupported escape sequence after '\'"; + goto err; } bs_len = bs_lengths [hit - bs_src]; bs_tran = bs_trans [hit - bs_src]; @@ -904,7 +904,7 @@ caddr_t spar_strliteral (sparp_t *sparp, const char *strg, int strg_is_long, int goto err; } } - else if (is_json && (6 == bs_len) && (acc >= 0xD800) && (acc <= 0xDFFF)) + else if ((SPAR_STRLITERAL_JSON_STRING == mode) && (6 == bs_len) && (acc >= 0xD800) && (acc <= 0xDFFF)) { if (acc >= 0xDC00) { @@ -931,14 +931,17 @@ caddr_t spar_strliteral (sparp_t *sparp, const char *strg, int strg_is_long, int default: (tgt_tail++)[0] = (src_tail++)[0]; } } - res = t_box_dv_short_nchars (tmp_buf, tgt_tail - tmp_buf); + if (SPAR_STRLITERAL_SPARQL_QNAME == mode) + res = t_box_dv_uname_nchars (tmp_buf, tgt_tail - tmp_buf); + else + res = t_box_dv_short_nchars (tmp_buf, tgt_tail - tmp_buf); box_flags (res) = BF_UTF8; dk_free_box (tmp_buf); return res; err: dk_free_box (tmp_buf); - if (is_json) + if (SPAR_STRLITERAL_JSON_STRING == mode) jsonyyerror_impl (err_msg); else sparyyerror_impl (sparp, NULL, err_msg); diff --git a/libsrc/Wi/sparql_l.l b/libsrc/Wi/sparql_l.l index b591c4e62c..2547687efc 100644 --- a/libsrc/Wi/sparql_l.l +++ b/libsrc/Wi/sparql_l.l @@ -188,8 +188,11 @@ void sparyyerror_if_long_qname (caddr_t box, const char *lex_type_descr, struct sparyyerror_if_long_qname (yylval->box, lex_type_descr, yyg); \ return (name); -#define TOKBOX_Q(n,name,lex_type_descr) { \ - yylval->box = t_box_dv_uname_string (yytext+(n)); \ +#define TOKBOX_Q_ESC(name,lex_type_descr) { \ + if (strchr (yytext, '\\')) \ + yylval->box = spar_unescape_strliteral (yyextra, yytext, 0, SPAR_STRLITERAL_SPARQL_QNAME); \ + else \ + yylval->box = t_box_dv_uname_string (yytext); \ TOKBOX_Q_FINAL(name,lex_type_descr) } #define TOKBOX_Q2(n1,n2,name,lex_type_descr) { \ @@ -216,7 +219,7 @@ sparyyalloc (yy_size_t size, yyscan_t yyscanner) } void * -sparyyrealloc (void * ptr, yy_size_t sz , yyscan_t yyscanner) +sparyyrealloc (void * ptr, yy_size_t sz, yyscan_t yyscanner) { int old_sz = ((NULL == ptr) ? 0 : box_length (ptr)); if (old_sz < sz) @@ -303,26 +306,31 @@ X ([Xx]) Y ([Yy]) Z ([Zz]) -INTEGER_LITERAL ([0-9]+) -DECIMAL_LITERAL (([0-9]+"."[0-9]*)|("."[0-9]+)) -DOUBLE_LITERAL (({INTEGER_LITERAL}|{DECIMAL_LITERAL})[eE][+-]?[0-9]+) - -SPAR_SQ_PLAIN ([^\\''\r\n]) -SPAR_DQ_PLAIN ([^\\""\r\n]) -SPAR_ECHAR ([\\]([atbvnrf\\""'']|("u"{HEX}{HEX}{HEX}{HEX})|("U"{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}))) -S_NL ((\r\n)|(\n\r)|\n|\r) -HEX ([0-9A-Fa-f]) - -PN_CHARS_BASE ([A-Za-z\x7f-\xfe]) -PN_CHARS_U_09 ([A-Za-z0-9_\x7f-\xfe]) -PN_CHARS ([A-Za-z0-9_\x7f-\xfe-]) -PN_PREFIX ({PN_CHARS_BASE}(([.]*{PN_CHARS})*)) -PN_LOCAL ({PN_CHARS_U_09}(([.]*{PN_CHARS})*)) -SPAR_VARNAME ([A-Za-z0-9_][A-Za-z0-9_\x7f-\xfe]*) +INTEGER_LITERAL ([0-9]+) +DECIMAL_LITERAL (([0-9]+"."[0-9]*)|("."[0-9]+)) +DOUBLE_LITERAL (({INTEGER_LITERAL}|{DECIMAL_LITERAL})[eE][+-]?[0-9]+) + +SPAR_SQ_PLAIN ([^\\''\r\n]) +SPAR_DQ_PLAIN ([^\\""\r\n]) +SPAR_UCHAR ([\\](("u"{HEX}{HEX}{HEX}{HEX})|("U"{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}{HEX}))) +SPAR_ECHAR (([\\][atbvnrf\\""''])|{SPAR_UCHAR}) +S_NL ((\r\n)|(\n\r)|\n|\r) +HEX ([0-9A-Fa-f]) + +PN_LOCAL_ESC ([\\][_~.!$&''()*+,;=/?#@%-]) +PN_LOCAL_ESC_X ({PN_LOCAL_ESC}|([%]{HEX}{HEX})) +PN_CHARS_BASE ([A-Za-z\x7f-\xfe]|{SPAR_UCHAR}) +PN_CHARS_U_09 ([A-Za-z0-9_\x7f-\xfe]|{SPAR_UCHAR}) +PN_CHARS_U_09_C_PLX ([A-Za-z0-9_\x7f-\xfe:]|{SPAR_UCHAR}|{PN_LOCAL_ESC_X}) +PN_CHARS ([A-Za-z0-9_\x7f-\xfe-]|{SPAR_UCHAR}) +PN_CHARS_C_PLX ([A-Za-z0-9_\x7f-\xfe:-]|{SPAR_UCHAR}|{PN_LOCAL_ESC_X}) +PN_PREFIX ({PN_CHARS_BASE}(([.]*{PN_CHARS})*)) +PN_LOCAL ({PN_CHARS_U_09_C_PLX}(([.]*{PN_CHARS_C_PLX})*)) +SPAR_VARNAME ([A-Za-z0-9_][A-Za-z0-9_\x7f-\xfe]*) SPAR_PLAIN_SQLNAME ([A-Za-z_][A-Za-z0-9_]*) SPAR_DQ_SQLNAME ([""][^""\\\r\n]*[""]) SPAR_SQLNAME (([A-Za-z_][A-Za-z0-9_]*)|([""][^""\\\r\n]*[""])) -SPAR_PARAMNAME (([A-Z]+"::")?(({SPAR_SQLNAME}("."{SPAR_SQLNAME})?)|(":"{SPAR_SQLNAME})|(":"[0-9]+))) +SPAR_PARAMNAME (([A-Z]+"::")?(({SPAR_SQLNAME}("."{SPAR_SQLNAME})?)|(":"{SPAR_SQLNAME})|(":"[0-9]+))) %% @@ -372,9 +380,9 @@ SPAR_PARAMNAME (([A-Z]+"::")?(({SPAR_SQLNAME}("."{SPAR_SQLNAME})?)|(":"{SPAR_SQL return Q_IRI_REF; } -({PN_PREFIX}?)":"{PN_LOCAL} { TOKBOX_Q(0,QNAME,"qualified URI"); } -({PN_PREFIX}?)":" { TOKBOX_Q(0,QNAME_NS,"namespace"); } -"_:"{PN_LOCAL} { TOKBOX_Q(0,BLANK_NODE_LABEL,"blank node label"); } +({PN_PREFIX}?)":"{PN_LOCAL} { TOKBOX_Q_ESC(QNAME,"qualified URI"); } +({PN_PREFIX}?)":" { TOKBOX_Q_ESC(QNAME_NS,"namespace"); } +"_:"{PN_LOCAL} { TOKBOX_Q_ESC(BLANK_NODE_LABEL,"blank node label"); } [?$]{SPAR_VARNAME} { yylval->box = t_box_dv_uname_nchars (yytext + 1, yyleng - 1); @@ -434,8 +442,8 @@ SPAR_PARAMNAME (([A-Z]+"::")?(({SPAR_SQLNAME}("."{SPAR_SQLNAME})?)|(":"{SPAR_SQL [''][''][''] { yymore(); SET_INNER_BEGIN(SPARQL_SSSQ); BEGIN_INNER; } [""][""][""] { yymore(); SET_INNER_BEGIN(SPARQL_DDDQ); BEGIN_INNER; } -[''][''][''] { yylval->box = spar_strliteral (yyextra, yytext, 1, 0); BEGIN_OUTER; return SPARQL_STRING; } -[""][""][""] { yylval->box = spar_strliteral (yyextra, yytext, 1, 0); BEGIN_OUTER; return SPARQL_STRING; } +[''][''][''] { yylval->box = spar_unescape_strliteral (yyextra, yytext, 3, SPAR_STRLITERAL_SPARQL_STRING); BEGIN_OUTER; return SPARQL_STRING; } +[""][""][""] { yylval->box = spar_unescape_strliteral (yyextra, yytext, 3, SPAR_STRLITERAL_SPARQL_STRING); BEGIN_OUTER; return SPARQL_STRING; } (([''](['']?))?{S_NL}) { yyextra->sparp_lexlineno++; yymore(); } (([""]([""]?))?{S_NL}) { yyextra->sparp_lexlineno++; yymore(); } ((([''](['']?))?({SPAR_SQ_PLAIN}|{SPAR_ECHAR}))+) { yymore(); } @@ -450,8 +458,8 @@ SPAR_PARAMNAME (([A-Z]+"::")?(({SPAR_SQLNAME}("."{SPAR_SQLNAME})?)|(":"{SPAR_SQL [''] { yymore(); SET_INNER_BEGIN(SPARQL_SQ); BEGIN_INNER; } [""] { yymore(); SET_INNER_BEGIN(SPARQL_DQ); BEGIN_INNER; } -[''] { yylval->box = spar_strliteral (yyextra, yytext, 0, 0); BEGIN_OUTER; return SPARQL_STRING; } -[""] { yylval->box = spar_strliteral (yyextra, yytext, 0, 0); BEGIN_OUTER; return SPARQL_STRING; } +[''] { yylval->box = spar_unescape_strliteral (yyextra, yytext, 1, SPAR_STRLITERAL_SPARQL_STRING); BEGIN_OUTER; return SPARQL_STRING; } +[""] { yylval->box = spar_unescape_strliteral (yyextra, yytext, 1, SPAR_STRLITERAL_SPARQL_STRING); BEGIN_OUTER; return SPARQL_STRING; } {S_NL} { sparyyerror ("End-of-line in a short single-quoted string"); yymore(); } {S_NL} { sparyyerror ("End-of-line in a short double-quoted string"); yymore(); } (({SPAR_SQ_PLAIN}|{SPAR_ECHAR})+) { yymore(); }