Skip to content

Commit

Permalink
Store string encoding in PBC and change the semantics of wide char encodings in PIR string literals
Browse files Browse the repository at this point in the history

git-svn-id: https://svn.parrot.org/parrot/trunk@46822 d31e2699-5ff4-0310-a27c-f18f2fbe73fe
  • Loading branch information
NotFound committed May 20, 2010
1 parent b196145 commit bdcf0f9
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 15 deletions.
1 change: 1 addition & 0 deletions PBC_COMPAT
Expand Up @@ -27,6 +27,7 @@

# please insert tab separated entries at the top of the list

6.17 2010.05.20 NotFound store encoding of string constants
6.16 2010.05.18 plobsing move freeze/thaw adjacent to visit
6.15 2010.05.06 bacek add StringBuilder PMC
6.14 2010.05.03 coke remove popaction, pushmark, pushaction ops.
Expand Down
62 changes: 53 additions & 9 deletions compilers/imcc/pbc.c
Expand Up @@ -888,27 +888,71 @@ STRING *
IMCC_string_from_reg(PARROT_INTERP, ARGIN(const SymReg *r))
{
ASSERT_ARGS(IMCC_string_from_reg)
const char *buf = r->name;
char *buf = r->name;

if (r->type & VT_ENCODED) {
/*
* the lexer parses: foo:"string"
* get first part as charset, rest as string
*/
STRING *s;
const CHARSET *s_charset;
const ENCODING *s_encoding = NULL;
const ENCODING *src_encoding;
const char *charset;
char * const p = strchr(r->name, '"');
#define MAX_NAME 31
char charset_name[MAX_NAME + 1];
char encoding_name[MAX_NAME + 1];
char * p = strchr(r->name, '"');
char * p2 = strchr(r->name, ':');
PARROT_ASSERT(p && p[-1] == ':');

p[-1] = 0;
charset = r->name;
if (p2 < p -1) {
strncpy(encoding_name, buf, p2 - buf);
encoding_name[p2-buf] = '\0';
strncpy(charset_name, p2 +1, p - p2 - 2);
charset_name[p- p2 - 2] = '\0';
/*fprintf(stderr, "%s:%s\n", charset_name, encoding_name);*/
s_charset = Parrot_find_charset(interp, charset_name);
s_encoding = Parrot_find_encoding(interp, encoding_name);
}
else {
strncpy(charset_name, buf, p - buf - 1);
charset_name[p - buf - 1] = '\0';
/*fprintf(stderr, "%s\n", charset_name);*/
s_charset = Parrot_find_charset(interp, charset_name);
}
if (strcmp(charset_name, "unicode") == 0)
src_encoding = Parrot_utf8_encoding_ptr;
else
src_encoding = Parrot_fixed_8_encoding_ptr;
if (s_encoding == NULL)
s_encoding = src_encoding;

/* past delim */
buf = p + 1;
s = Parrot_str_unescape(interp, buf, '"', charset);

/* restore colon, as we may reuse this string */
p[-1] = ':';
if (strcmp(charset_name, "unicode") == 0 && strcmp(encoding_name, "utf8") == 0) {
/* Special case needed for backward compatibility with utf8 literals
* using \xHH\xHH byte sequences */
s = Parrot_str_unescape(interp, buf, '"', "utf8:unicode");
}
else {
p = buf;
p2 = strchr(buf, '"');
while (p2 != NULL) {
p = p2;
p2 = strchr(p + 1, '"');
}
{
STRING * aux = Parrot_str_new_init(interp, buf, p - buf,
src_encoding, s_charset, 0);
s = Parrot_str_unescape_string(interp, aux,
s_charset, s_encoding, PObj_constant_FLAG);
if (!CHARSET_VALIDATE(interp, s))
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_STRING_REPRESENTATION,
"Malformed string");
}
}
return s;
}
else if (*buf == '"') {
Expand Down
27 changes: 21 additions & 6 deletions src/packfile/pf_items.c
Expand Up @@ -1216,28 +1216,42 @@ PF_fetch_string(PARROT_INTERP, ARGIN_NULLOK(PackFile *pf), ARGIN(const opcode_t
ASSERT_ARGS(PF_fetch_string)
STRING *s;
UINTVAL flags;
UINTVAL encoding_nr;
UINTVAL charset_nr;
ENCODING *encoding;
CHARSET *charset;
size_t size;
const int wordsize = pf ? pf->header->wordsize : sizeof (opcode_t);
opcode_t flag_charset_word = PF_fetch_opcode(pf, cursor);

if (flag_charset_word == -1)
return STRINGNULL;

/* decode flags and charset */
/* decode flags, charset and encoding */
flags = (flag_charset_word & 0x1 ? PObj_constant_FLAG : 0) |
(flag_charset_word & 0x2 ? PObj_private7_FLAG : 0) ;
charset_nr = flag_charset_word >> 8;
encoding_nr = (flag_charset_word >> 16);
charset_nr = (flag_charset_word >> 8) & 0xFF;


size = (size_t)PF_fetch_opcode(pf, cursor);

TRACE_PRINTF(("PF_fetch_string(): flags=0x%04x, ", flags));
TRACE_PRINTF(("encoding_nr=%ld, ", encoding_nr));
TRACE_PRINTF(("charset_nr=%ld, ", charset_nr));
TRACE_PRINTF(("size=%ld.\n", size));

s = string_make_from_charset(interp, (const char *)*cursor,
size, charset_nr, flags);
encoding = Parrot_get_encoding(interp, encoding_nr);
charset = Parrot_get_charset(interp, charset_nr);
if (!encoding)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
"Invalid encoding number '%d' specified", encoding_nr);
if (!charset)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
"Invalid charset number '%d' specified", charset_nr);

s = Parrot_str_new_init(interp, (const char *)*cursor, size,
encoding, charset, flags);

/* print only printable characters */
TRACE_PRINTF_VAL(("PF_fetch_string(): string is '%s' at 0x%x\n",
Expand Down Expand Up @@ -1298,8 +1312,9 @@ PF_store_string(ARGOUT(opcode_t *cursor), ARGIN(const STRING *s))
* see also PF_fetch_string
*/

/* encode charset_nr and flags into the same word for a 33% savings on constant overhead */
*cursor++ = (Parrot_charset_number_of_str(NULL, s) << 8) |
/* encode charset_nr, encoding_nr and flags into the same word */
*cursor++ = (Parrot_encoding_number_of_str(NULL, s) << 16) |
(Parrot_charset_number_of_str(NULL, s) << 8) |
(PObj_get_FLAGS(s) & PObj_constant_FLAG ? 0x1 : 0x0) |
(PObj_get_FLAGS(s) & PObj_private7_FLAG ? 0x2 : 0x0) ;
*cursor++ = s->bufused;
Expand Down

0 comments on commit bdcf0f9

Please sign in to comment.