Skip to content

Commit

Permalink
[GH #1075] find_codepoint namealias
Browse files Browse the repository at this point in the history
add a tools/build/namealias_c.pl to manually clean up the generated
gperf C file for inline, C++ compat and codingstd_tests.

add Parrot_str_internal_find_codepoint to src/string/encoding.c
(not namealias.c because headerizer doesn't like namealias_c.in)

use that also for the non-ICU codepath, so that at least some names are
found. This can be used later to add all names to namealias for non-ICU builds.

remove inline from namealias. This doesn't need to be fast, just small.

fix up some weird codingstd tests for namealias. podchecker passes, but Pod::Simple does not.
c_parens.t misparses namealias_c.in as functions

make headerizer and bootstrap-ops.
  • Loading branch information
Reini Urban committed Jun 14, 2014
1 parent 5cb6219 commit 7bc3a07
Show file tree
Hide file tree
Showing 14 changed files with 268 additions and 212 deletions.
4 changes: 4 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
Thereby removed the vtable method calling overhead of _orig into a
wrapper with the mandatory write barrier. This was the first part
of Chirag's GSOC project. [GH #1069]
+ find_codepoint: Added name aliases for control character names which
disappeared with ICU 5.2, and added those names to non-ICU builds also.
Improved ICU search for u_charFromName() to check all UCharNameChoices,
not only U_EXTENDED_CHAR_NAME. [GH #1075, roast #43]
- Build
+ Fixed wrong ICU header probes on multi-arch systems (debian) [GH #1014]
+ Fix opengl on bsd which does not have __APPLE__ defined as 0 [GH #1070]
Expand Down
1 change: 1 addition & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -2161,6 +2161,7 @@ tools/build/c2str.pl []
tools/build/fixup_gen_file.pl []
tools/build/gen_version.pl []
tools/build/h2inc.pl []
tools/build/namealias_c.pl []
tools/build/ops2c.pl [devel]
tools/build/parrot_config_c.pl []
tools/build/pbcversion_h.pl []
Expand Down
11 changes: 6 additions & 5 deletions config/gen/makefiles/root.in
Original file line number Diff line number Diff line change
Expand Up @@ -2036,20 +2036,21 @@ src/string/namealias$(O) : \

## SUFFIX OVERRIDE
src/string/sprintf$(O) : $(PARROT_H_HEADERS) src/string/sprintf.c src/string/spf_private.h
$(CC) $(CFLAGS) @optimize::src/string/sprintf.c@ @ccwarn::src/string/sprintf.c@ @cc_shared@ \
$(CC) $(CFLAGS) @optimize::src/string/sprintf.c@ @ccwarn::src/string/sprintf.c@ \
-I$(@D)/. @cc_o_out@$@ -c src/string/sprintf.c

## SUFFIX OVERRIDE
src/string/spf_render$(O) : $(PARROT_H_HEADERS) src/string/spf_render.str \
src/string/spf_render.c src/string/spf_private.h
$(CC) $(CFLAGS) @optimize::src/string/spf_render.c@ @ccwarn::src/string/spf_render.c@ \
@cc_shared@ -I$(@D)/. @cc_o_out@$@ -c src/string/spf_render.c
-I$(@D)/. @cc_o_out@$@ -c src/string/spf_render.c

src/string/spf_vtable$(O) : $(PARROT_H_HEADERS) src/string/spf_vtable.str \
src/string/spf_vtable.c src/string/spf_private.h

src/string/encoding$(O) : \
$(PARROT_H_HEADERS) \
$(INC_DIR)/namealias.h \
src/string/encoding.c \
src/string/encoding.str

Expand Down Expand Up @@ -3232,9 +3233,9 @@ src/extra_nci_thunks$(O) : $(PARROT_H_HEADERS) src/extra_nci_thunks.c \
$(INC_PMC_DIR)/pmc_nci.h
$(CC) $(CFLAGS) @optimize::src/extra_nci_thunks.c@ @ccwarn::src/extra_nci_thunks.c@ @cc_shared@ -I$(@D)/. @cc_o_out@$@ -c src/extra_nci_thunks.c

# UnicodeData.txt updates must be done manual
bootstrap-namealias : src/string/namealias_c.in
gperf --output-file=src/string/namealias.c src/string/namealias_c.in
# UnicodeData.txt updates must be done manually, or with namealias_c.pl --create
bootstrap-namealias src/string/namealias.c : src/string/namealias_c.in $(BUILD_TOOLS_DIR)/namealias_c.pl
$(PERL) $(BUILD_TOOLS_DIR)/namealias_c.pl

# emacs etags
# this needs exuberant-ctags
Expand Down
13 changes: 12 additions & 1 deletion include/parrot/encoding.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* encoding.h
* Copyright (C) 2004-2007, Parrot Foundation.
* Copyright (C) 2004-2007,2014, Parrot Foundation.
* Overview:
* This is the header for the generic encoding functions
* Data Structure and Algorithms:
Expand Down Expand Up @@ -121,6 +121,13 @@ INTVAL Parrot_register_encoding(PARROT_INTERP, ARGIN(STR_VTABLE *encoding))
void Parrot_deinit_encodings(PARROT_INTERP)
__attribute__nonnull__(1);

PARROT_PURE_FUNCTION
PARROT_WARN_UNUSED_RESULT
INTVAL Parrot_str_internal_find_codepoint(PARROT_INTERP,
ARGIN(const STRING *name))
__attribute__nonnull__(1)
__attribute__nonnull__(2);

void Parrot_str_internal_register_encoding_names(PARROT_INTERP)
__attribute__nonnull__(1);

Expand Down Expand Up @@ -153,6 +160,10 @@ void Parrot_str_internal_register_encoding_names(PARROT_INTERP)
, PARROT_ASSERT_ARG(encoding))
#define ASSERT_ARGS_Parrot_deinit_encodings __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_Parrot_str_internal_find_codepoint \
__attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(name))
#define ASSERT_ARGS_Parrot_str_internal_register_encoding_names \
__attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
Expand Down
17 changes: 11 additions & 6 deletions include/parrot/namealias.h
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
/* namealias.h
Copyright (C) 2014, Parrot Foundation.
Copyright (C) 2014, Parrot Foundation.
gperf generated icu control character namealias lookup
gperf generated icu control character namealias lookup
=over
=item C<struct Parrot_namealias>
Generated hash table with name and key.
Generated hash table with name and codepoint.
=back
=cut
*/

#ifndef PARROT_NAMEALIAS_H_GUARD
#define PARROT_NAMEALIAS_H_GUARD

#include "parrot/config.h"
#include "parrot/parrot.h"

#ifdef PARROT_IN_CORE

struct Parrot_namealias { int name; const int key; };
/* HEADERIZER STOP */

struct Parrot_namealias { int name; const INTVAL codepoint; };

PARROT_INLINE
const struct Parrot_namealias *
Parrot_namealias_lookup(register const char *str, register unsigned int len);

Expand Down
1 change: 1 addition & 0 deletions lib/Parrot/Test/Pod.pm
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ our %second_analysis_subs = (
| t/configure/testlib/bdefectivefoobar
| compilers/opsc
| tools/dev/mk_language_shell\.pl
| src/string/namealias_c\.in
| lib/IO/CaptureOutput\.pm
}x
) {
Expand Down
71 changes: 10 additions & 61 deletions src/ops/core_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ extern op_lib_t core_op_lib;
#include "pmc/pmc_parrotlibrary.h"



/* Signed shift operator that is compatible with PMC shifts. This is
* guaranteed to produce the same result as bitwise_left_shift_internal modulo
* word size, ignoring the fact that Parrot integers are always signed. This
Expand All @@ -64,10 +65,10 @@ extern op_lib_t core_op_lib;

#include "../io/io_private.h"


#include "parrot/encoding.h"
#include "parrot/namealias.h"
#if PARROT_HAS_ICU
# include <unicode/uchar.h>
# include "parrot/namealias.h"
#endif


Expand Down Expand Up @@ -13795,15 +13796,17 @@ Parrot_local_return_p(opcode_t *cur_opcode, PARROT_INTERP) {

opcode_t *
Parrot_jump_i(opcode_t *cur_opcode, PARROT_INTERP) {
opcode_t * const loc = INTVAL2PTR(opcode_t *, IREG(1));
opcode_t * const loc = INTVAL2PTR(opcode_t *, IREG(1));

UNUSED(interp);
UNUSED(cur_opcode);
return (opcode_t *)loc;
}

opcode_t *
Parrot_jump_ic(opcode_t *cur_opcode, PARROT_INTERP) {
opcode_t * const loc = INTVAL2PTR(opcode_t *, ICONST(1));
opcode_t * const loc = INTVAL2PTR(opcode_t *, ICONST(1));

UNUSED(interp);
UNUSED(cur_opcode);
return (opcode_t *)loc;
Expand Down Expand Up @@ -22113,68 +22116,13 @@ Parrot_compose_s_sc(opcode_t *cur_opcode, PARROT_INTERP) {

opcode_t *
Parrot_find_codepoint_i_s(opcode_t *cur_opcode, PARROT_INTERP) {
#if PARROT_HAS_ICU
UErrorCode err = U_ZERO_ERROR;
char * const cstr = Parrot_str_to_cstring(interp, SREG(2));
/* At first search for proper names. This will not find name aliases for
control characters */
UCharNameChoice nameChoices[] = {U_UNICODE_CHAR_NAME, U_EXTENDED_CHAR_NAME,
U_CHAR_NAME_ALIAS, U_UNICODE_10_CHAR_NAME};
unsigned int i = 0;
IREG(1) = -1;
for (; i < (sizeof(nameChoices)/sizeof(nameChoices[0])); i++) {
UChar32 codepoint = u_charFromName(nameChoices[i], cstr, &err);
if (U_SUCCESS(err)) {
IREG(1) = (INTVAL) codepoint;
goto found;
}
}
{
const struct Parrot_namealias *namealias
= Parrot_namealias_lookup(cstr, STRING_byte_length(SREG(2)));
if (namealias)
IREG(1) = (INTVAL) namealias->key;
}
found:
Parrot_str_free_cstring(cstr);
#else
opcode_t * const dest = Parrot_ex_throw_from_op_args(interp, cur_opcode + 3, EXCEPTION_LIBRARY_ERROR, "no ICU lib loaded");
return (opcode_t *)dest;
#endif
IREG(1) = Parrot_str_internal_find_codepoint(interp, SREG(2));
return cur_opcode + 3;
}

opcode_t *
Parrot_find_codepoint_i_sc(opcode_t *cur_opcode, PARROT_INTERP) {
#if PARROT_HAS_ICU
UErrorCode err = U_ZERO_ERROR;
char * const cstr = Parrot_str_to_cstring(interp, SCONST(2));
UCharNameChoice nameChoices[] = {U_UNICODE_CHAR_NAME, U_EXTENDED_CHAR_NAME,
U_CHAR_NAME_ALIAS, U_UNICODE_10_CHAR_NAME};
unsigned int i = 0;
IREG(1) = -1;
for (; i < (sizeof(nameChoices)/sizeof(nameChoices[0])); i++) {
UChar32 codepoint = u_charFromName(nameChoices[i], cstr, &err);
if (U_SUCCESS(err)) {
IREG(1) = (INTVAL) codepoint;
Parrot_str_free_cstring(cstr);
return cur_opcode + 3;
}
}
{
const struct Parrot_namealias *namealias
= Parrot_namealias_lookup(cstr, STRING_byte_length(SREG(2)));
if (namealias)
IREG(1) = (INTVAL) namealias->key;
}
found:
Parrot_str_free_cstring(cstr);
#else
opcode_t * const dest = Parrot_ex_throw_from_op_args(interp, cur_opcode + 3, EXCEPTION_LIBRARY_ERROR, "no ICU lib loaded");
return (opcode_t *)dest;

#endif
;
IREG(1) = Parrot_str_internal_find_codepoint(interp, SCONST(2));
return cur_opcode + 3;
}

Expand Down Expand Up @@ -24652,6 +24600,7 @@ Parrot_terminate(opcode_t *cur_opcode, PARROT_INTERP) {
UNUSED(interp);
UNUSED(cur_opcode);
return (opcode_t *)0;
return cur_opcode + 1;
}


Expand Down
38 changes: 6 additions & 32 deletions src/ops/string.ops
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

BEGIN_OPS_PREAMBLE

#include "parrot/encoding.h"
#include "parrot/namealias.h"
#if PARROT_HAS_ICU
# include <unicode/uchar.h>
# include "parrot/namealias.h"
#endif

END_OPS_PREAMBLE
Expand Down Expand Up @@ -576,41 +577,14 @@ op compose(out STR, in STR) {
=item B<find_codepoint>(out INT, in STR)

Set $1 to the codepoint with the name given in $2, or -1 if there is none.
Requires ICU lib, otherwise exception is thrown.

With ICU many more name aliases are found, but without currently only
for control characters.

=cut

op find_codepoint(out INT, in STR) {
#if PARROT_HAS_ICU
UErrorCode err = U_ZERO_ERROR;
char * const cstr = Parrot_str_to_cstring(interp, $2);
/* At first search for proper names. This will not find name aliases for
control characters */
UCharNameChoice nameChoices[] = {U_UNICODE_CHAR_NAME, U_EXTENDED_CHAR_NAME,
U_CHAR_NAME_ALIAS, U_UNICODE_10_CHAR_NAME};
unsigned int i = 0;
$1 = -1;
for (; i < (sizeof (nameChoices) / sizeof (nameChoices[0])); i++) {
UChar32 codepoint = u_charFromName(nameChoices[i], cstr, &err);
if (U_SUCCESS(err)) {
$1 = (INTVAL) codepoint;
goto found;
}
}
{
const struct Parrot_namealias *namealias
= Parrot_namealias_lookup(cstr, STRING_byte_length($2));
if (namealias)
$1 = (INTVAL) namealias->key;
}
found:
Parrot_str_free_cstring(cstr);
goto ADDRESS(dest);
#else
opcode_t * const dest = Parrot_ex_throw_from_op_args(interp, expr NEXT(),
EXCEPTION_LIBRARY_ERROR, "no ICU lib loaded");
goto ADDRESS(dest);
#endif
$1 = Parrot_str_internal_find_codepoint(INTERP, $2);
}

=back
Expand Down
51 changes: 51 additions & 0 deletions src/string/encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ These are parrot's generic encoding handling functions
*/

#include "parrot/encoding.h"
#include "parrot/namealias.h"
#if PARROT_HAS_ICU
# include <unicode/uchar.h>
#endif
#include "encoding.str"

STR_VTABLE *Parrot_default_encoding_ptr = NULL;
Expand Down Expand Up @@ -499,6 +503,53 @@ Parrot_default_encoding(SHIM_INTERP)

/*

=item C<INTVAL Parrot_str_internal_find_codepoint(PARROT_INTERP, const STRING
*name)>

Helper function for string.ops in the ICU and non-ICU variant.

Returns the codepoint of the character called C<name>, or -1 if no such
name is found.

At first search for ICU names, trying every C<UCharNameChoice> in turn.
This will not find name aliases for control characters starting with ICU 5.2.
U_CHAR_NAME_ALIAS started with ICU 4.4,
U_UNICODE_10_CHAR_NAME (the "old name" like "LINE FEED") was deprecated with ICU 4.9,
but U_CHAR_NAME_CHOICE_COUNT is stable since 2.0.

If ICU is not compiled in, or ICU does not know the name, the generated
namealias table is consulted via C<Parrot_namealias_lookup>.

=cut

*/

PARROT_PURE_FUNCTION
PARROT_WARN_UNUSED_RESULT
INTVAL
Parrot_str_internal_find_codepoint(PARROT_INTERP, ARGIN(const STRING *name))
{
    ASSERT_ARGS(Parrot_str_internal_find_codepoint)
    INTVAL       retval = -1;
    char * const cstr   = Parrot_str_to_cstring(interp, name);
#if PARROT_HAS_ICU
    unsigned int i;

    /* retval stays negative until a lookup succeeds; a successful lookup
       yields a valid (non-negative) codepoint. */
    for (i = 0; retval < 0 && i < U_CHAR_NAME_CHOICE_COUNT; i++) {
        /* ICU functions return immediately when the error code passed in
           already signals a failure, so every lookup has to start with a
           fresh U_ZERO_ERROR. Otherwise only the first name choice would
           ever be searched. */
        UErrorCode    err       = U_ZERO_ERROR;
        const UChar32 codepoint = u_charFromName((UCharNameChoice)i, cstr, &err);

        if (U_SUCCESS(err)) {
            retval = (INTVAL) codepoint;
        }
    }
#endif

    /* Fall back to the generated name alias table. Written as a plain
       condition instead of a goto label, so that non-ICU builds do not
       end up with an unused label. */
    if (retval < 0) {
        /* NOTE(review): the length handed to the lookup is the byte length
           of the STRING, not the length of cstr; presumably both agree for
           the names in the table - confirm for multi-byte encodings. */
        const struct Parrot_namealias * const namealias
            = Parrot_namealias_lookup(cstr, STRING_byte_length(name));

        if (namealias) {
            retval = (INTVAL) namealias->codepoint;
        }
    }

    Parrot_str_free_cstring(cstr);
    return retval;
}

/*
=back
*/
Expand Down
Loading

0 comments on commit 7bc3a07

Please sign in to comment.