Skip to content

Commit

Permalink
[GH #848] Support unicode strings for bitwise ops
Browse files Browse the repository at this point in the history
bands, bors, bxors and bnots now understand more multi-byte strings.

Old docs:
Performs a bitwise C<OR> on two Parrot strings, performing type and encoding
conversions if necessary.  Returns the result as a new string.

Added now:
Strings with the same encoding are combined bitwise as they are.  Strings with
different two-byte encodings are first converted down to latin1; if that fails
they are upgraded to ucs2, and if that also fails, to ucs4.
Conversion between 2-byte and 4-byte encodings is not supported, so e.g. utf8->utf16 fails.
Conversion from a 2-byte to a 1-byte encoding is attempted, however.
  • Loading branch information
Reini Urban committed Sep 28, 2012
1 parent 04f1986 commit 6c41a21
Show file tree
Hide file tree
Showing 4 changed files with 370 additions and 77 deletions.
25 changes: 25 additions & 0 deletions include/parrot/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,22 @@ void Parrot_deinit_encodings(PARROT_INTERP)
void Parrot_str_internal_register_encoding_names(PARROT_INTERP)
__attribute__nonnull__(1);

PARROT_CAN_RETURN_NULL
STRING * str_internal_trans_to_fixed8(PARROT_INTERP,
ARGIN(const STRING *src))
__attribute__nonnull__(1)
__attribute__nonnull__(2);

PARROT_CAN_RETURN_NULL
STRING * str_internal_trans_to_ucs2(PARROT_INTERP, ARGIN(const STRING *src))
__attribute__nonnull__(1)
__attribute__nonnull__(2);

PARROT_CANNOT_RETURN_NULL
STRING * str_internal_trans_to_ucs4(PARROT_INTERP, ARGIN(const STRING *src))
__attribute__nonnull__(1)
__attribute__nonnull__(2);

#define ASSERT_ARGS_Parrot_default_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
#define ASSERT_ARGS_Parrot_encoding_c_name __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
#define ASSERT_ARGS_Parrot_encoding_name __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
Expand Down Expand Up @@ -158,6 +174,15 @@ void Parrot_str_internal_register_encoding_names(PARROT_INTERP)
#define ASSERT_ARGS_Parrot_str_internal_register_encoding_names \
__attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_str_internal_trans_to_fixed8 __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_str_internal_trans_to_ucs2 __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_str_internal_trans_to_ucs4 __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: src/string/encoding.c */

Expand Down
255 changes: 197 additions & 58 deletions src/string/api.c
Original file line number Diff line number Diff line change
Expand Up @@ -1455,10 +1455,26 @@ STRING *s2)>
Performs a bitwise C<AND> on two Parrot strings, performing type and encoding
conversions if necessary. Returns the result as a new string.
Strings with the same encoding are combined bitwise as they are. Strings with
different two-byte encodings are first converted down to latin1; if that fails
they are upgraded to ucs2, and if that also fails, to ucs4.
=cut
*/

/*
 * BITWISE_UPGRADE_BOTH(s1, s2)
 *
 * Brings both operands of a string bitwise op to a common wider encoding.
 * Both strings are first passed through str_internal_trans_to_ucs2(); if
 * either call yields NULL, both of the original strings are passed through
 * str_internal_trans_to_ucs4() instead.
 *
 * s1 and s2 must be assignable lvalues: they are overwritten with the
 * converted strings. The macro also relies on a variable named `interp`
 * being in scope at the point of use.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (safe as an unbraced if/else body) and so that its
 * temporaries do not leak into, or clash within, the caller's scope.
 */
#define BITWISE_UPGRADE_BOTH(s1, s2) \
    do { \
        STRING * const e_s1 = str_internal_trans_to_ucs2(interp, (s1)); \
        STRING * const e_s2 = str_internal_trans_to_ucs2(interp, (s2)); \
        if (e_s1 && e_s2) { \
            (s1) = e_s1; \
            (s2) = e_s2; \
        } \
        else { \
            (s1) = str_internal_trans_to_ucs4(interp, (s1)); \
            (s2) = str_internal_trans_to_ucs4(interp, (s2)); \
        } \
    } while (0)

PARROT_EXPORT
PARROT_CANNOT_RETURN_NULL
STRING *
Expand All @@ -1468,17 +1484,7 @@ Parrot_str_bitwise_and(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
ASSERT_ARGS(Parrot_str_bitwise_and)
STRING *res;
size_t minlen;

/* we could also trans_encoding to iso-8859-1 */
if (s1 && STRING_max_bytes_per_codepoint(s1) != 1)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"string bitwise_and (%s/%s) unsupported",
s1->encoding->name, nonnull_encoding_name(s2));

if (s2 && STRING_max_bytes_per_codepoint(s2) != 1)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"string bitwise_and (%s/%s) unsupported",
nonnull_encoding_name(s1), s2->encoding->name);
int b1, b2;

/* think about case of dest string is one of the operands */
if (!STRING_IS_NULL(s1) && !STRING_IS_NULL(s2))
Expand All @@ -1496,6 +1502,62 @@ Parrot_str_bitwise_and(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
return res;
}

b1 = STRING_max_bytes_per_codepoint(s1);
b2 = STRING_max_bytes_per_codepoint(s2);

if (minlen && b1 != 1 && b2 != 1) {
str_vtable_to_encoding_t enc1 = s1->encoding;
str_vtable_to_encoding_t enc2 = s2->encoding;
if (enc1 == enc2) {
;
}
/* GH 848: trans_encode down to latin1 or up to ucs2 or ucs4 */
else if (b1 == 2 && b2 == 1) {
STRING * e_down = str_internal_trans_to_fixed8(interp, s1);
if (e_down)
s1 = e_down;
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 1 && b2 == 2) {
STRING * e_down = str_internal_trans_to_fixed8(interp, s2);
if (e_down)
s2 = e_down;
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 2 && b2 == 2) {
STRING * e_down1 = str_internal_trans_to_fixed8(interp, s1);
STRING * e_down2 = str_internal_trans_to_fixed8(interp, s2);
if (e_down1 && e_down2) {
s1 = e_down1;
s2 = e_down2;
}
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 4 && b2 == 4) {
STRING * e_down1 = str_internal_trans_to_fixed8(interp, s1);
STRING * e_down2 = str_internal_trans_to_fixed8(interp, s2);
if (e_down1 && e_down2) {
s1 = e_down1;
s2 = e_down2;
}
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else {
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"string bitwise_and (%s/%s) unsupported",
s1->encoding->name, s2->encoding->name);
}
minlen = s1->strlen > s2->strlen ? s2->strlen : s1->strlen;
}

#if ! DISABLE_GC_DEBUG
/* trigger GC for debug */
if (interp && GC_DEBUG(interp))
Expand Down Expand Up @@ -1596,6 +1658,10 @@ STRING *s2)>
Performs a bitwise C<OR> on two Parrot strings, performing type and encoding
conversions if necessary. Returns the result as a new string.
Strings with the same encoding are combined bitwise as they are. Strings with
different two-byte encodings are first converted down to latin1; if that fails
they are upgraded to ucs2, and if that also fails, to ucs4.
=cut
*/
Expand All @@ -1609,27 +1675,9 @@ Parrot_str_bitwise_or(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
ASSERT_ARGS(Parrot_str_bitwise_or)
STRING *res;
size_t maxlen = 0;
int b1, b2;

if (!STRING_IS_NULL(s1)) {
if (STRING_max_bytes_per_codepoint(s1) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_or (%s/%s) unsupported",
s1->encoding->name, nonnull_encoding_name(s2));

maxlen = s1->bufused;
}

if (!STRING_IS_NULL(s2)) {
if (STRING_max_bytes_per_codepoint(s2) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_or (%s/%s) unsupported",
nonnull_encoding_name(s1), s2->encoding->name);

if (s2->bufused > maxlen)
maxlen = s2->bufused;
}
maxlen = s1->bufused > s2->bufused ? s1->bufused : s2->bufused;

res = Parrot_str_new_init(interp, NULL, maxlen,
Parrot_binary_encoding_ptr, 0);
Expand All @@ -1640,6 +1688,62 @@ Parrot_str_bitwise_or(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
return res;
}

b1 = STRING_max_bytes_per_codepoint(s1);
b2 = STRING_max_bytes_per_codepoint(s2);

if (b1 != 1 && b2 != 1) {
str_vtable_to_encoding_t enc1 = s1->encoding;
str_vtable_to_encoding_t enc2 = s2->encoding;
if (enc1 == enc2) {
;
}
/* GH 848: trans_encode down to latin1 or up to ucs2 or ucs4 */
else if (b1 == 2 && b2 == 1) {
STRING * e_down = str_internal_trans_to_fixed8(interp, s1);
if (e_down)
s1 = e_down;
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 1 && b2 == 2) {
STRING * e_down = str_internal_trans_to_fixed8(interp, s2);
if (e_down)
s2 = e_down;
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 2 && b2 == 2) {
STRING * e_down1 = str_internal_trans_to_fixed8(interp, s1);
STRING * e_down2 = str_internal_trans_to_fixed8(interp, s2);
if (e_down1 && e_down2) {
s1 = e_down1;
s2 = e_down2;
}
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 4 && b2 == 4) {
STRING * e_down1 = str_internal_trans_to_fixed8(interp, s1);
STRING * e_down2 = str_internal_trans_to_fixed8(interp, s2);
if (e_down1 && e_down2) {
s1 = e_down1;
s2 = e_down2;
}
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else {
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"string bitwise_or (%s/%s) unsupported",
s1->encoding->name, s2->encoding->name);
}
maxlen = s1->bufused > s2->bufused ? s1->bufused : s2->bufused;
}

#if ! DISABLE_GC_DEBUG
/* trigger GC for debug */
if (interp && GC_DEBUG(interp))
Expand Down Expand Up @@ -1676,26 +1780,9 @@ Parrot_str_bitwise_xor(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
STRING *res;
size_t maxlen = 0;

if (!STRING_IS_NULL(s1)) {
if (STRING_max_bytes_per_codepoint(s1) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_xor (%s/%s) unsupported",
s1->encoding->name, nonnull_encoding_name(s2));

maxlen = s1->bufused;
}
int b1, b2;

if (!STRING_IS_NULL(s2)) {
if (STRING_max_bytes_per_codepoint(s2) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_xor (%s/%s) unsupported",
nonnull_encoding_name(s1), s2->encoding->name);

if (s2->bufused > maxlen)
maxlen = s2->bufused;
}
maxlen = s1->bufused > s2->bufused ? s1->bufused : s2->bufused;

res = Parrot_str_new_init(interp, NULL, maxlen,
Parrot_binary_encoding_ptr, 0);
Expand All @@ -1706,6 +1793,61 @@ Parrot_str_bitwise_xor(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
return res;
}

b1 = STRING_max_bytes_per_codepoint(s1);
b2 = STRING_max_bytes_per_codepoint(s2);

if (b1 != 1 && b2 != 1) {
str_vtable_to_encoding_t enc1 = s1->encoding;
str_vtable_to_encoding_t enc2 = s2->encoding;
if (enc1 == enc2) {
maxlen = s1->strlen > s2->strlen ? s1->strlen : s2->strlen;
}
/* GH 848: trans_encode down to latin1 or up to ucs2 or ucs4 */
else if (b1 == 2 && b2 == 1) {
STRING * e_down = str_internal_trans_to_fixed8(interp, s1);
if (e_down)
s1 = e_down;
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 1 && b2 == 2) {
STRING * e_down = str_internal_trans_to_fixed8(interp, s2);
if (e_down)
s2 = e_down;
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 2 && b2 == 2) {
STRING * e_down1 = str_internal_trans_to_fixed8(interp, s1);
STRING * e_down2 = str_internal_trans_to_fixed8(interp, s2);
if (e_down1 && e_down2) {
s1 = e_down1;
s2 = e_down2;
}
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else if (b1 == 4 && b2 == 4) {
STRING * e_down1 = str_internal_trans_to_fixed8(interp, s1);
STRING * e_down2 = str_internal_trans_to_fixed8(interp, s2);
if (e_down1 && e_down2) {
s1 = e_down1;
s2 = e_down2;
}
else {
BITWISE_UPGRADE_BOTH(s1, s2);
}
}
else {
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"string bitwise_xor (%s/%s) unsupported",
s1->encoding->name, s2->encoding->name);
}
}

#if ! DISABLE_GC_DEBUG
/* trigger GC for debug */
if (interp && GC_DEBUG(interp))
Expand Down Expand Up @@ -1740,6 +1882,8 @@ do { \
Performs a bitwise C<NOT> on a Parrot string. Returns the result as a new
string.
Multi-byte encoded strings are treated as binary data.
=cut
*/
Expand All @@ -1753,15 +1897,8 @@ Parrot_str_bitwise_not(PARROT_INTERP, ARGIN_NULLOK(const STRING *s))
STRING *res;
size_t len;

if (!STRING_IS_NULL(s)) {
if (STRING_max_bytes_per_codepoint(s) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_not (%s) unsupported",
s->encoding->name);

if (!STRING_IS_NULL(s))
len = s->bufused;
}
else
len = 0;

Expand All @@ -1787,6 +1924,8 @@ Parrot_str_bitwise_not(PARROT_INTERP, ARGIN_NULLOK(const STRING *s))
return res;
}

/* The helper macro defined for the bitwise string ops is named
 * BITWISE_UPGRADE_BOTH; undefine it under that exact name so it does not
 * stay defined for the rest of the file. */
#undef BITWISE_UPGRADE_BOTH


/*
Expand Down
Loading

0 comments on commit 6c41a21

Please sign in to comment.