Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

resolve conflict

  • Loading branch information...
commit 64fc4ac332eab0be7704cf6f7ec5a96c523c0ed9 1 parent 0d8adaa
@matz matz authored
View
23 include/mruby/string.h
@@ -30,34 +30,15 @@ extern const char mrb_digitmap[];
struct RString {
MRUBY_OBJECT_HEADER;
int len;
- union {
- int capa;
- struct RString *shared;
- } aux;
+ int capa;
char *buf;
};
-extern struct SCOPE {
- struct RBasic super;
- mrb_sym *local_tbl;
- mrb_value *local_vars;
- int flags;
-} *ruby_scope;
-
-struct RVarmap {
- struct RBasic super;
- mrb_sym id;
- mrb_value val;
- struct RVarmap *next;
-};
-extern struct RVarmap *ruby_dyna_vars;
-
#define mrb_str_ptr(s) ((struct RString*)((s).value.p))
#define RSTRING(s) ((struct RString*)((s).value.p))
#define RSTRING_PTR(s) (RSTRING(s)->buf)
#define RSTRING_LEN(s) (RSTRING(s)->len)
-#define RSTRING_CAPA(s) (RSTRING(s)->aux.capa)
-#define RSTRING_SHARED(s) (RSTRING(s)->aux.shared)
+#define RSTRING_CAPA(s) (RSTRING(s)->capa)
#define RSTRING_END(s) (RSTRING(s)->buf + RSTRING(s)->len)
#define MRB_STR_SHARED 256
View
8 src/class.c
@@ -17,12 +17,6 @@
#include "mruby/khash.h"
-#ifdef INCLUDE_REGEXP
- #define mrb_usascii_str_new2 mrb_usascii_str_new_cstr
-#else
- #define mrb_usascii_str_new2 mrb_str_new_cstr
-#endif
-
KHASH_MAP_INIT_INT(mt, struct RProc*);
KHASH_MAP_INIT_INT(iv, mrb_value);
@@ -1052,7 +1046,7 @@ mrb_mod_to_s(mrb_state *mrb, mrb_value klass)
{
//if (FL_TEST(klass, FL_SINGLETON)) {
if (mrb_type(klass) == MRB_TT_SCLASS) {
- mrb_value s = mrb_usascii_str_new2(mrb, "#<");
+ mrb_value s = mrb_str_new_cstr(mrb, "#<");
mrb_value v = mrb_iv_get(mrb, klass, mrb_intern(mrb, "__attached__"));
mrb_str_cat2(mrb, s, "Class:");
View
1,685 src/encoding.c
@@ -1,1685 +0,0 @@
-/*
-** encoding.c - Encoding class
-**
-** See Copyright Notice in mruby.h
-*/
-
-#include "mruby.h"
-#ifdef INCLUDE_ENCODING
-#include <ctype.h>
-#ifndef NO_LOCALE_CHARMAP
-#ifdef __CYGWIN__
-#include <windows.h>
-#endif
-#ifdef HAVE_LANGINFO_H
-#include <langinfo.h>
-#endif
-#endif
-
-#define USE_UPPER_CASE_TABLE
-
-#include <ctype.h>
-#include <stdio.h>
-#include "regenc.h"
-#include "regint.h"
-#include "encoding.h"
-#include "st.h"
-#include <string.h>
-#include "mruby/numeric.h"
-#include "mruby/string.h"
-#include "mruby/array.h"
-#include "mruby/variable.h"
-#include "mruby/hash.h"
-
-#define pprintf printf
-#define mrb_warning printf
-#define mrb_bug printf
-#ifndef INT_MAX
-#define INT_MAX 2147483647
-#endif
-#define mrb_isascii(c) ((unsigned long)(c) < 128)
-#define OBJ_FREEZE(a)
-static mrb_sym id_encoding;
-//mrb_value mrb_cEncoding;
-static mrb_value mrb_encoding_list;
-
-struct mrb_encoding_entry {
- const char *name;
- mrb_encoding *enc;
- mrb_encoding *base;
-};
-
-static struct {
- struct mrb_encoding_entry *list;
- int count;
- int size;
- st_table *names;
-} enc_table;
-
-void mrb_enc_init(mrb_state *mrb);
-
-enum {
- ENCINDEX_ASCII,
- ENCINDEX_UTF_8,
- ENCINDEX_US_ASCII,
- ENCINDEX_BUILTIN_MAX
-};
-#define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
-#define ENCODING_NAMELEN_MAX 63
-#define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
-#define STRCASECMP(s1, s2) (st_strcasecmp(s1, s2))
-
-//#define BUILTIN_TYPE(x) (int)(((struct RBasic*)(x))->flags & T_MASK)
-#ifndef FALSE
-#define FALSE 0
-#endif
-
-#ifndef TRUE
-#define TRUE 1
-#endif
-
-#ifndef OTHER
-#define OTHER 2
-#endif
-
-#define mrb_usascii_str_new2 mrb_usascii_str_new_cstr
-
-static const struct mrb_data_type encoding_data_type = {
- "encoding", 0,
-};
-#define is_data_encoding(obj) (DATA_TYPE(obj) == &encoding_data_type)
-
-// RUBY_IMMEDIATE_MASK = 0x03,
-//#define IMMEDIATE_MASK RUBY_IMMEDIATE_MASK
-//#define IMMEDIATE_P(x) ((VALUE)(x) & IMMEDIATE_MASK)
-//#define SPECIAL_CONST_P(x) (IMMEDIATE_P(x) || !RTEST(x))
-
-static mrb_value
-enc_new(mrb_state *mrb, mrb_encoding *encoding)
-{
- return mrb_obj_value(Data_Wrap_Struct(mrb, ENCODE_CLASS, &encoding_data_type, encoding));
-}
-
-#define enc_autoload_p(enc) (!mrb_enc_mbmaxlen(enc))
-
-#define UNSPECIFIED_ENCODING INT_MAX
-
-
-static mrb_value
-mrb_enc_from_encoding_index(mrb_state *mrb, int idx)
-{
- mrb_value list, enc;
-
- if (mrb_nil_p(list = mrb_encoding_list)) {
- mrb_bug("mrb_enc_from_encoding_index(%d): no mrb_encoding_list", idx);
- }
- enc = mrb_ary_ref(mrb, list, idx);//mrb_ary_entry(list, idx);
- if (mrb_nil_p(enc)) {
- mrb_bug("mrb_enc_from_encoding_index(%d): not created yet", idx);
- }
- return enc;
-}
-
-mrb_value
-mrb_enc_from_encoding(mrb_state *mrb, mrb_encoding *encoding)
-{
- int idx;
- if (!encoding) return mrb_nil_value();
- idx = ENC_TO_ENCINDEX(encoding);
- return mrb_enc_from_encoding_index(mrb, idx);
-}
-
-static int enc_autoload(mrb_state *mrb, mrb_encoding *enc);
-static int
-check_encoding(mrb_state *mrb, mrb_encoding *enc)
-{
- int index = mrb_enc_to_index(enc);
- if (mrb_enc_from_index(mrb, index) != enc)
- return -1;
- if (enc_autoload_p(enc)) {
- index = enc_autoload(mrb, enc);
- }
- return index;
-}
-
-static int
-enc_check_encoding(mrb_state *mrb, mrb_value obj)
-{
- if (SPECIAL_CONST_P(obj) || !is_data_encoding(obj)) {
- return -1;
- }
- return check_encoding(mrb, RDATA(obj)->data);
-}
-
-static int
-must_encoding(mrb_state *mrb, mrb_value enc)
-{
- int index = enc_check_encoding(mrb, enc);
- if (index < 0) {
- mrb_raise(mrb, E_TYPE_ERROR, "wrong argument type %s (expected Encoding)",
- mrb_obj_classname(mrb, enc));
- }
- return index;
-}
-
-int
-mrb_to_encoding_index(mrb_state *mrb, mrb_value enc)
-{
- int idx;
-
- idx = enc_check_encoding(mrb, enc);
- if (idx >= 0) {
- return idx;
- }
- else if (mrb_nil_p(enc = mrb_check_string_type(mrb, enc))) {
- return -1;
- }
- if (!mrb_enc_asciicompat(mrb, mrb_enc_get(mrb, enc))) {
- return -1;
- }
- //return mrb_enc_find_index(StringValueCStr(enc));
- return mrb_enc_find_index(mrb, mrb_string_value_cstr(mrb, &enc));
-
-}
-
-static mrb_encoding *
-to_encoding(mrb_state *mrb, mrb_value enc)
-{
- int idx;
-
- //StringValue(enc);
- mrb_string_value(mrb, &enc);
-
- if (!mrb_enc_asciicompat(mrb, mrb_enc_get(mrb, enc))) {
- mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid name encoding (non ASCII)");
- }
- //idx = mrb_enc_find_index(StringValueCStr(enc));
- idx = mrb_enc_find_index(mrb, mrb_string_value_cstr(mrb, &enc));
- if (idx < 0) {
- mrb_raise(mrb, E_ARGUMENT_ERROR, "unknown encoding name - %s", RSTRING_PTR(enc));
- }
- return mrb_enc_from_index(mrb, idx);
-}
-
-mrb_encoding *
-mrb_to_encoding(mrb_state *mrb, mrb_value enc)
-{
- if (enc_check_encoding(mrb, enc) >= 0) return RDATA(enc)->data;
- return to_encoding(mrb, enc);
-}
-
-static int
-enc_table_expand(int newsize)
-{
- struct mrb_encoding_entry *ent;
- int count = newsize;
-
- if (enc_table.size >= newsize) return newsize;
- newsize = (newsize + 7) / 8 * 8;
- ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize);
- if (!ent) return -1;
- memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size));
- enc_table.list = ent;
- enc_table.size = newsize;
- return count;
-}
-
-static int
-enc_register_at(mrb_state *mrb, int index, const char *name, mrb_encoding *encoding)
-{
- struct mrb_encoding_entry *ent = &enc_table.list[index];
- mrb_value list;
- mrb_value ref_ary;
-
- if (!valid_encoding_name_p(name)) return -1;
- if (!ent->name) {
- ent->name = name = strdup(name);
- }
- else if (STRCASECMP(name, ent->name)) {
- return -1;
- }
- if (!ent->enc) {
- ent->enc = xmalloc(sizeof(mrb_encoding));
- }
- if (encoding) {
- *ent->enc = *encoding;
- }
- else {
- memset(ent->enc, 0, sizeof(*ent->enc));
- }
- encoding = ent->enc;
- encoding->name = name;
- encoding->ruby_encoding_index = index;
- st_insert(enc_table.names, (st_data_t)name, (st_data_t)index);
- list = mrb_encoding_list;
- //if (list && mrb_nil_p((mrb_ary_ref(mrb, list, index)))) {
- if (list.tt) {
- ref_ary = mrb_ary_ref(mrb, list, index);
- if mrb_nil_p(ref_ary) {
- /* initialize encoding data */
- mrb_ary_set(mrb, list, index, enc_new(mrb, encoding));//rb_ary_store(list, index, enc_new(encoding));
- }
- }
- return index;
-}
-
-
-static int
-enc_register(mrb_state *mrb, const char *name, mrb_encoding *encoding)
-{
- int index = enc_table.count;
-
- if ((index = enc_table_expand(index + 1)) < 0) return -1;
- enc_table.count = index;
- return enc_register_at(mrb, index - 1, name, encoding);
-}
-
-static void set_encoding_const(mrb_state *, const char*, mrb_encoding*);
-int mrb_enc_registered(const char*);
-
-static void
-enc_check_duplication(mrb_state *mrb, const char *name)
-{
- if (mrb_enc_registered(name) >= 0) {
- mrb_raise(mrb, E_ARGUMENT_ERROR, "encoding %s is already registered", name);
- }
-}
-static mrb_encoding*
-set_base_encoding(int index, mrb_encoding *base)
-{
- mrb_encoding *enc = enc_table.list[index].enc;
-
- enc_table.list[index].base = base;
- if (mrb_enc_dummy_p(base)) ENC_SET_DUMMY(enc);
- return enc;
-}
-
-int
-mrb_enc_replicate(mrb_state *mrb, const char *name, mrb_encoding *encoding)
-{
- int idx;
-
- enc_check_duplication(mrb, name);
- idx = enc_register(mrb, name, encoding);
- set_base_encoding(idx, encoding);
- set_encoding_const(mrb, name, mrb_enc_from_index(mrb, idx));
- return idx;
-}
-
-/* 15.2.40.2.17 */
-/*
- * call-seq:
- * enc.replicate(name) -> encoding
- *
- * Returns a replicated encoding of _enc_ whose name is _name_.
- * The new encoding should have the same byte structure of _enc_.
- * If _name_ is used by another encoding, raise ArgumentError.
- *
- */
-static mrb_value
-enc_replicate(mrb_state *mrb, mrb_value encoding)
-{
- mrb_value name;
- mrb_get_args(mrb, "o", &name);
- return mrb_enc_from_encoding_index(mrb,
- //mrb_enc_replicate(mrb, StringValueCStr(name),
- mrb_enc_replicate(mrb, mrb_string_value_cstr(mrb, &name),
- mrb_to_encoding(mrb, encoding)));
-}
-static int
-enc_replicate_with_index(mrb_state *mrb, const char *name, mrb_encoding *origenc, int idx)
-{
- if (idx < 0) {
- idx = enc_register(mrb, name, origenc);
- }
- else {
- idx = enc_register_at(mrb, idx, name, origenc);
- }
- if (idx >= 0) {
- set_base_encoding(idx, origenc);
- set_encoding_const(mrb, name, mrb_enc_from_index(mrb, idx));
- }
- return idx;
-}
-int
-mrb_encdb_replicate(mrb_state *mrb, const char *name, const char *orig)
-{
- int origidx = mrb_enc_registered(orig);
- int idx = mrb_enc_registered(name);
-
- if (origidx < 0) {
- origidx = enc_register(mrb, orig, 0);
- }
- return enc_replicate_with_index(mrb, name, mrb_enc_from_index(mrb, origidx), idx);
-}
-int
-mrb_define_dummy_encoding(mrb_state *mrb, const char *name)
-{
- int index = mrb_enc_replicate(mrb, name, mrb_ascii8bit_encoding(mrb));
- mrb_encoding *enc = enc_table.list[index].enc;
-
- ENC_SET_DUMMY(enc);
- return index;
-}
-
-int
-mrb_encdb_dummy(mrb_state *mrb, const char *name)
-{
- int index = enc_replicate_with_index(mrb, name, mrb_ascii8bit_encoding(mrb),
- mrb_enc_registered(name));
- mrb_encoding *enc = enc_table.list[index].enc;
-
- ENC_SET_DUMMY(enc);
- return index;
-}
-
-/* 15.2.40.2.13 */
-/*
- * call-seq:
- * enc.dummy? -> true or false
- *
- * Returns true for dummy encodings.
- * A dummy encoding is an encoding for which character handling is not properly
- * implemented.
- * It is used for stateful encodings.
- *
- * Encoding::ISO_2022_JP.dummy? #=> true
- * Encoding::UTF_8.dummy? #=> false
- *
- */
-static mrb_value
-enc_dummy_p(mrb_state *mrb, mrb_value enc)
-{
- return ENC_DUMMY_P(enc_table.list[must_encoding(mrb, enc)].enc) ? mrb_true_value() : mrb_false_value();
-}
-
-/* 15.2.40.2.12 */
-/*
- * call-seq:
- * enc.ascii_compatible? -> true or false
- *
- * Returns whether ASCII-compatible or not.
- *
- * Encoding::UTF_8.ascii_compatible? #=> true
- * Encoding::UTF_16BE.ascii_compatible? #=> false
- *
- */
-static mrb_value
-enc_ascii_compatible_p(mrb_state *mrb, mrb_value enc)
-{
- return mrb_enc_asciicompat(mrb, enc_table.list[must_encoding(mrb, enc)].enc) ? mrb_true_value() : mrb_false_value();
-}
-
-static const char *
-enc_alias_internal(const char *alias, int idx)
-{
- alias = strdup(alias);
- st_insert(enc_table.names, (st_data_t)alias, (st_data_t)idx);
- return alias;
-}
-
-/*
- * Returns 1 when the encoding is Unicode series other than UTF-7 else 0.
- */
-int
-mrb_enc_unicode_p(mrb_encoding *enc)
-{
- const char *name = mrb_enc_name(enc);
- return name[0] == 'U' && name[1] == 'T' && name[2] == 'F' && name[4] != '7';
-}
-
-extern mrb_encoding OnigEncodingUTF_8;
-extern mrb_encoding OnigEncodingUS_ASCII;
-
-void
-mrb_enc_init(mrb_state *mrb)
-{
- enc_table_expand(ENCODING_COUNT + 1);
- if (!enc_table.names) {
- enc_table.names = st_init_strcasetable();
- }
-#define ENC_REGISTER(enc) enc_register_at(mrb, ENCINDEX_##enc, mrb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
- ENC_REGISTER(ASCII);
- ENC_REGISTER(UTF_8);
- ENC_REGISTER(US_ASCII);
-#undef ENC_REGISTER
- enc_table.count = ENCINDEX_BUILTIN_MAX;
-}
-
-mrb_encoding *
-mrb_enc_from_index(mrb_state *mrb, int index)
-{
- if (!enc_table.list) {
- mrb_enc_init(mrb);
- }
- if (index < 0 || enc_table.count <= index) {
- return 0;
- }
- return enc_table.list[index].enc;
-}
-
-int
-mrb_enc_registered(const char *name)
-{
- st_data_t idx = 0;
-
- if (!name) return -1;
- if (!enc_table.list) return -1;
- if (st_lookup(enc_table.names, (st_data_t)name, &idx)) {
- return (int)idx;
- }
- return -1;
-}
-
-mrb_value
-mrb_require_safe(mrb_value fname, int safe)
-{
- mrb_value result = mrb_nil_value();
- return result;
-}
-static int
-load_encoding(const char *name)
-{
- mrb_value enclib;// = mrb_sprintf("enc/%s.so", name);
- //mrb_value verbose;// = ruby_verbose;
- //mrb_value debug;// = ruby_debug;
- //mrb_value loaded;
- char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3;
- int idx;
-
- while (s < e) {
- if (!ISALNUM(*s)) *s = '_';
- else if (ISUPPER(*s)) *s = TOLOWER(*s);
- ++s;
- }
- OBJ_FREEZE(enclib);
- //ruby_verbose = mrb_false_value();
- //ruby_debug = mrb_false_value();
- //loaded = mrb_protect(require_enc, enclib, 0);
- //ruby_verbose = verbose;
- //ruby_debug = debug;
- //rb_set_errinfo(mrb_nil_value());
- //if (mrb_nil_p(loaded)) return -1;
- if ((idx = mrb_enc_registered(name)) < 0) return -1;
- if (enc_autoload_p(enc_table.list[idx].enc)) return -1;
- return idx;
-}
-
-static int
-enc_autoload(mrb_state *mrb, mrb_encoding *enc)
-{
- int i;
- mrb_encoding *base = enc_table.list[ENC_TO_ENCINDEX(enc)].base;
-
- if (base) {
- i = 0;
- do {
- if (i >= enc_table.count) return -1;
- } while (enc_table.list[i].enc != base && (++i, 1));
- if (enc_autoload_p(base)) {
- if (enc_autoload(mrb, base) < 0) return -1;
- }
- i = ENC_TO_ENCINDEX(enc);
- enc_register_at(mrb, i, mrb_enc_name(enc), base);
- }
- else {
- i = load_encoding(mrb_enc_name(enc));
- }
- return i;
-}
-
-int
-mrb_enc_find_index(mrb_state *mrb, const char *name)
-{
- int i = mrb_enc_registered(name);
- mrb_encoding *enc;
-
- if (i < 0) {
- i = load_encoding(name);
- }
- else if (!(enc = mrb_enc_from_index(mrb, i))) {
- if (i != UNSPECIFIED_ENCODING) {
- mrb_raise(mrb, E_ARGUMENT_ERROR, "encoding %s is not registered", name);
- }
- }
- else if (enc_autoload_p(enc)) {
- if (enc_autoload(mrb, enc) < 0) {
- //mrb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
- printf("failed to load encoding (%s); use ASCII-8BIT instead",
- name);
- return 0;
- }
- }
- return i;
-}
-
-mrb_encoding *
-mrb_enc_find(mrb_state *mrb, const char *name)
-{
- int idx = mrb_enc_find_index(mrb, name);
- if (idx < 0) idx = 0;
- return mrb_enc_from_index(mrb, idx);
-}
-
-static inline int
-enc_capable(mrb_value obj)
-{
- if (SPECIAL_CONST_P(obj)) return (mrb_type(obj) == MRB_TT_SYMBOL);
- switch (mrb_type(obj)/*BUILTIN_TYPE(obj)*/) {
- case MRB_TT_STRING:
- case MRB_TT_REGEX:
- case MRB_TT_FILE:
- return TRUE;
- case MRB_TT_DATA:
- if (is_data_encoding(obj)) return TRUE;
- default:
- return FALSE;
- }
-}
-
-mrb_sym
-mrb_id_encoding(mrb_state *mrb)
-{
- //CONST_ID(id_encoding, "encoding");
- id_encoding = mrb_intern(mrb, "encoding");
- return id_encoding;
-}
-
-int
-mrb_enc_get_index(mrb_state *mrb, mrb_value obj)
-{
- int i = -1;
- mrb_value tmp;
- struct RString *ps;
-
- if (SPECIAL_CONST_P(obj)) {
- if (mrb_type(obj) != MRB_TT_SYMBOL) return -1;
- //obj = mrb_id2str(SYM2ID(obj));
- obj = mrb_str_new_cstr(mrb, mrb_sym2name(mrb, SYM2ID(obj)));
- }
- switch (mrb_type(obj)/*BUILTIN_TYPE(obj)*/) {
- as_default:
- default:
- case MRB_TT_STRING:
- case MRB_TT_REGEX:
- i = (int)ENCODING_GET_INLINED(obj);
- ps = mrb_str_ptr(obj);
- if (i == ENCODING_INLINE_MAX) {
- mrb_value iv;
-
- //iv = rb_ivar_get(obj, mrb_id_encoding(mrb));
- iv = mrb_iv_get(mrb, obj, mrb_id_encoding(mrb));
- i = mrb_fixnum(iv);
- }
- break;
-
- case MRB_TT_FILE:
- tmp = mrb_funcall(mrb, obj, "internal_encoding", 0, 0);
- if (mrb_nil_p(tmp)) obj = mrb_funcall(mrb, obj, "external_encoding", 0, 0);
- else obj = tmp;
- if (mrb_nil_p(obj)) break;
- case MRB_TT_DATA:
- if (is_data_encoding(obj)) {
- i = enc_check_encoding(mrb, obj);
- }
- else {
- goto as_default;
- }
- break;
- }
- return i;
-}
-
-void
-mrb_enc_set_index(mrb_state *mrb, mrb_value obj, int idx)
-{
- if (idx < ENCODING_INLINE_MAX) {
- ENCODING_SET_INLINED(obj, idx);
- return;
- }
- ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
- //mrb_ivar_set(obj, mrb_id_encoding(mrb), INT2NUM(idx));
- mrb_iv_set(mrb, obj, mrb_id_encoding(mrb), mrb_fixnum_value(idx));
- return;
-}
-
-mrb_value
-mrb_enc_associate_index(mrb_state *mrb, mrb_value obj, int idx)
-{
-/* enc_check_capable(obj);*/
- if (mrb_enc_get_index(mrb, obj) == idx)
- return obj;
- if (SPECIAL_CONST_P(obj)) {
- mrb_raise(mrb, E_ARGUMENT_ERROR, "cannot set encoding");
- }
- if (!ENC_CODERANGE_ASCIIONLY(obj) ||
- !mrb_enc_asciicompat(mrb, mrb_enc_from_index(mrb, idx))) {
- ENC_CODERANGE_CLEAR(obj);
- }
- mrb_enc_set_index(mrb, obj, idx);
- return obj;
-}
-
-mrb_value
-mrb_enc_associate(mrb_state *mrb, mrb_value obj, mrb_encoding *enc)
-{
- return mrb_enc_associate_index(mrb, obj, mrb_enc_to_index(enc));
-}
-
-mrb_encoding*
-mrb_enc_get(mrb_state *mrb, mrb_value obj)
-{
- return mrb_enc_from_index(mrb, mrb_enc_get_index(mrb, obj));
-}
-
-mrb_encoding*
-mrb_enc_check(mrb_state *mrb, mrb_value str1, mrb_value str2)
-{
- mrb_encoding *enc = mrb_enc_compatible(mrb, str1, str2);
- if (!enc)
- mrb_raise(mrb, E_ENCODING_ERROR, "incompatible character encodings: %s and %s",
- mrb_enc_name(mrb_enc_get(mrb, str1)),
- mrb_enc_name(mrb_enc_get(mrb, str2)));
- return enc;
-}
-
-mrb_encoding*
-mrb_enc_compatible(mrb_state *mrb, mrb_value str1, mrb_value str2)
-{
- int idx1, idx2;
- mrb_encoding *enc1, *enc2;
-
- idx1 = mrb_enc_get_index(mrb, str1);
- idx2 = mrb_enc_get_index(mrb, str2);
-
- if (idx1 < 0 || idx2 < 0)
- return 0;
-
- if (idx1 == idx2) {
- return mrb_enc_from_index(mrb, idx1);
- }
- enc1 = mrb_enc_from_index(mrb, idx1);
- enc2 = mrb_enc_from_index(mrb, idx2);
-
- if (mrb_type(str2) == MRB_TT_STRING && RSTRING_LEN(str2) == 0)
- //return (idx1 == ENCINDEX_US_ASCII && mrb_enc_asciicompat(mrb, enc2)) ? enc2 : enc1;
- return enc1;
- if (mrb_type(str1) == MRB_TT_STRING && RSTRING_LEN(str1) == 0)
- //return (idx2 == ENCINDEX_US_ASCII && mrb_enc_asciicompat(mrb, enc1)) ? enc1 : enc2;
- return enc2;
- if (!mrb_enc_asciicompat(mrb, enc1) || !mrb_enc_asciicompat(mrb, enc2)) {
- return 0;
- }
-
- /* objects whose encoding is the same of contents */
- //if (mrb_type(str2)/*BUILTIN_TYPE(str2)*/ != MRB_TT_STRING && idx2 == ENCINDEX_US_ASCII)
- //return enc1;
- //if (mrb_type(str1)/*BUILTIN_TYPE(str1)*/ != MRB_TT_STRING && idx1 == ENCINDEX_US_ASCII)
- //return enc2;
-
- if (mrb_type(str1)/*BUILTIN_TYPE(str1)*/ != MRB_TT_STRING) {
- mrb_value tmp = str1;
- int idx0 = idx1;
- str1 = str2;
- str2 = tmp;
- idx1 = idx2;
- idx2 = idx0;
- }
- if (mrb_type(str1)/*BUILTIN_TYPE(str1)*/ == MRB_TT_STRING) {
- int cr1, cr2;
-
- cr1 = mrb_enc_str_coderange(mrb, str1);
- if (mrb_type(str2)/*BUILTIN_TYPE(str2)*/ == MRB_TT_STRING) {
- cr2 = mrb_enc_str_coderange(mrb, str2);
- if (cr1 != cr2) {
- /* may need to handle ENC_CODERANGE_BROKEN */
- if (cr1 == ENC_CODERANGE_7BIT) return enc2;
- if (cr2 == ENC_CODERANGE_7BIT) return enc1;
- }
- if (cr2 == ENC_CODERANGE_7BIT) {
- if (idx1 == ENCINDEX_ASCII) return enc2;
- return enc1;
- }
- }
- if (cr1 == ENC_CODERANGE_7BIT)
- return enc2;
- }
- return 0;
-}
-
-void
-mrb_enc_copy(mrb_state *mrb, mrb_value obj1, mrb_value obj2)
-{
- mrb_enc_associate_index(mrb, obj1, mrb_enc_get_index(mrb, obj2));
-}
-
-
-/*
- * call-seq:
- * obj.encoding -> encoding
- *
- * Returns the Encoding object that represents the encoding of obj.
- */
-
-mrb_value
-mrb_obj_encoding(mrb_state *mrb, mrb_value obj)
-{
- mrb_encoding *enc = mrb_enc_get(mrb, obj);
- if (!enc) {
- mrb_raise(mrb, E_TYPE_ERROR, "unknown encoding");
- }
- return mrb_enc_from_encoding(mrb, enc);
-}
-
-int
-mrb_enc_fast_mbclen(const char *p, const char *e, mrb_encoding *enc)
-{
- return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
-}
-
-int
-mrb_enc_mbclen(const char *p, const char *e, mrb_encoding *enc)
-{
- int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
- if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
- return MBCLEN_CHARFOUND_LEN(n);
- else {
- int min = mrb_enc_mbminlen(enc);
- return min <= e-p ? min : (int)(e-p);
- }
-}
-
-int
-mrb_enc_precise_mbclen(const char *p, const char *e, mrb_encoding *enc)
-{
- int n;
- if (e <= p)
- return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
- n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
- if (e-p < n)
- return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
- return n;
-}
-
-int
-mrb_enc_ascget(mrb_state *mrb, const char *p, const char *e, int *len, mrb_encoding *enc)
-{
- unsigned int c, l;
- if (e <= p)
- return -1;
- if (mrb_enc_asciicompat(mrb, enc)) {
- c = (unsigned char)*p;
- if (!ISASCII(c))
- return -1;
- if (len) *len = 1;
- return c;
- }
- l = mrb_enc_precise_mbclen(p, e, enc);
- if (!MBCLEN_CHARFOUND_P(l))
- return -1;
- c = mrb_enc_mbc_to_codepoint(p, e, enc);
- if (!mrb_enc_isascii(c, enc))
- return -1;
- if (len) *len = l;
- return c;
-}
-
-unsigned int
-mrb_enc_codepoint_len(mrb_state *mrb, const char *p, const char *e, int *len_p, mrb_encoding *enc)
-{
- int r;
- if (e <= p)
- mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
- r = mrb_enc_precise_mbclen(p, e, enc);
- if (MBCLEN_CHARFOUND_P(r)) {
- if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
- return mrb_enc_mbc_to_codepoint(p, e, enc);
- }
- else
- mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid byte sequence in %s", mrb_enc_name(enc));
- return 0;
-}
-
-#undef mrb_enc_codepoint
-unsigned int
-mrb_enc_codepoint(mrb_state *mrb, const char *p, const char *e, mrb_encoding *enc)
-{
- return mrb_enc_codepoint_len(mrb, p, e, 0, enc);
-}
-
-int
-mrb_enc_codelen(mrb_state *mrb, int c, mrb_encoding *enc)
-{
- int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
- if (n == 0) {
- mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid codepoint 0x%x in %s", c, mrb_enc_name(enc));
- }
- return n;
-}
-
-int
-mrb_enc_toupper(int c, mrb_encoding *enc)
-{
- return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
-}
-
-int
-mrb_enc_tolower(int c, mrb_encoding *enc)
-{
- return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
-}
-
-/* 15.2.40.2.14 */
-/*
- * call-seq:
- * enc.inspect -> string
- *
- * Returns a string which represents the encoding for programmers.
- *
- * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
- * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
- */
-static mrb_value
-enc_inspect(mrb_state *mrb, mrb_value self)
-{
- mrb_value str;
- //mrb_value str = mrb_sprintf("#<%s:%s%s>", mrb_obj_classname(mrb, self),
- // mrb_enc_name((mrb_encoding*)(DATA_PTR(self))),
- // (mrb_fixnum(enc_dummy_p(mrb, self)) ? " (dummy)" : ""));
- char buf[256];
- sprintf(buf, "#<%s:%s%s>", mrb_obj_classname(mrb, self),
- mrb_enc_name((mrb_encoding*)(DATA_PTR(self))),
- (mrb_enc_dummy_p((mrb_encoding*)(DATA_PTR(self))) ? " (dummy)" : ""));
- str = mrb_str_new(mrb, buf, strlen(buf));
- ENCODING_CODERANGE_SET(mrb, str, mrb_usascii_encindex(), ENC_CODERANGE_7BIT);
- return str;
-}
-
-/* 15.2.40.2.15 */
-/* 15.2.40.2.18 */
-/*
- * call-seq:
- * enc.name -> string
- *
- * Returns the name of the encoding.
- *
- * Encoding::UTF_8.name #=> "UTF-8"
- */
-static mrb_value
-enc_name(mrb_state *mrb, mrb_value self)
-{
- return mrb_usascii_str_new2(mrb, mrb_enc_name((mrb_encoding*)DATA_PTR(self)));
-}
-
-struct fn_arg {
- mrb_state *mrb;
- enum st_retval (*func)(ANYARGS);
- void *a;
-};
-
-static enum st_retval
-fn_i(st_data_t key, st_data_t val, st_data_t arg) {
- struct fn_arg *a = (struct fn_arg*)arg;
-
- return (*a->func)(a->mrb, key, val, a->a);
-}
-
-static int
-st_foreachNew(mrb_state *mrb, st_table *tbl, enum st_retval (*func)(ANYARGS), void *a)
-{
- struct fn_arg arg = {
- mrb,
- func,
- a,
- };
-
- return st_foreach(tbl, fn_i, (st_data_t)&arg);
-}
-
-static enum st_retval
-enc_names_i(mrb_state *mrb, st_data_t name, st_data_t idx, st_data_t args)
-{
- mrb_value *arg = (mrb_value*)args;
- int iargs = mrb_fixnum(arg[0]);
- //if ((int)idx == (int)arg[0]) {
- if ((int)idx == iargs) {
- mrb_value str = mrb_usascii_str_new2(mrb, (char*)name);
- //OBJ_FREEZE(str);
- mrb_ary_push(mrb, arg[1], str);
- }
- return ST_CONTINUE;
-}
-
-/* 15.2.40.2.16 */
-/*
- * call-seq:
- * enc.names -> array
- *
- * Returns the list of name and aliases of the encoding.
- *
- * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J"]
- */
-static mrb_value
-enc_names(mrb_state *mrb, mrb_value self)
-{
- mrb_value args[2];
-
- args[0] = mrb_fixnum_value(mrb_to_encoding_index(mrb, self));
- args[1] = mrb_ary_new_capa(mrb, 0);//mrb_ary_new2(0);
- st_foreachNew(mrb, enc_table.names, enc_names_i, args);
- return args[1];
-}
-
-/* 15.2.40.2.8 */
-/*
- * call-seq:
- * Encoding.list -> [enc1, enc2, ...]
- *
- * Returns the list of loaded encodings.
- *
- * Encoding.list
- * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
- * #<Encoding:ISO-2022-JP (dummy)>]
- *
- * Encoding.find("US-ASCII")
- * #=> #<Encoding:US-ASCII>
- *
- * Encoding.list
- * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
- * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
- *
- */
-static mrb_value
-enc_list(mrb_state *mrb, mrb_value klass)
-{
- struct RArray *ar = (struct RArray*)mrb_encoding_list.value.p;
- mrb_value ary = mrb_ary_new_capa(mrb, 0);//mrb_ary_new2(0);
- //mrb_ary_replace_m(mrb, ary/*, mmrb_encoding_list*/);
- mrb_ary_replace(mrb, mrb_ary_ptr(ary), ar->buf, enc_table.count);
- return ary;
-}
-
-/* 15.2.40.2.7 */
-/*
- * call-seq:
- * Encoding.find(string) -> enc
- * Encoding.find(symbol) -> enc
- *
- * Search the encoding with specified <i>name</i>.
- * <i>name</i> should be a string or symbol.
- *
- * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII>
- * Encoding.find(:Shift_JIS) #=> #<Encoding:Shift_JIS>
- *
- * Names which this method accept are encoding names and aliases
- * including following special aliases
- *
- * "external":: default external encoding
- * "internal":: default internal encoding
- * "locale":: locale encoding
- * "filesystem":: filesystem encoding
- *
- * An ArgumentError is raised when no encoding with <i>name</i>.
- * Only <code>Encoding.find("internal")</code> however returns nil
- * when no encoding named "internal", in other words, when Ruby has no
- * default internal encoding.
- */
-static mrb_value
-enc_find(mrb_state *mrb, mrb_value klass)
-{
- mrb_value enc;
-
- mrb_get_args(mrb, "o", &enc);
- return mrb_enc_from_encoding(mrb, to_encoding(mrb, enc));
-}
-
-/* 15.2.40.2.2 */
-/*
- * call-seq:
- * Encoding.compatible?(str1, str2) -> enc or nil
- *
- * Checks the compatibility of two strings.
- * If they are compatible, means concatenatable,
- * returns an encoding which the concatenated string will be.
- * If they are not compatible, nil is returned.
- *
- * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
- * #=> #<Encoding:ISO-8859-1>
- *
- * Encoding.compatible?(
- * "\xa1".force_encoding("iso-8859-1"),
- * "\xa1\xa1".force_encoding("euc-jp"))
- * #=> nil
- *
- */
-static mrb_value
-enc_compatible_p(mrb_state *mrb, mrb_value klass)
-{
- mrb_value str1;
- mrb_value str2;
- mrb_encoding *enc;
-
- mrb_get_args(mrb, "oo", &str1, &str2);
- if (!enc_capable(str1)) return mrb_nil_value();
- if (!enc_capable(str2)) return mrb_nil_value();
- enc = mrb_enc_compatible(mrb, str1, str2);
- if (!enc) return mrb_nil_value();
- return mrb_enc_from_encoding(mrb, enc);
-}
-
-/* 15.2.40.2.19 */
-/* :nodoc: */
-static mrb_value
-enc_dump(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value self)
-{
- //mrb_scan_args(argc, argv, "01", 0);
- return enc_name(mrb, self);
-}
-
-/* 15.2.40.2.11 */
-/* :nodoc: */
-static mrb_value
-enc_load(mrb_state *mrb, mrb_value klass)
-{
- mrb_value str;
-
- mrb_get_args(mrb, "o", &str);
- return enc_find(mrb, str);
-}
-
-mrb_encoding *
-mrb_ascii8bit_encoding(mrb_state *mrb)
-{
- if (!enc_table.list) {
- mrb_enc_init(mrb);
- }
- return enc_table.list[ENCINDEX_ASCII].enc;
-}
-
-int
-mrb_ascii8bit_encindex(void)
-{
- return ENCINDEX_ASCII;
-}
-
-mrb_encoding *
-mrb_utf8_encoding(mrb_state *mrb)
-{
- if (!enc_table.list) {
- mrb_enc_init(mrb);
- }
- return enc_table.list[ENCINDEX_UTF_8].enc;
-}
-
-int
-mrb_utf8_encindex(void)
-{
- return ENCINDEX_UTF_8;
-}
-
-mrb_encoding *
-mrb_usascii_encoding(mrb_state *mrb)
-{
- if (!enc_table.list) {
- mrb_enc_init(mrb);
- }
- return enc_table.list[ENCINDEX_US_ASCII].enc;
-}
-
-int
-mrb_usascii_encindex(void)
-{
- return ENCINDEX_US_ASCII;
-}
-
-int
-mrb_locale_encindex(mrb_state *mrb)
-{
- mrb_value charmap = mrb_locale_charmap(mrb, mrb_obj_value(ENCODE_CLASS));
- int idx;
-
- if (mrb_nil_p(charmap))
- idx = mrb_usascii_encindex();
- //else if ((idx = mrb_enc_find_index(StringValueCStr(charmap))) < 0)
- else if ((idx = mrb_enc_find_index(mrb, mrb_string_value_cstr(mrb, &charmap))) < 0)
- idx = mrb_ascii8bit_encindex();
-
- if (mrb_enc_registered("locale") < 0) enc_alias_internal("locale", idx);
-
- return idx;
-}
-
-mrb_encoding *
-mrb_locale_encoding(mrb_state *mrb)
-{
- return mrb_enc_from_index(mrb, mrb_locale_encindex(mrb));
-}
-
-static int
-enc_set_filesystem_encoding(mrb_state *mrb)
-{
- int idx;
-#if defined NO_LOCALE_CHARMAP
- idx = mrb_enc_to_index(mrb_default_external_encoding(mrb));
-#elif defined _WIN32 || defined __CYGWIN__
- char cp[sizeof(int) * 8 / 3 + 4];
- //snprintf(cp, sizeof cp, "CP%d", AreFileApisANSI() ? GetACP() : GetOEMCP());
- idx = mrb_enc_find_index(mrb, cp);
- if (idx < 0) idx = mrb_ascii8bit_encindex();
-#else
- idx = mrb_enc_to_index(mrb_default_external_encoding(mrb));
-#endif
-
- enc_alias_internal("filesystem", idx);
- return idx;
-}
-
-int
-mrb_filesystem_encindex(void)
-{
- int idx = mrb_enc_registered("filesystem");
- if (idx < 0)
- idx = mrb_ascii8bit_encindex();
- return idx;
-}
-
-mrb_encoding *
-mrb_filesystem_encoding(mrb_state *mrb)
-{
- return mrb_enc_from_index(mrb, mrb_filesystem_encindex());
-}
-
-struct default_encoding {
- int index; /* -2 => not yet set, -1 => nil */
- mrb_encoding *enc;
-};
-
-static struct default_encoding default_external = {0};
-
-static int
-enc_set_default_encoding(mrb_state *mrb, struct default_encoding *def, mrb_value encoding, const char *name)
-{
- int overridden = FALSE;
-
- if (def->index != -2)
- /* Already set */
- overridden = TRUE;
-
- if (mrb_nil_p(encoding)) {
- def->index = -1;
- def->enc = 0;
- st_insert(enc_table.names, (st_data_t)strdup(name),
- (st_data_t)UNSPECIFIED_ENCODING);
- }
- else {
- def->index = mrb_enc_to_index(mrb_to_encoding(mrb, encoding));
- def->enc = 0;
- enc_alias_internal(name, def->index);
- }
-
- if (def == &default_external)
- enc_set_filesystem_encoding(mrb);
-
- return overridden;
-}
-
-mrb_encoding *
-mrb_default_external_encoding(mrb_state *mrb)
-{
- if (default_external.enc) return default_external.enc;
-
- if (default_external.index >= 0) {
- default_external.enc = mrb_enc_from_index(mrb, default_external.index);
- return default_external.enc;
- }
- else {
- return mrb_locale_encoding(mrb);
- }
-}
-
-mrb_value
-mrb_enc_default_external(mrb_state *mrb)
-{
- return mrb_enc_from_encoding(mrb, mrb_default_external_encoding(mrb));
-}
-
-/* 15.2.40.2.3 */
-/*
- * call-seq:
- * Encoding.default_external -> enc
- *
- * Returns default external encoding.
- *
- * It is initialized by the locale or -E option.
- */
-static mrb_value
-get_default_external(mrb_state *mrb, mrb_value klass)
-{
- return mrb_enc_default_external(mrb);
-}
-
-void
-mrb_enc_set_default_external(mrb_state *mrb, mrb_value encoding)
-{
- if (mrb_nil_p(encoding)) {
- mrb_raise(mrb, E_ARGUMENT_ERROR, "default external can not be nil");
- }
- enc_set_default_encoding(mrb, &default_external, encoding,
- "external");
-}
-
-/* 15.2.40.2.4 */
-/*
- * call-seq:
- * Encoding.default_external = enc
- *
- * Sets default external encoding.
- */
-static mrb_value
-set_default_external(mrb_state *mrb, mrb_value klass)
-{
- mrb_value encoding;
-
- mrb_get_args(mrb, "o", &encoding);
- mrb_warning("setting Encoding.default_external");
- mrb_enc_set_default_external(mrb, encoding);
- return encoding;
-}
-
-static struct default_encoding default_internal = {-2};
-
-mrb_encoding *
-mrb_default_internal_encoding(mrb_state *mrb)
-{
- if (!default_internal.enc && default_internal.index >= 0) {
- default_internal.enc = mrb_enc_from_index(mrb, default_internal.index);
- }
- return default_internal.enc; /* can be NULL */
-}
-
-mrb_value
-mrb_enc_default_internal(mrb_state *mrb)
-{
- /* Note: These functions cope with default_internal not being set */
- return mrb_enc_from_encoding(mrb, mrb_default_internal_encoding(mrb));
-}
-
-/* 15.2.40.2.5 */
-/*
- * call-seq:
- * Encoding.default_internal -> enc
- *
- * Returns default internal encoding.
- *
- * It is initialized by the source internal_encoding or -E option.
- */
-static mrb_value
-get_default_internal(mrb_state *mrb, mrb_value klass)
-{
- return mrb_enc_default_internal(mrb);
-}
-
-void
-mrb_enc_set_default_internal(mrb_state *mrb, mrb_value encoding)
-{
- enc_set_default_encoding(mrb, &default_internal, encoding,
- "internal");
-}
-
-/* 15.2.40.2.6 */
-/*
- * call-seq:
- * Encoding.default_internal = enc or nil
- *
- * Sets default internal encoding.
- * Or removes default internal encoding when passed nil.
- */
-static mrb_value
-set_default_internal(mrb_state *mrb, mrb_value klass)
-{
- mrb_value encoding;
-
- mrb_get_args(mrb, "o", &encoding);
- mrb_warning("setting Encoding.default_internal");
- mrb_enc_set_default_internal(mrb, encoding);
- return encoding;
-}
-
-#define digit(x) ((x) >= '0' && (x) <= '9')
-#ifndef _MSC_VER
-#define strstart(s, n) (strncasecmp(s, n, strlen(n)) == 0)
-#else
-#define strstart(s, n) (_stricmp(s, n) == 0)
-#endif
-#define C_CODESET "US-ASCII" /* Return this as the encoding of the
- * C/POSIX locale. Could as well one day
- * become "UTF-8". */
-#if defined _WIN32 || defined __CYGWIN__
-#define JA_CODESET "Windows-31J"
-#else
-#define JA_CODESET "EUC-JP"
-#endif
-
-static char buf[16];
-
-const char *
-nl_langinfo_codeset(void)
-{
- const char *l, *p;
- int n;
-
- if (((l = getenv("LC_ALL")) && *l) ||
- ((l = getenv("LC_CTYPE")) && *l) ||
- ((l = getenv("LANG")) && *l)) {
- /* check standardized locales */
- if (!strcmp(l, "C") || !strcmp(l, "POSIX"))
- return C_CODESET;
- /* check for encoding name fragment */
- p = strchr(l, '.');
- if (!p++) p = l;
- if (strstart(p, "UTF"))
- return "UTF-8";
- if ((n = 5, strstart(p, "8859-")) || (n = 9, strstart(p, "ISO-8859-"))) {
- if (digit(p[n])) {
- p += n;
- memcpy(buf, "ISO-8859-\0\0", 12);
- buf[9] = *p++;
- if (digit(*p)) buf[10] = *p++;
- return buf;
- }
- }
- if (strstart(p, "KOI8-R")) return "KOI8-R";
- if (strstart(p, "KOI8-U")) return "KOI8-U";
- if (strstart(p, "620")) return "TIS-620";
- if (strstart(p, "2312")) return "GB2312";
- if (strstart(p, "HKSCS")) return "Big5HKSCS"; /* no MIME charset */
- if (strstart(p, "BIG5")) return "Big5";
- if (strstart(p, "GBK")) return "GBK"; /* no MIME charset */
- if (strstart(p, "18030")) return "GB18030"; /* no MIME charset */
- if (strstart(p, "Shift_JIS") || strstart(p, "SJIS")) return "Windows-31J";
- /* check for conclusive modifier */
- if (strstart(p, "euro")) return "ISO-8859-15";
- /* check for language (and perhaps country) codes */
- if (strstart(l, "zh_TW")) return "Big5";
- if (strstart(l, "zh_HK")) return "Big5HKSCS"; /* no MIME charset */
- if (strstart(l, "zh")) return "GB2312";
- if (strstart(l, "ja")) return JA_CODESET;
- if (strstart(l, "ko")) return "EUC-KR";
- if (strstart(l, "ru")) return "KOI8-R";
- if (strstart(l, "uk")) return "KOI8-U";
- if (strstart(l, "pl") || strstart(l, "hr") ||
- strstart(l, "hu") || strstart(l, "cs") ||
- strstart(l, "sk") || strstart(l, "sl")) return "ISO-8859-2";
- if (strstart(l, "eo") || strstart(l, "mt")) return "ISO-8859-3";
- if (strstart(l, "el")) return "ISO-8859-7";
- if (strstart(l, "he")) return "ISO-8859-8";
- if (strstart(l, "tr")) return "ISO-8859-9";
- if (strstart(l, "th")) return "TIS-620"; /* or ISO-8859-11 */
- if (strstart(l, "lt")) return "ISO-8859-13";
- if (strstart(l, "cy")) return "ISO-8859-14";
- if (strstart(l, "ro")) return "ISO-8859-2"; /* or ISO-8859-16 */
- if (strstart(l, "am") || strstart(l, "vi")) return "UTF-8";
- /* Send me further rules if you like, but don't forget that we are
- * *only* interested in locale naming conventions on platforms
- * that do not already provide an nl_langinfo(CODESET) implementation. */
- }
- return NULL;
-}
-
-/* 15.2.40.2.9 */
-/*
- * call-seq:
- * Encoding.locale_charmap -> string
- *
- * Returns the locale charmap name.
- *
- * Debian GNU/Linux
- * LANG=C
- * Encoding.locale_charmap #=> "ANSI_X3.4-1968"
- * LANG=ja_JP.EUC-JP
- * Encoding.locale_charmap #=> "EUC-JP"
- *
- * SunOS 5
- * LANG=C
- * Encoding.locale_charmap #=> "646"
- * LANG=ja
- * Encoding.locale_charmap #=> "eucJP"
- *
- * The result is highly platform dependent.
- * So Encoding.find(Encoding.locale_charmap) may cause an error.
- * If you need some encoding object even for unknown locale,
- * Encoding.find("locale") can be used.
- *
- */
-mrb_value
-mrb_locale_charmap(mrb_state *mrb, mrb_value klass)
-{
-#if defined NO_LOCALE_CHARMAP
- return mrb_usascii_str_new2(mrb, "ASCII-8BIT");
-#elif defined _WIN32 || defined __CYGWIN__
- const char *nl_langinfo_codeset(void);
- const char *codeset = nl_langinfo_codeset();
- char cp[sizeof(int) * 3 + 4];
- if (!codeset) {
- //snprintf(cp, sizeof(cp), "CP%d", GetConsoleCP());
- codeset = cp;
- }
- return mrb_usascii_str_new2(mrb, codeset);
-#elif defined HAVE_LANGINFO_H
- char *codeset;
- codeset = nl_langinfo(CODESET);
- return mrb_usascii_str_new2(mrb, codeset);
-#else
- return mrb_nil_value();
-#endif
-}
-static void
-set_encoding_const(mrb_state *mrb, const char *name, mrb_encoding *enc)
-{
- mrb_value encoding = mrb_enc_from_encoding(mrb, enc);
- char *s = (char*)name;
- int haslower = 0, hasupper = 0, valid = 0;
-
- if (ISDIGIT(*s)) return;
- if (ISUPPER(*s)) {
- hasupper = 1;
- while (*++s && (ISALNUM(*s) || *s == '_')) {
- if (ISLOWER(*s)) haslower = 1;
- }
- }
- if (!*s) {
- if (s - name > ENCODING_NAMELEN_MAX) return;
- valid = 1;
- //mrb_define_const(mrb_cEncoding, name, encoding);
- mrb_define_const(mrb, ENCODE_CLASS, name, encoding);
- }
- if (!valid || haslower) {
- size_t len = s - name;
- if (len > ENCODING_NAMELEN_MAX) return;
- if (!haslower || !hasupper) {
- do {
- if (ISLOWER(*s)) haslower = 1;
- if (ISUPPER(*s)) hasupper = 1;
- } while (*++s && (!haslower || !hasupper));
- len = s - name;
- }
- len += strlen(s);
- if (len++ > ENCODING_NAMELEN_MAX) return;
- //MEMCPY(s = ALLOCA_N(char, len), name, char, len);
- memcpy(s = mrb_malloc(mrb, len), name, len);
- name = s;
- if (!valid) {
- if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
- for (; *s; ++s) {
- if (!ISALNUM(*s)) *s = '_';
- }
- if (hasupper) {
- mrb_define_const(mrb, ENCODE_CLASS, name, encoding);
- }
- }
- if (haslower) {
- for (s = (char*)name; *s; ++s) {
- if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
- }
- mrb_define_const(mrb, ENCODE_CLASS, name, encoding);
- }
- }
-}
-static enum st_retval
-mrb_enc_name_list_i(mrb_state *mrb, st_data_t name, st_data_t idx, mrb_value *arg)
-{
- mrb_value ary = *arg;
- mrb_value str = mrb_usascii_str_new2(mrb, (char*)name);
- //OBJ_FREEZE(str);
- mrb_ary_push(mrb, ary, str);
- return ST_CONTINUE;
-}
-
-/* 15.2.40.2.10 */
-/*
- * call-seq:
- * Encoding.name_list -> ["enc1", "enc2", ...]
- *
- * Returns the list of available encoding names.
- *
- * Encoding.name_list
- * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
- * "ISO-8859-1", "Shift_JIS", "EUC-JP",
- * "Windows-31J",
- * "BINARY", "CP932", "eucJP"]
- *
- */
-
-static mrb_value
-mrb_enc_name_list(mrb_state *mrb, mrb_value klass)
-{
- mrb_value ary = mrb_ary_new_capa(mrb, enc_table.names->num_entries);//mrb_ary_new2(enc_table.names->num_entries);
- st_foreachNew(mrb, enc_table.names, mrb_enc_name_list_i, &ary);
- return ary;
-}
-
-static enum st_retval
-mrb_enc_aliases_enc_i(mrb_state *mrb, st_data_t name, st_data_t orig, st_data_t arg)
-{
- mrb_value *p = (mrb_value*)arg;
- mrb_value aliases = p[0], ary = p[1];
- int idx = (int)orig;
- mrb_value key, str = mrb_ary_ref(mrb, ary, idx);//mrb_ary_entry(ary, idx);
-
- if (mrb_nil_p(str)) {
- mrb_encoding *enc = mrb_enc_from_index(mrb, idx);
-
- if (!enc) return ST_CONTINUE;
- if (STRCASECMP((char*)name, mrb_enc_name(enc)) == 0) {
- return ST_CONTINUE;
- }
- str = mrb_usascii_str_new2(mrb, mrb_enc_name(enc));
- OBJ_FREEZE(str);
- mrb_ary_set(mrb, ary, idx, str);//rb_ary_store(ary, idx, str);
- }
- key = mrb_usascii_str_new2(mrb, (char*)name);
- OBJ_FREEZE(key);
- mrb_hash_set(mrb, aliases, key, str);
- return ST_CONTINUE;
-}
-
-/* 15.2.40.2.1 */
-/*
- * call-seq:
- * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
- *
- * Returns the hash of available encoding alias and original encoding name.
- *
- * Encoding.aliases
- * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
- * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
- *
- */
-
-static mrb_value
-mrb_enc_aliases(mrb_state *mrb, mrb_value klass)
-{
- mrb_value aliases[2];
- aliases[0] = mrb_hash_new_capa(mrb, 0);
- aliases[1] = mrb_ary_new(mrb);
- st_foreachNew(mrb, enc_table.names, mrb_enc_aliases_enc_i, aliases);
- return aliases[0];
-}
-
-void
-mrb_init_encoding(mrb_state *mrb)
-{
-#undef mrb_intern
-#define mrb_intern(str) mrb_intern_const(str)
- mrb_value list;
- int i;
- struct RClass *s;
-
- s = mrb_define_class(mrb, "Encoding", mrb->object_class);
- //mrb_undef_alloc_func(mrb_cEncoding);
- //mrb_undef_method(CLASS_OF(mrb_cEncoding), "new");
- mrb_define_class_method(mrb, s, "aliases", mrb_enc_aliases, ARGS_NONE()); /* 15.2.40.2.1 */
- mrb_define_class_method(mrb, s, "compatible?", enc_compatible_p, ARGS_REQ(2)); /* 15.2.40.2.2 */
- mrb_define_class_method(mrb, s, "default_external", get_default_external, ARGS_NONE()); /* 15.2.40.2.3 */
- mrb_define_class_method(mrb, s, "default_external=", set_default_external, ARGS_REQ(1)); /* 15.2.40.2.4 */
- mrb_define_class_method(mrb, s, "default_internal", get_default_internal, ARGS_NONE()); /* 15.2.40.2.5 */
- mrb_define_class_method(mrb, s, "default_internal=", set_default_internal, ARGS_REQ(1)); /* 15.2.40.2.6 */
- mrb_define_class_method(mrb, s, "find", enc_find, ARGS_REQ(1)); /* 15.2.40.2.7 */
- mrb_define_class_method(mrb, s, "list", enc_list, ARGS_NONE()); /* 15.2.40.2.8 */
- mrb_define_class_method(mrb, s, "locale_charmap", mrb_locale_charmap, ARGS_NONE()); /* 15.2.40.2.9 */
- mrb_define_class_method(mrb, s, "name_list", mrb_enc_name_list, ARGS_NONE()); /* 15.2.40.2.10 */
- mrb_define_class_method(mrb, s, "_load", enc_load, ARGS_REQ(1)); /* 15.2.40.2.11 */
- mrb_define_method(mrb, s, "ascii_compatible?", enc_ascii_compatible_p, ARGS_NONE()); /* 15.2.40.2.12 */
- mrb_define_method(mrb, s, "dummy?", enc_dummy_p, ARGS_NONE()); /* 15.2.40.2.13 */
- mrb_define_method(mrb, s, "inspect", enc_inspect, ARGS_NONE()); /* 15.2.40.2.14 */
- mrb_define_method(mrb, s, "name", enc_name, ARGS_NONE()); /* 15.2.40.2.15 */
- mrb_define_method(mrb, s, "names", enc_names, ARGS_NONE()); /* 15.2.40.2.16 */
- mrb_define_method(mrb, s, "replicate", enc_replicate, ARGS_REQ(1)); /* 15.2.40.2.17 */
- mrb_define_method(mrb, s, "to_s", enc_name, ARGS_NONE()); /* 15.2.40.2.18 */
- mrb_define_method(mrb, s, "_dump", enc_dump, ARGS_ANY()); /* 15.2.40.2.19 */
-
-/* add kusuda --> */
- if (!enc_table.list) {
- mrb_enc_init(mrb);
- }
-/* add kusuda --< */
- list = mrb_ary_new_capa(mrb, enc_table.count);//mrb_ary_new2(enc_table.count);
- RBASIC(list)->c = 0;
- mrb_encoding_list = list;
- //mrb_gc_register_mark_object(list);
-
- for (i = 0; i < enc_table.count; ++i) {
- mrb_ary_push(mrb, list, enc_new(mrb, enc_table.list[i].enc));
- }
-}
-
-/* locale insensitive functions */
-
-#define ctype_test(c, ctype) \
- (mrb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
-
-int mrb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
-int mrb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
-int mrb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
-int mrb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
-int mrb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
-int mrb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
-int mrb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
-int mrb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
-int mrb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
-int mrb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
-int mrb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
-int mrb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
-
-int
-mrb_tolower(int c)
-{
- return mrb_isascii(c) ? ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) : c;
-}
-
-int
-mrb_toupper(int c)
-{
- return mrb_isascii(c) ? ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) : c;
-}
-#endif //INCLUDE_ENCODING
View
9 src/encoding.h
@@ -174,11 +174,7 @@ int mrb_enc_codelen(mrb_state *mrb, int code, mrb_encoding *enc);
#endif //INCLUDE_ENCODING
/* code,ptr,encoding -> write buf */
-#ifdef INCLUDE_ENCODING
-#define mrb_enc_mbcput(c,buf,enc) ONIGENC_CODE_TO_MBC(enc,c,(UChar*)(buf))
-#else
-#define mrb_enc_mbcput(c,buf,enc) *(buf) = (char)(c)
-#endif //INCLUDE_ENCODING
+#define mrb_enc_mbcput(c,buf,enc) ((*(buf) = (char)(c)),1)
/* start, ptr, end, encoding -> prev_char */
#define mrb_enc_prev_char(s,p,e,enc) (char *)onigenc_get_prev_char_head(enc,(UChar*)(s),(UChar*)(p),(UChar*)(e))
@@ -232,9 +228,6 @@ mrb_value mrb_enc_default_internal(mrb_state *mrb);
void mrb_enc_set_default_external(mrb_state *mrb, mrb_value encoding);
void mrb_enc_set_default_internal(mrb_state *mrb, mrb_value encoding);
mrb_value mrb_locale_charmap(mrb_state *mrb, mrb_value klass);
-#ifdef INCLUDE_ENCODING
-int mrb_memsearch(mrb_state *mrb, const void*,int,const void*,int,mrb_encoding*);
-#endif //INCLUDE_ENCODING
mrb_value mrb_usascii_str_new_cstr(mrb_state *mrb, const char *ptr);
int mrb_str_buf_cat_escaped_char(mrb_state *mrb, mrb_value result, unsigned int c, int unicode_p);
View
4 src/gc.c
@@ -266,7 +266,7 @@ mrb_obj_alloc(mrb_state *mrb, enum mrb_vtype ttype, struct RClass *cls)
mrb->live++;
if (mrb->arena_idx > MRB_ARENA_SIZE) {
/* arena overflow error */
- mrb->arena_idx = MRB_ARENA_SIZE - 2; /* force room in arena */
+ mrb->arena_idx = MRB_ARENA_SIZE - 4; /* force room in arena */
mrb_raise(mrb, mrb->eRuntimeError_class, "arena overflow error");
}
mrb->arena[mrb->arena_idx++] = p;
@@ -360,12 +360,14 @@ gc_mark_children(mrb_state *mrb, struct RBasic *obj)
case MRB_TT_STRING:
{
+#if 0
struct RString *s = (struct RString*)obj;
while (s->flags & MRB_STR_SHARED) {
s = s->aux.shared;
if (!s) break;
}
+#endif
}
break;
View
2  src/init.c
@@ -20,7 +20,6 @@ void mrb_init_proc(mrb_state*);
void mrb_init_range(mrb_state*);
void mrb_init_string(mrb_state*);
void mrb_init_regexp(mrb_state*);
-void mrb_init_encoding(mrb_state*);
void mrb_init_exception(mrb_state*);
void mrb_init_time(mrb_state*);
void mrb_init_io(mrb_state*);
@@ -54,7 +53,6 @@ mrb_init_core(mrb_state *mrb)
mrb_init_gc(mrb);
#ifdef INCLUDE_REGEXP
mrb_init_regexp(mrb);
- mrb_init_encoding(mrb);
#endif
mrb_init_exception(mrb);
mrb_init_print(mrb);
View
13 src/object.c
@@ -11,13 +11,6 @@
#include "mruby/class.h"
#include "mruby/numeric.h"
-#ifdef INCLUDE_REGEXP
- #define mrb_usascii_str_new2 mrb_usascii_str_new_cstr
-#else
- #define mrb_usascii_str_new2 mrb_str_new_cstr
- #define mrb_usascii_str_new mrb_str_new
-#endif
-
#ifndef FALSE
#define FALSE 0
#endif
@@ -106,7 +99,7 @@ mrb_true(mrb_state *mrb, mrb_value obj)
static mrb_value
nil_to_s(mrb_state *mrb, mrb_value obj)
{
- return mrb_usascii_str_new(mrb, 0, 0);
+ return mrb_str_new(mrb, 0, 0);
}
/***********************************************************************
@@ -166,7 +159,7 @@ true_xor(mrb_state *mrb, mrb_value obj)
static mrb_value
true_to_s(mrb_state *mrb, mrb_value obj)
{
- return mrb_usascii_str_new2(mrb, "true");
+ return mrb_str_new_cstr(mrb, "true");
}
/* 15.2.5.3.4 */
@@ -279,7 +272,7 @@ false_or(mrb_state *mrb, mrb_value obj)
static mrb_value
false_to_s(mrb_state *mrb, mrb_value obj)
{
- return mrb_usascii_str_new2(mrb, "false");
+ return mrb_str_new_cstr(mrb, "false");
}
void
View
792 src/re.c
@@ -7,16 +7,11 @@
#include "mruby.h"
#include <string.h>
#include "mruby/string.h"
-#include "mruby/khash.h"
#include "encoding.h"
#include "re.h"
-#include "mruby/numeric.h"
-#include "mruby/range.h"
#include "mruby/array.h"
#include "regint.h"
#include "mruby/class.h"
-#include "mruby/hash.h"
-#include "mruby/variable.h"
#include "error.h"
#ifdef INCLUDE_REGEXP
@@ -54,13 +49,10 @@ unsigned long ruby_scan_oct(const char*, size_t, size_t*);
unsigned long ruby_scan_hex(const char*, size_t, size_t*);
static mrb_value mrb_match_to_a(mrb_state *mrb, mrb_value match);
-static mrb_value mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, mrb_encoding *enc,
- mrb_encoding **fixed_enc, onig_errmsg_buffer err);
-static void mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len,
- mrb_encoding *enc, mrb_encoding *resenc);
+static mrb_value mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, onig_errmsg_buffer err);
+static void mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len);
static char * option_to_str(char str[4], int options);
-static mrb_value reg_cache;
//static int may_need_recompile;
//static int reg_kcode = DEFAULT_KCODE;
/* ------------------------------------------------------------------------- */
@@ -94,22 +86,20 @@ mrb_reg_s_new_instance(mrb_state *mrb, /*int argc, mrb_value *argv, */mrb_value
re->usecnt = 0;
return mrb_funcall_argv(mrb, mrb_obj_value(re), "initialize", argc, argv);
}
-//#define mrb_enc_mbcput(a,b,c) a
+
mrb_value
mrb_reg_quote(mrb_state *mrb, mrb_value str)
{
- mrb_encoding *enc = mrb_enc_get(mrb, str);
char *s, *send, *t;
mrb_value tmp;
- int c,clen;
- int ascii_only = mrb_enc_str_asciionly_p(mrb, str);
+ int c;
s = RSTRING_PTR(str);
send = s + RSTRING_LEN(str);
while (s < send) {
- c = mrb_enc_ascget(mrb, s, send, &clen, enc);
+ c = *s;
if (c == -1) {
- s += mbclen(s, send, enc);
+ s += send - s;
continue;
}
switch (c) {
@@ -121,38 +111,29 @@ mrb_reg_quote(mrb_state *mrb, mrb_value str)
case '\t': case '\f': case '\n': case '\r':
goto meta_found;
}
- s += clen;
+ s++;
}
//tmp = mrb_str_new3(str);
tmp = mrb_str_new(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
- if (ascii_only) {
- mrb_enc_associate(mrb, tmp, mrb_usascii_encoding(mrb));
- }
return tmp;
meta_found:
tmp = mrb_str_new(mrb, 0, RSTRING_LEN(str)*2);
- if (ascii_only) {
- mrb_enc_associate(mrb, tmp, mrb_usascii_encoding(mrb));
- }
- else {
- mrb_enc_copy(mrb, tmp, str);
- }
t = RSTRING_PTR(tmp);
/* copy upto metacharacter */
memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
t += s - RSTRING_PTR(str);
while (s < send) {
- c = mrb_enc_ascget(mrb, s, send, &clen, enc);
+ c = *s;
if (c == -1) {
- int n = mbclen(s, send, enc);
+ int n = send - s;
while (n--)
*t++ = *s++;
continue;
}
- s += clen;
+ s++;
switch (c) {
case '[': case ']': case '{': case '}':
case '(': case ')': case '|': case '-':
@@ -263,7 +244,7 @@ mrb_reg_nth_match(mrb_state *mrb, mrb_int nth, mrb_value match)
if (start == -1) return mrb_nil_value();
end = m->rmatch->regs.end[nth];
len = end - start;
- str = mrb_str_substr(mrb, mrb_obj_value(m->str), start, len);
+ str = mrb_str_subseq(mrb, mrb_obj_value(m->str), start, len);
return str;
}
@@ -379,75 +360,13 @@ mrb_reg_options(mrb_state *mrb, mrb_value re)
return options;
}
-static void
-reg_enc_error(mrb_state *mrb, mrb_value re, mrb_value str)
-{
- mrb_raise(mrb, E_ENCODING_ERROR,
- "incompatible encoding regexp match (%s regexp with %s string)",
- mrb_enc_name(mrb_enc_get(mrb, re)),
- mrb_enc_name(mrb_enc_get(mrb, str)));
-}
-
-static int
-mrb_reg_fixed_encoding_p(mrb_value re)
-{
- /*if (FL_TEST(re, KCODE_FIXED))
- return Qtrue;
- else */
- return 0/*Qfalse*/;
-}
-
-static mrb_encoding*
-mrb_reg_prepare_enc(mrb_state *mrb, mrb_value re, mrb_value str, int warn)
-{
- mrb_encoding *enc = 0;
-
- if (mrb_enc_str_coderange(mrb, str) == ENC_CODERANGE_BROKEN) {
- mrb_raise(mrb, E_ARGUMENT_ERROR,
- "invalid byte sequence in %s",
- mrb_enc_name(mrb_enc_get(mrb, str)));
- }
-
- mrb_reg_check(mrb, re);
- enc = mrb_enc_get(mrb, str);
- if (!mrb_enc_str_asciicompat_p(mrb, str)) {
- if (RREGEXP(re)->ptr->enc != enc) {
- reg_enc_error(mrb, re, str);
- }
- }
- else if (mrb_reg_fixed_encoding_p(re)) {
- if (RREGEXP(re)->ptr->enc != enc &&
- (!mrb_enc_asciicompat(mrb, RREGEXP(re)->ptr->enc) ||
- mrb_enc_str_coderange(mrb, str) != ENC_CODERANGE_7BIT)) {
- reg_enc_error(mrb, re, str);
- }
- enc = RREGEXP(re)->ptr->enc;
- }
- if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
- enc != mrb_ascii8bit_encoding(mrb) &&
- mrb_enc_str_coderange(mrb, str) != ENC_CODERANGE_7BIT) {
- mrb_warn("regexp match /.../n against to %s string",
- mrb_enc_name(enc));
- }
- return enc;
-}
-
static mrb_value
mrb_reg_desc(mrb_state *mrb, const char *s, long len, mrb_value re)
{
- mrb_encoding *enc = mrb_enc_get(mrb, re);
mrb_value str = mrb_str_new_cstr(mrb, "/");//mrb_str_buf_new2("/");
- mrb_encoding *resenc = mrb_default_internal_encoding(mrb);
- if (resenc == NULL) resenc = mrb_default_external_encoding(mrb);
- if (re.tt && mrb_enc_asciicompat(mrb, enc)) {
- mrb_enc_copy(mrb, str, re);
- }
- else {
- mrb_enc_associate(mrb, str, mrb_usascii_encoding(mrb));
- }
- mrb_reg_expr_str(mrb, str, s, len, enc, resenc);
- mrb_str_buf_cat(mrb, str, "/", strlen("/"));//mrb_str_buf_cat2(str, "/");
+ mrb_reg_expr_str(mrb, str, s, len);
+ mrb_str_buf_cat(mrb, str, "/", strlen("/"));
if (re.tt) {
char opts[4];
mrb_reg_check(mrb, re);
@@ -476,18 +395,14 @@ mrb_reg_prepare_re(mrb_state *mrb, mrb_value re, mrb_value str)
OnigErrorInfo einfo;
const char *pattern;
mrb_value unescaped;
- mrb_encoding *fixed_enc = 0;
- mrb_encoding *enc = mrb_reg_prepare_enc(mrb, re, str, 1);
-
- if (reg->enc == enc) return reg;
+ mrb_encoding *enc = mrb_ascii8bit_encoding(mrb);
mrb_reg_check(mrb, re);
reg = RREGEXP(re)->ptr;
pattern = RREGEXP_SRC_PTR(re);
unescaped = mrb_reg_preprocess(mrb,
- pattern, pattern + RREGEXP(re)->src->len, enc,
- &fixed_enc, err);
+ pattern, pattern + RREGEXP(re)->src->len, err);
if (mrb_nil_p(unescaped)) {
mrb_raise(mrb, E_ARGUMENT_ERROR, "regexp preprocess failed: %s", err);
@@ -675,18 +590,6 @@ ruby_scan_hex(const char *start, size_t len, size_t *retlen)
return retval;
}
-static int
-check_unicode_range(unsigned long code, onig_errmsg_buffer err)
-{
- if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
- 0x10ffff < code) {
- //errcpy(err, "invalid Unicode range");
- printf("invalid Unicode range");
- return -1;
- }
- return 0;
-}
-
#define BYTEWIDTH 8
int
@@ -735,59 +638,6 @@ mrb_uv_to_utf8(mrb_state *mrb, char buf[6], unsigned long uv)
return 0;
}
-static int
-append_utf8(mrb_state *mrb, unsigned long uv,
- mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err)
-{
- if (check_unicode_range(uv, err) != 0)
- return -1;
- if (uv < 0x80) {
- char escbuf[5];
- snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
- mrb_str_buf_cat(mrb, buf, escbuf, 4);
- }
- else {
- int len;
- char utf8buf[6];
- len = mrb_uv_to_utf8(mrb, utf8buf, uv);
- mrb_str_buf_cat(mrb, buf, utf8buf, len);
-
- if (*encp == 0)
- *encp = mrb_utf8_encoding(mrb);
- else if (*encp != mrb_utf8_encoding(mrb)) {
- //errcpy(err, "UTF-8 character in non UTF-8 regexp");
- printf("UTF-8 character in non UTF-8 regexp");
- return -1;
- }
- }
- return 0;
-}
-
-static int
-unescape_unicode_bmp(mrb_state *mrb, const char **pp, const char *end,
- mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err)
-{
- const char *p = *pp;
- size_t len;
- unsigned long code;
-
- if (end < p+4) {
- //errcpy(err, "invalid Unicode escape");
- printf("invalid Unicode escape");
- return -1;
- }
- code = ruby_scan_hex(p, 4, &len);
- if (len != 4) {
- //errcpy(err, "invalid Unicode escape");
- printf("invalid Unicode escape");
- return -1;
- }
- if (append_utf8(mrb, code, buf, encp, err) != 0)
- return -1;
- *pp = p + 4;
- return 0;
-}
-
unsigned long
ruby_scan_oct(const char *start, size_t len, size_t *retlen)
{
@@ -802,400 +652,29 @@ ruby_scan_oct(const char *start, size_t len, size_t *retlen)
return retval;
}
-static int
-read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
-{
- const char *p = *pp;
- int code;
- int meta_prefix = 0, ctrl_prefix = 0;
- size_t len;
-
- if (p == end || *p++ != '\\') {
- //errcpy(err, "too short escaped multibyte character");
- printf("too short escaped multibyte character");
- return -1;
- }
-
-again:
- if (p == end) {
- //errcpy(err, "too short escape sequence");
- printf("too short escape sequence");
- return -1;
- }
- switch (*p++) {
- case '\\': code = '\\'; break;
- case 'n': code = '\n'; break;
- case 't': code = '\t'; break;
- case 'r': code = '\r'; break;
- case 'f': code = '\f'; break;
- case 'v': code = '\013'; break;
- case 'a': code = '\007'; break;
- case 'e': code = '\033'; break;
-
- /* \OOO */
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- p--;
- code = scan_oct(p, end < p+3 ? end-p : 3, &len);
- p += len;
- break;
-
- case 'x': /* \xHH */
- code = scan_hex(p, end < p+2 ? end-p : 2, &len);
- if (len < 1) {
- //errcpy(err, "invalid hex escape");
- printf("invalid hex escape");
- return -1;
- }
- p += len;
- break;
-
- case 'M': /* \M-X, \M-\C-X, \M-\cX */
- if (meta_prefix) {
- //errcpy(err, "duplicate meta escape");
- printf("duplicate meta escape");
- return -1;
- }
- meta_prefix = 1;
- if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
- if (*p == '\\') {
- p++;
- goto again;
- }
- else {
- code = *p++;
- break;
- }
- }
- //errcpy(err, "too short meta escape");
- printf("too short meta escape");
- return -1;
-
- case 'C': /* \C-X, \C-\M-X */
- if (p == end || *p++ != '-') {
- //errcpy(err, "too short control escape");
- printf("too short control escape");
- return -1;
- }
- case 'c': /* \cX, \c\M-X */
- if (ctrl_prefix) {
- //errcpy(err, "duplicate control escape");
- printf("duplicate control escape");
- return -1;
- }
- ctrl_prefix = 1;
- if (p < end && (*p & 0x80) == 0) {
- if (*p == '\\') {
- p++;
- goto again;
- }
- else {
- code = *p++;
- break;
- }
- }
- //errcpy(err, "too short control escape");
- printf("too short control escape");
- return -1;
-
- default:
- //errcpy(err, "unexpected escape sequence");
- printf("unexpected escape sequence");
- return -1;
- }
- if (code < 0 || 0xff < code) {
- //errcpy(err, "invalid escape code");
- printf("invalid escape code");
- return -1;
- }
-
- if (ctrl_prefix)
- code &= 0x1f;
- if (meta_prefix)
- code |= 0x80;
-
- *pp = p;
- return code;
-}
-
-static int
-unescape_escaped_nonascii(mrb_state *mrb, const char **pp, const char *end, mrb_encoding *enc,
- mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err)
-{
- const char *p = *pp;
- int chmaxlen = mrb_enc_mbmaxlen(enc);
- //char *chbuf = ALLOCA_N(char, chmaxlen);
- char *chbuf = mrb_malloc(mrb, chmaxlen);
- int chlen = 0;
- int byte;
- int l;
-
- memset(chbuf, 0, chmaxlen);
-
- byte = read_escaped_byte(&p, end, err);
- if (byte == -1) {
- return -1;
- }
-
- chbuf[chlen++] = byte;
- while (chlen < chmaxlen &&
- MBCLEN_NEEDMORE_P(mrb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
- byte = read_escaped_byte(&p, end, err);
- if (byte == -1) {
- return -1;
- }
- chbuf[chlen++] = byte;
- }
-
- l = mrb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
- if (MBCLEN_INVALID_P(l)) {
- //errcpy(err, "invalid multibyte escape");
- printf("invalid multibyte escape");
- return -1;
- }
- if (1 < chlen || (chbuf[0] & 0x80)) {
- mrb_str_buf_cat(mrb, buf, chbuf, chlen);
-
- if (*encp == 0)
- *encp = enc;
- else if (*encp != enc) {
- //errcpy(err, "escaped non ASCII character in UTF-8 regexp");
- printf("escaped non ASCII character in UTF-8 regexp");
- return -1;
- }
- }
- else {
- char escbuf[5];
- snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
- mrb_str_buf_cat(mrb, buf, escbuf, 4);
- }
- *pp = p;
- return 0;
-}
-
-static int
-unescape_unicode_list(mrb_state *mrb, const char **pp, const char *end,
- mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err)
-{
- const char *p = *pp;
- int has_unicode = 0;
- unsigned long code;
- size_t len;
-
- while (p < end && ISSPACE(*p)) p++;
-
- while (1) {
- code = ruby_scan_hex(p, end-p, &len);
- if (len == 0)
- break;
- if (6 < len) { /* max 10FFFF */
- //errcpy(err, "invalid Unicode range");
- printf("invalid Unicode range");
- return -1;
- }
- p += len;
- if (append_utf8(mrb, code, buf, encp, err) != 0)
- return -1;
- has_unicode = 1;
-
- while (p < end && ISSPACE(*p)) p++;
- }
-
- if (has_unicode == 0) {
- //errcpy(err, "invalid Unicode list");
- printf("invalid Unicode list");
- return -1;
- }
-
- *pp = p;
-
- return 0;
-}
-
-static int
-unescape_nonascii(mrb_state *mrb, const char *p, const char *end, mrb_encoding *enc,
- mrb_value buf, mrb_encoding **encp, int *has_property,
- onig_errmsg_buffer err)
-{
- char c;
- char smallbuf[2];
-
- while (p < end) {
- int chlen = mrb_enc_precise_mbclen(p, end, enc);
- if (!MBCLEN_CHARFOUND_P(chlen)) {
- //errcpy(err, "invalid multibyte character");
- printf("invalid multibyte character");
- return -1;
- }
- chlen = MBCLEN_CHARFOUND_LEN(chlen);
- if (1 < chlen || (*p & 0x80)) {
- mrb_str_buf_cat(mrb, buf, p, chlen);
- p += chlen;
- if (*encp == 0)
- *encp = enc;
- else if (*encp != enc) {
- //errcpy(err, "non ASCII character in UTF-8 regexp");
- printf("non ASCII character in UTF-8 regexp");
- return -1;
- }
- continue;
- }
-
- switch (c = *p++) {
- case '\\':
- if (p == end) {
- //errcpy(err, "too short escape sequence");
- printf("too short escape sequence");
- return -1;
- }
- switch (c = *p++) {
- case '1': case '2': case '3':
- case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
- {
- size_t octlen;
- if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
- /* backref or 7bit octal.
- no need to unescape anyway.
- re-escaping may break backref */
- goto escape_asis;
- }
- }
- /* xxx: How about more than 199 subexpressions? */
-
- case '0': /* \0, \0O, \0OO */
-
- case 'x': /* \xHH */
- case 'c': /* \cX, \c\M-X */
- case 'C': /* \C-X, \C-\M-X */
- case 'M': /* \M-X, \M-\C-X, \M-\cX */
- p = p-2;
- if (unescape_escaped_nonascii(mrb, &p, end, enc, buf, encp, err) != 0)
- return -1;
- break;
-
- case 'u':
- if (p == end) {
- //errcpy(err, "too short escape sequence");
- printf("too short escape sequence");
- return -1;
- }
- if (*p == '{') {
- /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
- p++;
- if (unescape_unicode_list(mrb, &p, end, buf, encp, err) != 0)
- return -1;
- if (p == end || *p++ != '}') {
- //errcpy(err, "invalid Unicode list");
- printf("invalid Unicode list");
- return -1;
- }
- break;
- }
- else {
- /* \uHHHH */
- if (unescape_unicode_bmp(mrb, &p, end, buf, encp, err) != 0)
- return -1;
- break;
- }
-
- case 'p': /* \p{Hiragana} */
- case 'P':
- if (!*encp) {
- *has_property = 1;
- }
- goto escape_asis;
-