Permalink
Browse files

release 1.3 ready after adding encoding support

  • Loading branch information...
Peter Ohler
Peter Ohler committed Sep 19, 2011
1 parent 6ee56fa commit 264e303935b0f45bc926e819c2821ff83e734734
Showing with 103 additions and 24 deletions.
  1. +15 −8 README.md
  2. +0 −2 ext/ox/ox.c
  3. +0 −1 ext/ox/ox.h
  4. +35 −7 ext/ox/sax.c
  5. +1 −1 notes
  6. +52 −5 test/sax_test.rb
View
@@ -20,19 +20,19 @@ A fast XML parser and Object marshaller as a Ruby gem.
## <a name="release">Release Notes</a>
-### Release 1.2.15
+### Release 1.3.0
- - added support for JRuby
- - added support for RBX
+ - fixed Mutex dump bug
+ - added SAX parser, 30+ times faster than Nokogiri and 10+ times faster than LibXML
## <a name="description">Description</a>
Optimized XML (Ox), as the name implies, was written to provide speed-optimized
XML handling. It was designed to be an alternative to Nokogiri in generic XML
parsing and as an alternative to Marshal for Object serialization.
-Nokogiri relies on libXml while Ox is self contained. Ox uses nothing other
-than standard C libraries so version issues with libXml are not an issue.
+Unlike Nokogiri, Ox is self-contained. Ox uses nothing other than standard C
+libraries, so version issues with libXml do not arise.
Marshal uses a binary format for serializing Objects. That binary format
changes with releases making Marshal dumped Object incompatible between some
@@ -46,9 +46,9 @@ It is possible to write an XML serialization gem with Nokogiri but writing
such a package in Ruby results in a module significantly slower than
Marshal. This is what triggered the start of Ox development.
-Ox handles XML documents in two ways. It is a generic XML parser and writer as
-well as a fast Object / XML marshaller. Ox was written for speed as a
-replacement for Nokogiri and for Marshal.
+Ox handles XML documents in three ways. It is a generic XML parser and writer,
+a fast Object / XML marshaller, and a stream SAX parser. Ox was written for
+speed as a replacement for Nokogiri, Ruby LibXML, and Marshal.
As an XML parser it is 2 or more times faster than Nokogiri and as a generic
XML writer it is as much as 20 times faster than Nokogiri. Of course different
@@ -57,6 +57,13 @@ files may result in slightly different times.
As an Object serializer Ox is up to 6 times faster than the standard Ruby
Marshal.dump() and up to 3 times faster than Marshal.load().
+The SAX-like stream parser is over 30 times faster than Nokogiri and more than
+10 times faster than LibXML when using a trivial Ruby-side set of
+callbacks. Unlike Nokogiri and LibXML, Ox can be tuned to use only the SAX
+callbacks that are of interest to the caller. (See the perf_sax.rb file for an
+example.)
+
+Ox is compatible with Ruby 1.8.7, 1.9.2, JRuby, and RBX.
### Object Dump Sample:
View
@@ -109,7 +109,6 @@ VALUE time_class;
Cache symbol_cache = 0;
Cache class_cache = 0;
Cache attr_cache = 0;
-Cache str_cache = 0;
static struct _Options default_options = {
{ '\0' }, // encoding
@@ -683,7 +682,6 @@ void Init_ox() {
ox_cache_new(&symbol_cache);
ox_cache_new(&class_cache);
ox_cache_new(&attr_cache);
- ox_cache_new(&str_cache);
rb_define_module_function(Ox, "cache_test", cache_test, 0);
rb_define_module_function(Ox, "cache8_test", cache8_test, 0);
View
@@ -250,7 +250,6 @@ extern VALUE ox_cdata_clas;
extern Cache symbol_cache;
extern Cache class_cache;
extern Cache attr_cache;
-extern Cache str_cache;
#if defined(__cplusplus)
#if 0
View
@@ -63,6 +63,7 @@ typedef struct _SaxDrive {
int has_start_element;
int has_end_element;
int has_error;
+ rb_encoding *encoding;
} *SaxDrive;
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io);
@@ -203,6 +204,7 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io) {
dr->has_start_element = rb_respond_to(handler, start_element_id);
dr->has_end_element = rb_respond_to(handler, end_element_id);
dr->has_error = rb_respond_to(handler, error_id);
+ dr->encoding = 0;
}
static void
@@ -451,6 +453,11 @@ read_cdata(SaxDrive dr) {
VALUE args[1];
args[0] = rb_str_new2(dr->str);
+#ifdef HAVE_RUBY_ENCODING_H
+ if (0 != dr->encoding) {
+ rb_enc_associate(args[0], dr->encoding);
+ }
+#endif
rb_funcall2(dr->handler, cdata_id, 1, args);
}
dr->str = 0;
@@ -490,6 +497,11 @@ read_comment(SaxDrive dr) {
VALUE args[1];
args[0] = rb_str_new2(dr->str);
+#ifdef HAVE_RUBY_ENCODING_H
+ if (0 != dr->encoding) {
+ rb_enc_associate(args[0], dr->encoding);
+ }
+#endif
rb_funcall2(dr->handler, comment_id, 1, args);
}
dr->str = 0;
@@ -502,7 +514,6 @@ read_comment(SaxDrive dr) {
*/
static int
read_element(SaxDrive dr) {
- char start_name[1024];
VALUE name = Qnil;
VALUE attrs = Qnil;
char c;
@@ -511,8 +522,6 @@ read_element(SaxDrive dr) {
if ('\0' == (c = read_name_token(dr))) {
return -1;
}
- strcpy(start_name, dr->str);
- //name = rb_str_new2(dr->str);
name = str2sym(dr->str);
if ('/' == c) {
closed = 1;
@@ -545,8 +554,7 @@ read_element(SaxDrive dr) {
if (0 != read_children(dr, 0)) {
return -1;
}
- //if (0 != strcmp(dr->str, rb_id2name(SYM2ID(name)))) {
- if (0 != strcmp(dr->str, start_name)) {
+ if (0 != strcmp(dr->str, rb_id2name(SYM2ID(name)))) {
sax_drive_error(dr, "invalid format, element start and end names do not match", 1);
return -1;
}
@@ -578,6 +586,11 @@ read_text(SaxDrive dr) {
VALUE args[1];
args[0] = rb_str_new2(dr->str);
+#ifdef HAVE_RUBY_ENCODING_H
+ if (0 != dr->encoding) {
+ rb_enc_associate(args[0], dr->encoding);
+ }
+#endif
rb_funcall2(dr->handler, text_id, 1, args);
}
return 0;
@@ -586,6 +599,7 @@ read_text(SaxDrive dr) {
static int
read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2) {
VALUE name = Qnil;
+ int is_encoding = 0;
dr->str = dr->cur; // lock it down
if (is_white(c)) {
@@ -600,9 +614,11 @@ read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2) {
if ('\0' == (c = read_name_token(dr))) {
return -1;
}
+ if ('?' == termc && 0 == strcmp("encoding", dr->str)) {
+ is_encoding = 1;
+ }
if (dr->has_instruct) {
name = str2sym(dr->str);
- //name = rb_str_new2(dr->str);
}
if (is_white(c)) {
c = next_non_white(dr);
@@ -614,11 +630,23 @@ read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2) {
if (0 != read_quoted_value(dr)) {
return -1;
}
+#ifdef HAVE_RUBY_ENCODING_H
+ if (is_encoding) {
+ dr->encoding = rb_enc_find(dr->str);
+ }
+#endif
if (dr->has_instruct) {
+ VALUE rstr = rb_str_new2(dr->str);
+
if (Qnil == *attrs) {
*attrs = rb_hash_new();
}
- rb_hash_aset(*attrs, name, rb_str_new2(dr->str));
+#ifdef HAVE_RUBY_ENCODING_H
+ if (0 != dr->encoding) {
+ rb_enc_associate(rstr, dr->encoding);
+ }
+#endif
+ rb_hash_aset(*attrs, name, rstr);
}
c = next_non_white(dr);
}
View
2 notes
@@ -6,7 +6,7 @@
- changes for 1.3.0
- fixed Mutex dump bug
- - added SAX parser, 20+ times faster than Nokogiri and 6+ times faster than LibXML
+ - added SAX parser, 30+ times faster than Nokogiri and 10+ times faster than LibXML
- docs
- added SAX info to docs with example
View
@@ -324,11 +324,58 @@ def test_sax_cdata_no_term
[:error, "invalid format, cdata terminated unexpectedly", 5, 1]])
end
- # TBD mix of elements, text, and attributes - tight and loose
- # TBD read invalid xml with recoverable errors (elements out of order, multiple top elements)
- # TBD read invalid xml (missing
-
- # TBD test encoding
+
+ def test_sax_mixed
+ parse_compare(%{<?xml version="1.0"?>
+<?ox version="1.0" mode="object" circular="no" xsd_date="no"?>
+<!DOCTYPE table PUBLIC "-//ox//DTD TABLE 1.0//EN" "http://www.ohler.com/DTDs/TestTable-1.0.dtd">
+<table>
+ <row id="00004">
+ <cell id="A" type="Fixnum">1234</cell>
+ <cell id="B" type="String">A string.</cell>
+ <cell id="C" type="String">This is a longer string that stretches over a larger number of characters.</cell>
+ <cell id="D" type="Float">-12.345</cell>
+ <cell id="E" type="Date">2011-09-18 23:07:26 +0900</cell>
+ <cell id="F" type="Image"><![CDATA[xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00]]></cell>
+ </row>
+</table>
+},
+ [[:instruct, 'xml', {:version => '1.0'}],
+ [:instruct, "ox", {:version=>"1.0", :mode=>"object", :circular=>"no", :xsd_date=>"no"}],
+ [:doctype, " table PUBLIC \"-//ox//DTD TABLE 1.0//EN\" \"http://www.ohler.com/DTDs/TestTable-1.0.dtd\""],
+ [:start_element, :table, nil],
+ [:start_element, :row, {:id=>"00004"}],
+ [:start_element, :cell, {:id=>"A", :type=>"Fixnum"}],
+ [:text, "1234"],
+ [:end_element, :cell],
+ [:start_element, :cell, {:id=>"B", :type=>"String"}],
+ [:text, "A string."],
+ [:end_element, :cell],
+ [:start_element, :cell, {:id=>"C", :type=>"String"}],
+ [:text, "This is a longer string that stretches over a larger number of characters."],
+ [:end_element, :cell],
+ [:start_element, :cell, {:id=>"D", :type=>"Float"}],
+ [:text, "-12.345"],
+ [:end_element, :cell],
+ [:start_element, :cell, {:id=>"E", :type=>"Date"}],
+ [:text, "2011-09-18 23:07:26 +0900"],
+ [:end_element, :cell],
+ [:start_element, :cell, {:id=>"F", :type=>"Image"}],
+ [:cdata, "xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00xx00"],
+ [:end_element, :cell],
+ [:end_element, :row],
+ [:end_element, :table]])
+ end
+
+ def test_sax_encoding
+ parse_compare(%{<?xml version="1.0" encoding="UTF-8"?>
+<top>ピーター</top>
+},
+ [[:instruct, 'xml', {:version => '1.0', :encoding => 'UTF-8'}],
+ [:start_element, :top, nil],
+ [:text, 'ピーター'],
+ [:end_element, :top]])
+ end
end

0 comments on commit 264e303

Please sign in to comment.