Skip to content

Commit

Permalink
making the super awesome sax parser go
Browse files Browse the repository at this point in the history
  • Loading branch information
tenderlove committed Sep 21, 2008
1 parent f2ffa56 commit 0200a46
Show file tree
Hide file tree
Showing 5 changed files with 260 additions and 58 deletions.
56 changes: 53 additions & 3 deletions Manifest.txt
Expand Up @@ -2,40 +2,90 @@ History.txt
Manifest.txt
README.txt
Rakefile
ext/nokogiri/Makefile
ext/nokogiri/conftest.dSYM/Contents/Info.plist
ext/nokogiri/conftest.dSYM/Contents/Resources/DWARF/conftest
ext/nokogiri/extconf.rb
ext/nokogiri/html_document.c
ext/nokogiri/html_document.h
ext/nokogiri/mkmf.log
ext/nokogiri/native.c
ext/nokogiri/native.h
ext/nokogiri/xml_document.c
ext/nokogiri/xml_document.h
ext/nokogiri/xml_node.c
ext/nokogiri/xml_node.h
ext/nokogiri/xml_node_set.c
ext/nokogiri/xml_node_set.h
ext/nokogiri/xml_sax_parser.c
ext/nokogiri/xml_sax_parser.h
ext/nokogiri/xml_text.c
ext/nokogiri/xml_text.h
ext/nokogiri/xml_xpath.c
ext/nokogiri/xml_xpath.h
ext/nokogiri/xslt_stylesheet.c
ext/nokogiri/xslt_stylesheet.h
lib/nokogiri.rb
lib/nokogiri/generated_interface.rb
lib/nokogiri/css.rb
lib/nokogiri/css/generated_tokenizer.rb
lib/nokogiri/css/node.rb
lib/nokogiri/css/parser.rb
lib/nokogiri/css/parser.y
lib/nokogiri/css/tokenizer.rb
lib/nokogiri/css/tokenizer.rex
lib/nokogiri/css/xpath_visitor.rb
lib/nokogiri/decorators.rb
lib/nokogiri/decorators/hpricot.rb
lib/nokogiri/decorators/hpricot/node.rb
lib/nokogiri/decorators/hpricot/node_set.rb
lib/nokogiri/decorators/hpricot/xpath_visitor.rb
lib/nokogiri/hpricot.rb
lib/nokogiri/html.rb
lib/nokogiri/html/builder.rb
lib/nokogiri/html/document.rb
lib/nokogiri/version.rb
lib/nokogiri/xml.rb
lib/nokogiri/xml/builder.rb
lib/nokogiri/xml/document.rb
lib/nokogiri/xml/node.rb
lib/nokogiri/xml/node_set.rb
lib/nokogiri/xml/sax.rb
lib/nokogiri/xml/sax/document.rb
lib/nokogiri/xml/sax/parser.rb
lib/nokogiri/xml/text.rb
lib/nokogiri/xml/text_node.rb
lib/nokogiri/xml/xpath.rb
lib/nokogiri/xslt.rb
lib/nokogiri/xslt/stylesheet.rb
nokogiri.gemspec
test/css/test_parser.rb
test/css/test_tokenizer.rb
test/files/staff.xml
test/files/staff.xslt
test/files/tlm.html
test/helper.rb
test/hpricot/files/basic.xhtml
test/hpricot/files/boingboing.html
test/hpricot/files/cy0.html
test/hpricot/files/immob.html
test/hpricot/files/pace_application.html
test/hpricot/files/tenderlove.html
test/hpricot/files/uswebgen.html
test/hpricot/files/utf8.html
test/hpricot/files/week9.html
test/hpricot/files/why.xml
test/hpricot/load_files.rb
test/hpricot/test_alter.rb
test/hpricot/test_builder.rb
test/hpricot/test_parser.rb
test/hpricot/test_paths.rb
test/hpricot/test_preserved.rb
test/hpricot/test_xml.rb
test/html/test_builder.rb
test/html/test_document.rb
test/test_convert_xpath.rb
test/test_nokogiri.rb
test/test_xslt_transforms.rb
test/xml/sax/test_parser.rb
test/xml/test_document.rb
test/xml/test_node.rb
test/xml/test_node_set.rb
test/xml/test_text.rb
129 changes: 77 additions & 52 deletions ext/nokogiri/xml_sax_parser.c
@@ -1,5 +1,11 @@
#include <xml_sax_parser.h>

/*
* call-seq:
* parse_memory(data)
*
* Parse the document stored in +data+
*/
static VALUE parse_memory(VALUE self, VALUE data)
{
xmlSAXHandlerPtr handler;
Expand All @@ -12,114 +18,132 @@ static VALUE parse_memory(VALUE self, VALUE data)
return data;
}

static void internal_subset( void * ctx,
const xmlChar *name,
const xmlChar *external_id,
const xmlChar *system_id )
/*
 * call-seq:
 *  native_parse_file(data)
 *
 * Feed the file whose path is held in +data+ to libxml2's SAX file parser,
 * using the handler struct wrapped by +self+. +self+ is passed along as the
 * user-data pointer, which the callbacks in this file cast back to a VALUE.
 * Returns +data+ unchanged.
 */
static VALUE native_parse_file(VALUE self, VALUE data)
{
  xmlSAXHandlerPtr sax_callbacks;
  const char *path;

  Data_Get_Struct(self, xmlSAXHandler, sax_callbacks);
  path = StringValuePtr(data);
  xmlSAXUserParseFile(sax_callbacks, (void *)self, path);

  return data;
}

static void start_document(void * ctx)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
rb_funcall(doc, rb_intern("internal_subset"), 3,
rb_str_new2((char *)name),
rb_str_new2((char *)external_id),
rb_str_new2((char *)system_id));
rb_funcall(doc, rb_intern("start_document"), 0);
}

/* Not using these yet...
static int is_standalone(void * ctx)
static void end_document(void * ctx)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
if(Qtrue == rb_funcall(doc, rb_intern("standalone?"), 0))
return 1;
return 0;
rb_funcall(doc, rb_intern("end_document"), 0);
}

static int has_internal_subset(void * ctx)
static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
if(Qtrue == rb_funcall(doc, rb_intern("internal_subset?"), 0))
return 1;
VALUE attributes = rb_ary_new();
const xmlChar * attr;
int i = 0;
if(atts) {
while((attr = atts[i]) != NULL) {
rb_funcall(attributes, rb_intern("<<"), 1, rb_str_new2((const char *)attr));
i++;
}
}

return 0;
rb_funcall( doc,
rb_intern("start_element"),
2,
rb_str_new2((const char *)name),
attributes
);
}

static int has_external_subset(void * ctx)
static void end_element(void * ctx, const xmlChar *name)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
if(Qtrue == rb_funcall(doc, rb_intern("external_subset?"), 0))
return 1;
rb_funcall(doc, rb_intern("end_element"), 1, rb_str_new2((const char *)name));
}

return 0;
static void characters_func(void * ctx, const xmlChar * ch, int len)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
VALUE str = rb_str_new((const char *)ch, (long)len);
rb_funcall(doc, rb_intern("characters"), 1, str);
}
*/

static void start_document(void * ctx)
static void comment_func(void * ctx, const xmlChar * value)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
rb_funcall(doc, rb_intern("start_document"), 0);
VALUE str = rb_str_new2((const char *)value);
rb_funcall(doc, rb_intern("comment"), 1, str);
}

static void end_document(void * ctx)
static void warning_func(void * ctx, const char *msg, ...)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
rb_funcall(doc, rb_intern("end_document"), 0);
char * message;

va_list args;
va_start(args, msg);
vasprintf(&message, msg, args);
va_end(args);

rb_funcall(doc, rb_intern("warning"), 1, rb_str_new2(message));
free(message);
}

static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
static void error_func(void * ctx, const char *msg, ...)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
VALUE attributes = rb_ary_new();
xmlChar * attr;
int i = 0;
if(atts) {
while(attr = atts[i]) {
rb_funcall(attributes, rb_intern("<<"), 1, rb_str_new2((char *)attr));
i++;
}
}
char * message;

rb_funcall( doc,
rb_intern("start_element"),
2,
rb_str_new2((char *)name),
attributes
);
va_list args;
va_start(args, msg);
vasprintf(&message, msg, args);
va_end(args);

rb_funcall(doc, rb_intern("error"), 1, rb_str_new2(message));
free(message);
}

static void end_element(void * ctx, const xmlChar *name)
static void cdata_block(void * ctx, const xmlChar * value, int len)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
rb_funcall(doc, rb_intern("end_element"), 1, rb_str_new2((char *)name));
VALUE string = rb_str_new((const char *)value, (long)len);
rb_funcall(doc, rb_intern("cdata_block"), 1, string);
}

/*
 * Release the xmlSAXHandler struct owned by a parser object. allocate()
 * passes this function to Data_Wrap_Struct as the free callback, and the
 * struct it receives was obtained there from calloc(), so free() is the
 * matching release.
 */
static void deallocate(xmlSAXHandlerPtr handler)
{
/* FIXME */
free(handler);
}

/*
 * Allocation function for the SAX parser class (registered with
 * rb_define_alloc_func in init_xml_sax_parser). Creates a zero-filled
 * xmlSAXHandler, points the callback slots used by this extension at the
 * static functions in this file, and wraps the struct in an object of
 * +klass+ with no mark function and deallocate() as the free function.
 * Slots that are not assigned here stay NULL from calloc().
 *
 * NOTE(review): the calloc() result is written through without a NULL check.
 */
static VALUE allocate(VALUE klass)
{
xmlSAXHandlerPtr handler = calloc(1, sizeof(xmlSAXHandler));

/* NOTE(review): no complete internal_subset definition is visible in this
 * listing -- confirm this assignment is still wanted. */
handler->internalSubset = internal_subset;
/*
handler->isStandalone = is_standalone;
handler->hasInternalSubset = has_internal_subset;
handler->hasExternalSubset = has_external_subset;
*/
/* Document lifecycle and element events. */
handler->startDocument = start_document;
handler->endDocument = end_document;
handler->startElement = start_element;
handler->endElement = end_element;
/* Text content, comments, diagnostics and CDATA sections. */
handler->characters = characters_func;
handler->comment = comment_func;
handler->warning = warning_func;
handler->error = error_func;
handler->cdataBlock = cdata_block;

return Data_Wrap_Struct(klass, NULL, deallocate, handler);
}
Expand All @@ -131,4 +155,5 @@ void init_xml_sax_parser()
rb_const_get(mNokogiriXmlSax, rb_intern("Parser"));
rb_define_alloc_func(klass, allocate);
rb_define_method(klass, "parse_memory", parse_memory, 1);
rb_define_private_method(klass, "native_parse_file", native_parse_file, 1);
}
43 changes: 40 additions & 3 deletions lib/nokogiri/xml/sax/document.rb
Expand Up @@ -2,20 +2,57 @@ module Nokogiri
module XML
module SAX
###
# Default SAX event handler. Every callback is an empty method that returns
# nil; subclass and override the events you care about.
class Document
  ###
  # Hook taking a subset's +name+, +external_id+ and +system_id+.
  def internal_subset(name, external_id, system_id); end

  ###
  # Invoked once when parsing of a document begins.
  def start_document; end

  ###
  # Invoked once when parsing of a document finishes.
  def end_document; end

  ###
  # Invoked when an element opens.
  # +name+ is the tag name and +attrs+ holds its attributes
  # (defaults to an empty array).
  def start_element(name, attrs = []); end

  ###
  # Invoked when an element closes.
  # +name+ is the tag name.
  def end_element(name); end

  ###
  # Invoked with character data found inside an element.
  # +string+ is the text that was read.
  def characters(string); end

  ###
  # Invoked for each comment.
  # +string+ is the comment's content.
  def comment(string); end

  ###
  # Invoked when the parser reports a warning.
  # +string+ is the warning message.
  def warning(string); end

  ###
  # Invoked when the parser reports an error.
  # +string+ is the error message.
  def error(string); end

  ###
  # Invoked for each CDATA block.
  # +string+ is the block's content.
  def cdata_block(string); end
end
end
end
Expand Down
21 changes: 21 additions & 0 deletions lib/nokogiri/xml/sax/parser.rb
Expand Up @@ -6,6 +6,27 @@ class Parser
###
# Create a parser that keeps +doc+ in @document; a new SAX::Document is used
# when no argument is given.
# NOTE(review): the native callbacks call +document+ on the parser to find
# their handler -- presumably an accessor for @document defined outside this
# view; confirm.
def initialize(doc = SAX::Document.new)
@document = doc
end

###
# Parse given +thing+ which may be a string containing xml, or an
# IO object.
def parse(thing)
  # Accept anything that can be read, not only IO subclasses: StringIO and
  # Tempfile, for example, are not IO instances, so an is_a?(IO) check would
  # pass them to parse_memory as if they were strings. Strings do not respond
  # to #read and are forwarded untouched.
  xml = thing.respond_to?(:read) ? thing.read : thing
  parse_memory(xml)
end

###
# Parse given +io+
def parse_io(io)
  # Read the whole stream into memory, then parse it as a string.
  contents = io.read
  parse_memory(contents)
end

###
# Parse a file with +filename+
def parse_file(filename)
  # Check the path in Ruby before the native parser is called.
  # File.exist? replaces File.exists?, which is deprecated and no longer
  # available in current Ruby releases. The path is included in the
  # exception so the message says which file was rejected.
  #
  # Raises Errno::ENOENT when +filename+ does not exist and Errno::EISDIR
  # when it names a directory.
  path = filename.to_s
  raise Errno::ENOENT, path unless File.exist?(filename)
  raise Errno::EISDIR, path if File.directory?(filename)
  native_parse_file filename
end
end
end
end
Expand Down

0 comments on commit 0200a46

Please sign in to comment.