Skip to content

Commit

Permalink
Merge branch 'html-sax-push-parser'
Browse files Browse the repository at this point in the history
  • Loading branch information
flavorjones committed Jan 21, 2015
2 parents 6bc27c4 + 0a47fdc commit f990e6d
Show file tree
Hide file tree
Showing 8 changed files with 366 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.ja.rdoc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* `XML::Comment.new` argument types are now consistent and safe (and documented) across MRI and JRuby. (#1224)
* (MRI) Restoring support for Ruby 1.9.2 that was broken in v1.6.4.1 and v1.6.5. (#1207)
* Check if `zlib` is available before building `libxml2`. (#1188)
* (JRuby) HtmlSaxPushParser now exists. (#1147) (Thanks, Piotr Szmielew!)


==== 互換性
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.rdoc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
* `XML::Comment.new` argument types are now consistent and safe (and documented) across MRI and JRuby. (#1224)
* (MRI) Restoring support for Ruby 1.9.2 that was broken in v1.6.4.1 and v1.6.5. (#1207)
* Check if `zlib` is available before building `libxml2`. (#1188)
* (JRuby) HtmlSaxPushParser now exists. (#1147) (Thanks, Piotr Szmielew!)


==== Compatibility Note
Expand Down
1 change: 1 addition & 0 deletions Manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ ext/java/nokogiri/HtmlDocument.java
ext/java/nokogiri/HtmlElementDescription.java
ext/java/nokogiri/HtmlEntityLookup.java
ext/java/nokogiri/HtmlSaxParserContext.java
ext/java/nokogiri/HtmlSaxPushParser.java
ext/java/nokogiri/NokogiriService.java
ext/java/nokogiri/XmlAttr.java
ext/java/nokogiri/XmlAttributeDecl.java
Expand Down
2 changes: 1 addition & 1 deletion ext/java/nokogiri/HtmlSaxParserContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ public static IRubyObject parse_io(ThreadContext context,
/**
* Create a new parser context that will read from a raw input
* stream. Not a JRuby method. Meant to be run in a separate
* thread by XmlSaxPushParser.
* thread by HtmlSaxPushParser.
*/
public static IRubyObject parse_stream(ThreadContext context,
IRubyObject klazz,
Expand Down
244 changes: 244 additions & 0 deletions ext/java/nokogiri/HtmlSaxPushParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
/**
* (The MIT License)
*
* Copyright (c) 2008 - 2012:
*
* * {Aaron Patterson}[http://tenderlovemaking.com]
* * {Mike Dalessio}[http://mike.daless.io]
* * {Charles Nutter}[http://blog.headius.com]
* * {Sergio Arbeo}[http://www.serabe.com]
* * {Patrick Mahoney}[http://polycrystal.org]
* * {Yoko Harada}[http://yokolet.blogspot.com]
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* 'Software'), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package nokogiri;

import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static org.jruby.javasupport.util.RuntimeHelpers.invoke;
import static nokogiri.internals.NokogiriHelpers.rubyStringToString;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.EnumSet;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadFactory;

import nokogiri.internals.ClosedStreamException;
import nokogiri.internals.NokogiriBlockingQueueInputStream;
import nokogiri.internals.ParserContext;

import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyException;
import org.jruby.RubyFixnum;
import org.jruby.RubyObject;
import org.jruby.RubyString;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;

/**
 * Class for Nokogiri::HTML::SAX::PushParser.
 *
 * Chunks handed to {@link #native_write} are queued on a
 * {@link NokogiriBlockingQueueInputStream}; a single background thread
 * (named "HtmlSaxPushParser") runs an {@link HtmlSaxParserContext} that
 * reads from that stream and drives the SAX handler.
 *
 * @author Piotr Szmielew <p.szmielew@ava.waw.pl> - based on Nokogiri::XML::SAX::PushParser
 */
@JRubyClass(name="Nokogiri::HTML::SAX::PushParser")
public class HtmlSaxPushParser extends RubyObject {
    ParserContext.Options options;
    IRubyObject optionsRuby;
    IRubyObject saxParser;
    NokogiriBlockingQueueInputStream stream;
    ParserTask parserTask = null;
    FutureTask<HtmlSaxParserContext> futureTask = null;
    ExecutorService executor = null;

    public HtmlSaxPushParser(Ruby ruby, RubyClass rubyClass) {
        super(ruby, rubyClass);
    }

    /**
     * Stops the background parser, if one is running. Passing a null
     * context makes {@link #terminateTask} swallow shutdown errors.
     */
    @Override
    public void finalize() {
        terminateTask(null);
    }

    /**
     * Stores the SAX parser and creates default parse options.
     * The provided file name and encoding are silently ignored.
     *
     * @param context   current thread context
     * @param saxParser the Ruby SAX parser whose handler receives events
     * @param fileName  unused
     * @param encoding  unused
     * @return this parser
     */
    @JRubyMethod
    public IRubyObject initialize_native(final ThreadContext context,
                                         IRubyObject saxParser,
                                         IRubyObject fileName,
                                         IRubyObject encoding) {
        optionsRuby
            = invoke(context, context.getRuntime().getClassFromPath("Nokogiri::XML::ParseOptions"), "new");

        options = new ParserContext.Options(0);
        this.saxParser = saxParser;
        return this;
    }

    /**
     * Returns an integer.
     */
    @JRubyMethod(name="options")
    public IRubyObject getOptions(ThreadContext context) {
        return invoke(context, optionsRuby, "options");
    }

    /**
     * <code>val</code> is an integer.
     */
    @JRubyMethod(name="options=")
    public IRubyObject setOptions(ThreadContext context, IRubyObject val) {
        invoke(context, optionsRuby, "options=", val);
        options =
            new ParserContext.Options(val.convertToInteger().getLongValue());
        return getOptions(context);
    }

    /**
     * Feeds one chunk to the background parser, starting it on first use.
     *
     * @param context current thread context
     * @param chunk   a String (or object responding to <code>to_str</code>);
     *                anything else terminates the parser and raises
     *                Nokogiri::HTML::SyntaxError
     * @param isLast  when true, <code>end_document</code> is invoked on the
     *                Ruby-side document and the parser is terminated
     * @return this parser
     */
    @JRubyMethod
    public IRubyObject native_write(ThreadContext context, IRubyObject chunk,
                                    IRubyObject isLast) {
        try {
            initialize_task(context);
        } catch (IOException e) {
            throw context.getRuntime().newRuntimeError(e.getMessage());
        }
        byte[] data = null;
        if (chunk instanceof RubyString || chunk.respondsTo("to_str")) {
            data = chunk.convertToString().getBytes();
        } else {
            terminateTask(context);
            XmlSyntaxError xmlSyntaxError =
                (XmlSyntaxError) NokogiriService.XML_SYNTAXERROR_ALLOCATOR.allocate(context.getRuntime(), getNokogiriClass(context.getRuntime(), "Nokogiri::HTML::SyntaxError"));
            throw new RaiseException(xmlSyntaxError);
        }

        // Snapshot so that only errors caused by this call are reported below.
        int errorCount0 = parserTask.getErrorCount();

        if (isLast.isTrue()) {
            // NOTE(review): the bytes of a chunk flagged as last are not
            // pushed to the stream; only end_document is fired. Confirm that
            // callers always finish with an empty chunk.
            IRubyObject document = invoke(context, this, "document");
            invoke(context, document, "end_document");
            terminateTask(context);
        } else {
            try {
                Future<Void> task = stream.addChunk(new ByteArrayInputStream(data));
                task.get();
            } catch (ClosedStreamException ex) {
                // this means the stream is closed, ignore this exception
            } catch (Exception e) {
                throw context.getRuntime().newRuntimeError(e.getMessage());
            }
        }

        // parserTask is deliberately kept after termination so the error
        // state can still be inspected here.
        if (!options.recover && parserTask.getErrorCount() > errorCount0) {
            // No-op when the last-chunk branch above already terminated.
            terminateTask(context);
            throw new RaiseException(parserTask.getLastError(), true);
        }

        return this;
    }

    /**
     * Lazily creates the chunk stream, the parser task and the single
     * daemon thread that runs it. Does nothing while a task is active.
     */
    private void initialize_task(ThreadContext context) throws IOException {
        if (futureTask == null || stream == null) {
            stream = new NokogiriBlockingQueueInputStream();

            parserTask = new ParserTask(context, saxParser);
            futureTask = new FutureTask<HtmlSaxParserContext>(parserTask);
            executor = Executors.newSingleThreadExecutor(new ThreadFactory() {
                @Override
                public Thread newThread(Runnable r) {
                    Thread t = new Thread(r);
                    t.setName("HtmlSaxPushParser");
                    t.setDaemon(true);
                    return t;
                }
            });
            executor.submit(futureTask);
        }
    }

    /**
     * Sends the END marker to the stream, waits for it to be consumed and
     * shuts the worker down. Safe to call when no task is running (never
     * started, or already terminated), in which case it does nothing.
     *
     * @param context used to raise a Ruby RuntimeError if shutdown fails;
     *                when null, such failures are swallowed
     */
    private synchronized void terminateTask(ThreadContext context) {
        if (stream == null || futureTask == null || executor == null) {
            return;
        }
        try {
            Future<Void> task = stream.addChunk(NokogiriBlockingQueueInputStream.END);
            task.get();
        } catch (ClosedStreamException ex) {
            // ignore this exception, it means the stream was closed
        } catch (Exception e) {
            if (context != null) {
                throw context.getRuntime().newRuntimeError(e.getMessage());
            }
        } finally {
            // Always release the worker, even when the error is re-raised.
            futureTask.cancel(true);
            executor.shutdown();
            executor = null;
            stream = null;
            futureTask = null;
        }
    }

    /**
     * Runs the HTML SAX parser against the chunk stream that was current
     * when the task was created.
     */
    private class ParserTask implements Callable<HtmlSaxParserContext> {
        private final ThreadContext context;
        private final IRubyObject handler;
        private final HtmlSaxParserContext parser;
        // Captured at construction: the outer field is nulled (and may later
        // be replaced) by terminateTask while this task is still finishing.
        private final NokogiriBlockingQueueInputStream taskStream;

        private ParserTask(ThreadContext context, IRubyObject handler) {
            RubyClass klazz = getNokogiriClass(context.getRuntime(), "Nokogiri::HTML::SAX::ParserContext");
            this.context = context;
            this.handler = handler;
            this.taskStream = stream;
            this.parser = (HtmlSaxParserContext) HtmlSaxParserContext.parse_stream(context, klazz, taskStream);
        }

        @Override
        public HtmlSaxParserContext call() throws Exception {
            try {
                parser.parse_with(context, handler);
            } finally {
                // we have to close the stream before exiting, otherwise someone
                // can add a chunk and block on task.get() forever.
                taskStream.close();
            }
            return parser;
        }

        private synchronized int getErrorCount() {
            // check for null because thread may not have started yet
            if (parser.getNokogiriHandler() == null) {
                return 0;
            }
            return parser.getNokogiriHandler().getErrorCount();
        }

        private synchronized RubyException getLastError() {
            return (RubyException) parser.getNokogiriHandler().getLastError();
        }
    }
}
9 changes: 9 additions & 0 deletions ext/java/nokogiri/NokogiriService.java
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,9 @@ private void createSaxModule(Ruby ruby, RubyModule xmlSaxModule, RubyModule html
RubyClass xmlSaxPushParser = xmlSaxModule.defineClassUnder("PushParser", ruby.getObject(), XML_SAXPUSHPARSER_ALLOCATOR);
xmlSaxPushParser.defineAnnotatedMethods(XmlSaxPushParser.class);

RubyClass htmlSaxPushParser = htmlSaxModule.defineClassUnder("PushParser", ruby.getObject(), HTML_SAXPUSHPARSER_ALLOCATOR);
htmlSaxPushParser.defineAnnotatedMethods(HtmlSaxPushParser.class);

RubyClass htmlSaxParserContext = htmlSaxModule.defineClassUnder("ParserContext", xmlSaxParserContext, HTML_SAXPARSER_CONTEXT_ALLOCATOR);
htmlSaxParserContext.defineAnnotatedMethods(HtmlSaxParserContext.class);
}
Expand Down Expand Up @@ -536,6 +539,12 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) {
return new XmlSaxPushParser(runtime, klazz);
}
};

/** Allocates {@link HtmlSaxPushParser} instances for the given Ruby class. */
private static final ObjectAllocator HTML_SAXPUSHPARSER_ALLOCATOR = new ObjectAllocator() {
    public IRubyObject allocate(Ruby runtime, RubyClass klazz) {
        return new HtmlSaxPushParser(runtime, klazz);
    }
};

public static final ObjectAllocator XML_SCHEMA_ALLOCATOR = new ObjectAllocator() {
private XmlSchema xmlSchema = null;
Expand Down
24 changes: 22 additions & 2 deletions lib/nokogiri/html/sax/push_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,33 @@ module Nokogiri
module HTML
module SAX
class PushParser
def initialize(doc = XML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')

# The Nokogiri::HTML::SAX::Document on which the PushParser will be
# operating
attr_accessor :document

def initialize(doc = HTML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
@document = doc
@encoding = encoding
@sax_parser = HTML::SAX::Parser.new(doc, @encoding)

## Create our push parser context
initialize_native(@sax_parser, file_name, @encoding)
initialize_native(@sax_parser, file_name, encoding)
end

###
# Push a +chunk+ of HTML into the PushParser. Any callback methods
# that can be called will be called immediately. Pass a true
# +last_chunk+ to mark the chunk as the final one.
def write(chunk, last_chunk = false)
  native_write(chunk, last_chunk)
end
alias_method :<<, :write

###
# Finish the parsing by writing an empty final chunk. Calling this is
# only necessary for Nokogiri::HTML::SAX::Document#end_document to be
# called.
def finish
  write('', true)
end
end
end
Expand Down
Loading

0 comments on commit f990e6d

Please sign in to comment.