diff --git a/resources/etc/configuration.xml b/resources/etc/configuration.xml index e6517934..1b48a355 100644 --- a/resources/etc/configuration.xml +++ b/resources/etc/configuration.xml @@ -163,6 +163,9 @@ + + diff --git a/resources/etc/extension-library.xml b/resources/etc/extension-library.xml index d7ff69c4..767be300 100644 --- a/resources/etc/extension-library.xml +++ b/resources/etc/extension-library.xml @@ -57,6 +57,12 @@ + + + + + + diff --git a/src/com/xmlcalabash/extensions/RDFa.java b/src/com/xmlcalabash/extensions/RDFa.java new file mode 100644 index 00000000..c162d910 --- /dev/null +++ b/src/com/xmlcalabash/extensions/RDFa.java @@ -0,0 +1,261 @@ +package com.xmlcalabash.extensions; + +import com.xmlcalabash.core.XProcConstants; +import com.xmlcalabash.core.XProcException; +import com.xmlcalabash.core.XProcRuntime; +import com.xmlcalabash.io.ReadablePipe; +import com.xmlcalabash.io.WritablePipe; +import com.xmlcalabash.library.DefaultStep; +import com.xmlcalabash.runtime.XAtomicStep; +import com.xmlcalabash.util.S9apiUtils; +import com.xmlcalabash.util.TreeWriter; +import net.sf.saxon.s9api.Axis; +import net.sf.saxon.s9api.QName; +import net.sf.saxon.s9api.SaxonApiException; +import net.sf.saxon.s9api.Serializer; +import net.sf.saxon.s9api.XdmNode; +import net.sf.saxon.s9api.XdmSequenceIterator; +import org.semarglproject.rdf.ParseException; +import org.semarglproject.rdf.rdfa.RdfaParser; +import org.semarglproject.sink.TripleSink; +import org.semarglproject.source.StreamProcessor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.StringWriter; +import java.util.Calendar; +import java.util.Random; +import java.util.Vector; + +/** + * Created by IntelliJ IDEA. + * User: ndw + * Date: Oct 8, 2008 + * Time: 7:44:07 AM + * To change this template use File | Settings | File Templates. + */ + +public class RDFa extends DefaultStep { + private static final QName sem_triples = new QName("sem","http://marklogic.com/semantics", "triples"); + private static final QName sem_triple = new QName("sem","http://marklogic.com/semantics", "triple"); + private static final QName sem_subject = new QName("sem","http://marklogic.com/semantics", "subject"); + private static final QName sem_predicate = new QName("sem","http://marklogic.com/semantics", "predicate"); + private static final QName sem_object = new QName("sem","http://marklogic.com/semantics", "object"); + private static final QName _datatype = new QName("", "datatype"); + private static final QName _max_triples = new QName("", "max-triples-per-document"); + private ReadablePipe source = null; + private WritablePipe result = null; + private long limit = 100; + private long count = 0; + + /** + * Creates a new instance of Identity + */ + public RDFa(XProcRuntime runtime, XAtomicStep step) { + super(runtime,step); + } + + public void setInput(String port, ReadablePipe pipe) { + source = pipe; + } + + public void setOutput(String port, WritablePipe pipe) { + result = pipe; + } + + public void reset() { + source.resetReader(); + result.resetWriter(); + } + + public void run() throws SaxonApiException { + super.run(); + + String limitStr = getOption(_max_triples).getString(); + try { + limit = Integer.parseInt(limitStr); + } catch (NumberFormatException nfe) { + throw XProcException.dynamicError(19, "The max-triples-per-document on cx:rdf-a must be an integer"); + } + + XdmNode doc = source.read(); + + try { + Sink sink = new Sink(); + StreamProcessor sp = new StreamProcessor(RdfaParser.connect(sink)); + + // HACK!!! + // FIXME: set serializer properties appropriately! + Serializer serializer = makeSerializer(); + StringWriter writer = new StringWriter(); + serializer.setOutputWriter(writer); + S9apiUtils.serialize(runtime, doc, serializer); + writer.close(); + + ByteArrayInputStream bais = new ByteArrayInputStream(writer.toString().getBytes("UTF-8")); + sp.process(bais, doc.getBaseURI().toASCIIString()); + } catch (IOException e) { + throw new XProcException(e); + } catch (ParseException e) { + throw new XProcException(e); + } + } + + private class Sink implements TripleSink { + TreeWriter tree = null; + String baseURI = null; + long randomValue = 0; + long milliSecs = 0; + + public Sink() { + Random random = new Random(); + randomValue = random.nextLong(); + Calendar cal = Calendar.getInstance(); + milliSecs = cal.getTimeInMillis(); + } + + @Override + public void addNonLiteral(String subj, String pred, String obj) { + /* + tree.addStartElement(sem_triple); + tree.startContent(); + tree.addStartElement(sem_subject); + tree.startContent(); + tree.addText(patchURI(subj)); + tree.addEndElement(); + tree.addStartElement(sem_predicate); + tree.startContent(); + tree.addText(patchURI(pred)); + tree.addEndElement(); + tree.addStartElement(sem_object); + tree.startContent(); + tree.addText(patchURI(obj)); + tree.addEndElement(); + tree.addEndElement(); + */ + nextFile(); + } + + @Override + public void addPlainLiteral(String subj, String pred, String obj, String lang) { + tree.addStartElement(sem_triple); + tree.startContent(); + tree.addStartElement(sem_subject); + tree.startContent(); + tree.addText(patchURI(subj)); + tree.addEndElement(); + tree.addStartElement(sem_predicate); + tree.startContent(); + tree.addText(patchURI(pred)); + tree.addEndElement(); + tree.addStartElement(sem_object); + + if (lang == null || "".equals(lang)) { + tree.addAttribute(_datatype, "http://www.w3.org/2001/XMLSchema#string"); + } else { + tree.addAttribute(XProcConstants.xml_lang,lang); + } + + tree.startContent(); + tree.addText(obj); + tree.addEndElement(); + tree.addEndElement(); + nextFile(); + } + + @Override + public void addTypedLiteral(String subj, String pred, String obj, String datatype) { + if (datatype == null) { + datatype = "http://www.w3.org/2001/XMLSchema#string"; + } + tree.addStartElement(sem_triple); + tree.startContent(); + tree.addStartElement(sem_subject); + tree.startContent(); + tree.addText(patchURI(subj)); + tree.addEndElement(); + tree.addStartElement(sem_predicate); + tree.startContent(); + tree.addText(patchURI(pred)); + tree.addEndElement(); + tree.addStartElement(sem_object); + tree.addAttribute(_datatype, datatype); + tree.startContent(); + tree.addText(obj); + tree.addEndElement(); + tree.addEndElement(); + nextFile(); + } + + @Override + public void setBaseUri(String s) { + baseURI = s; + } + + @Override + public void startStream() throws ParseException { + tree = new TreeWriter(runtime); + tree.startDocument(step.getNode().getBaseURI()); + tree.addStartElement(sem_triples); + tree.startContent(); + } + + @Override + public void endStream() throws ParseException { + tree.addEndElement(); + tree.endDocument(); + if (count > 0) { + XdmNode out = tree.getResult(); + result.write(out); + } + } + + @Override + public boolean setProperty(String key, Object value) { + return false; + } + + private void nextFile() { + count += 1; + if (count >= limit) { + tree.addEndElement(); + tree.endDocument(); + + XdmNode out = tree.getResult(); + result.write(out); + + tree = new TreeWriter(runtime); + tree.startDocument(step.getNode().getBaseURI()); + tree.addStartElement(sem_triples); + tree.startContent(); + + count = 0; + } + } + + private String patchURI(String uri) { + if (uri.startsWith("_:")) { + return "http://marklogic.com/semantics/blank/" + + Long.toHexString(fuse(scramble(milliSecs),randomValue)) + + "/" + uri; + } else { + return uri; + } + } + + private long rotl(long x, long y) + { + return (x<>(64-y)); + } + + private long fuse(long a, long b) + { + return rotl(a,8)^b; + } + + private long scramble(long x) + { + return x^rotl(x,20)^rotl(x,40); + } + } +} \ No newline at end of file