Permalink
Browse files

Copied in PDFParser from Tika (based on PDFBox) as of Tika trunk on 15th April 2012.
  • Loading branch information...
1 parent 1ea8d73 commit 2a7ac5d73b954ff4cd38caa371107cd93d3b5e47 @anjackson anjackson committed Apr 15, 2012
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package uk.bl.wap.tika.parser.pdf.pdfbox;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.util.PDFTextStripper;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.util.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to produce a stream of semi-structured XHTML SAX events instead of a
+ * plain text stream.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+    // TODO: remove once PDFBOX-1130 is fixed:
+    // Tracks whether a <p> element is currently open, so that paragraph
+    // start/end events sent to the handler always stay balanced.
+    private boolean inParagraph = false;
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler SAX content handler
+     * @param metadata PDF metadata
+     * @param extractAnnotationText whether the text of free-text annotations
+     *        is emitted at the end of each page
+     * @param enableAutoSpace if <code>true</code> a single space is used as
+     *        the word separator, otherwise the empty string
+     * @param suppressDuplicateOverlappingText value passed on to
+     *        {@link PDFTextStripper#setSuppressDuplicateOverlappingText(boolean)}
+     * @param sortByPosition value passed on to
+     *        {@link PDFTextStripper#setSortByPosition(boolean)}
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document can not be processed
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, Metadata metadata,
+            boolean extractAnnotationText, boolean enableAutoSpace,
+            boolean suppressDuplicateOverlappingText, boolean sortByPosition)
+            throws SAXException, TikaException {
+        try {
+            // Extract text using a dummy Writer as we override the
+            // key methods to output to the given content handler.
+            new PDF2XHTML(handler, metadata,
+                    extractAnnotationText, enableAutoSpace,
+                    suppressDuplicateOverlappingText, sortByPosition).writeText(document, new Writer() {
+                @Override
+                public void write(char[] cbuf, int off, int len) {
+                }
+                @Override
+                public void flush() {
+                }
+                @Override
+                public void close() {
+                }
+            });
+        } catch (IOException e) {
+            // The overridden callbacks below wrap SAXExceptions in
+            // IOExceptions; unwrap them again for the caller.
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+    }
+
+    private final XHTMLContentHandler handler;
+    private final boolean extractAnnotationText;
+
+    /**
+     * Creates a stripper that writes XHTML events to the given handler and
+     * configures the inherited {@link PDFTextStripper} options.
+     */
+    private PDF2XHTML(ContentHandler handler, Metadata metadata,
+            boolean extractAnnotationText, boolean enableAutoSpace,
+            boolean suppressDuplicateOverlappingText, boolean sortByPosition)
+            throws IOException {
+        this.handler = new XHTMLContentHandler(handler, metadata);
+        this.extractAnnotationText = extractAnnotationText;
+        setForceParsing(true);
+        setSortByPosition(sortByPosition);
+        if (enableAutoSpace) {
+            setWordSeparator(" ");
+        } else {
+            setWordSeparator("");
+        }
+        // TODO: maybe expose setting these too:
+        //setAverageCharTolerance(1.0f);
+        //setSpacingTolerance(1.0f);
+        setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
+    }
+
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a document", e);
+        }
+    }
+
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        }
+    }
+
+    /**
+     * Opens the <code>div class="page"</code> element for a page and
+     * starts its first paragraph.
+     */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            handler.startElement("div", "class", "page");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a page", e);
+        }
+        writeParagraphStart();
+    }
+
+    /**
+     * Closes the current paragraph, optionally emits the page's free-text
+     * annotations, and closes the page <code>div</code>.
+     */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            writeParagraphEnd();
+            // TODO: remove once PDFBOX-1143 is fixed:
+            if (extractAnnotationText) {
+                writeAnnotations(page);
+            }
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        }
+    }
+
+    /**
+     * Emits one <code>div class="annotation"</code> element for every
+     * free-text markup annotation of the page that has a title, subject
+     * or contents.
+     */
+    private void writeAnnotations(PDPage page)
+            throws IOException, SAXException {
+        for (Object o : page.getAnnotations()) {
+            // Check for the type we actually cast to, so that an annotation
+            // object of another class can never cause a ClassCastException.
+            if (!(o instanceof PDAnnotationMarkup)) {
+                continue;
+            }
+            PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
+            if (!PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(annot.getSubtype())) {
+                continue;
+            }
+
+            // It's a text annotation:
+            String title = annot.getTitlePopup();
+            // Read the subject entry; previously the title was read twice
+            // and emitted again as the subject.
+            String subject = annot.getSubject();
+            String contents = annot.getContents();
+            // TODO: maybe also annot.getRichContents()?
+            if (title != null || subject != null || contents != null) {
+                handler.startElement("div", "class", "annotation");
+                writeAnnotationPart("annotationTitle", title);
+                writeAnnotationPart("annotationSubject", subject);
+                writeAnnotationPart("annotationContents", contents);
+                handler.endElement("div");
+            }
+        }
+    }
+
+    /**
+     * Writes the given text inside a <code>div</code> with the given class,
+     * or nothing at all if the text is <code>null</code>.
+     */
+    private void writeAnnotationPart(String cssClass, String text)
+            throws SAXException {
+        if (text != null) {
+            handler.startElement("div", "class", cssClass);
+            handler.characters(text);
+            handler.endElement("div");
+        }
+    }
+
+    /**
+     * Starts a new <code>p</code> element, first closing the previous one
+     * if it is still open.
+     */
+    @Override
+    protected void writeParagraphStart() throws IOException {
+        // TODO: remove once PDFBOX-1130 is fixed
+        if (inParagraph) {
+            // Close last paragraph
+            writeParagraphEnd();
+        }
+        assert !inParagraph;
+        inParagraph = true;
+        try {
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a paragraph", e);
+        }
+    }
+
+    /**
+     * Ends the current <code>p</code> element; if none is open, an empty
+     * paragraph is started first so that the end event is balanced.
+     */
+    @Override
+    protected void writeParagraphEnd() throws IOException {
+        // TODO: remove once PDFBOX-1130 is fixed
+        if (!inParagraph) {
+            writeParagraphStart();
+        }
+        assert inParagraph;
+        inParagraph = false;
+        try {
+            handler.endElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a paragraph", e);
+        }
+    }
+
+    @Override
+    protected void writeString(String text) throws IOException {
+        try {
+            handler.characters(text);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a string: " + text, e);
+        }
+    }
+
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        try {
+            handler.characters(text.getCharacter());
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a character: " + text.getCharacter(), e);
+        }
+    }
+
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        try {
+            handler.characters(getWordSeparator());
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a space character", e);
+        }
+    }
+
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        try {
+            handler.characters("\n");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a newline character", e);
+        }
+    }
+
+}
Oops, something went wrong.

0 comments on commit 2a7ac5d

Please sign in to comment.