From 18c7d5b9a203fc909109310d053d6f742bc9d69a Mon Sep 17 00:00:00 2001 From: modesty Date: Fri, 23 May 2025 16:32:46 -0700 Subject: [PATCH 1/2] maint: remove dependency on @xmldom/xmldom --- lib/ptixmlinject.js | 46 ++++++--- lib/simpleXmlParser.js | 190 ++++++++++++++++++++++++++++++++++++ package.json | 4 +- rollup.config.js | 1 - rollup/bundle-pdfjs-base.js | 2 +- 5 files changed, 226 insertions(+), 17 deletions(-) create mode 100644 lib/simpleXmlParser.js diff --git a/lib/ptixmlinject.js b/lib/ptixmlinject.js index 2034586f..13669bbf 100644 --- a/lib/ptixmlinject.js +++ b/lib/ptixmlinject.js @@ -1,29 +1,43 @@ import fs from "fs"; -import { DOMParser } from "@xmldom/xmldom"; +import { DOMParser } from "./simpleXmlParser.js"; +/** + * XML Parser for PTI format + * @class + */ export default class PTIXmlParser { + /** @type {string|null} */ xmlData = null; + /** @type {Array} */ ptiPageArray = []; - // constructor + /** + * Create a new PTIXmlParser + */ constructor() { this.xmlData = null; this.ptiPageArray = []; } + /** + * Parse an XML file + * @param {string} filePath - The path to the XML file + * @param {Function} callback - The callback function + */ parseXml(filePath, callback) { fs.readFile(filePath, 'utf8', (err, data) => { if (err) { callback(err); } else { + /** @type {string} */ this.xmlData = data; var parser = new DOMParser(); var dom = parser.parseFromString(this.xmlData); var root = dom.documentElement; - var xmlFields = root.getElementsByTagName("field"); + var xmlFields = root ? root.getElementsByTagName("field") : []; var fields = []; for (var i = 0; i < xmlFields.length; i++) { @@ -37,31 +51,34 @@ export default class PTIXmlParser { var fontName = xmlFields[i].getAttribute('fontName'); var fontSize = xmlFields[i].getAttribute('fontSize'); + /** @type {Record} */ var item = {}; - var rectLeft = parseInt(xPos) - 21; //was 23.5 - var rectTop = parseInt(yPos) - 20;//was 23 - var rectRight = parseInt(rectLeft) + parseInt(width) - 4; - var rectBottom = parseInt(rectTop) + parseInt(height) - 4; + var rectLeft = parseInt(xPos || '0') - 21; //was 23.5 + var rectTop = parseInt(yPos || '0') - 20;//was 23 + var rectRight = parseInt(String(rectLeft)) + parseInt(width || '0') - 4; + var rectBottom = parseInt(String(rectTop)) + parseInt(height || '0') - 4; item.fieldType = "Tx"; if (type === "Boolean") { item.fieldType="Btn"; } else if (type === "SSN" || type === "Phone" || type === "zip") { - item.TName = type.toLowerCase(); + item.TName = type ? type.toLowerCase() : ''; } item.alternativeText = ""; - item.fullName = id; - item.fontSize = fontSize; - item.fontName = fontName; + item.fullName = id || ''; + item.fontSize = fontSize || ''; + item.fontName = fontName || ''; item.subtype = "Widget"; item.rect = [rectLeft, rectTop, rectRight, rectBottom]; fields.push(item); - this.ptiPageArray[parseInt(page)]=fields; + if (page) { + this.ptiPageArray[parseInt(page)] = fields; + } } } @@ -69,6 +86,11 @@ export default class PTIXmlParser { }); } + /** + * Get fields for a specific page + * @param {number} pageNum - The page number + * @returns {Array|undefined} The fields for the page + */ getFields(pageNum) { return this.ptiPageArray[pageNum]; } diff --git a/lib/simpleXmlParser.js b/lib/simpleXmlParser.js new file mode 100644 index 00000000..09114d30 --- /dev/null +++ b/lib/simpleXmlParser.js @@ -0,0 +1,190 @@ +// A simple XML parser to replace @xmldom/xmldom dependency +// This implements just enough functionality to support the existing code + +/** + * A simple XML Element implementation + * @class + */ +class Element { + /** + * Create a new Element + * @param {string} nodeName - The name of the node/tag + */ + constructor(nodeName) { + /** @type {string} */ + this.nodeName = nodeName; + /** @type {Array} */ + this.childNodes = []; + /** @type {Object.} */ + this.attributes = {}; + /** @type {string} */ + this.textContent = ""; + } + + /** + * Get attribute value by name + * @param {string} name - The attribute name + * @returns {string|null} The attribute value or null + */ + getAttribute(name) { + return this.attributes[name] || null; + } + + /** + * Get elements by tag name + * @param {string} tagName - The tag name to search for + * @returns {Array} The matching elements + */ + getElementsByTagName(tagName) { + /** @type {Array} */ + let results = []; + + // Check if this element matches + if (this.nodeName === tagName) { + results.push(this); + } + + // Check child elements recursively + for (const child of this.childNodes) { + if (child instanceof Element) { + if (tagName === "*" || child.nodeName === tagName) { + results.push(child); + } + + // Add matching descendants + const childMatches = child.getElementsByTagName(tagName); + results = results.concat(childMatches); + } + } + + return results; + } +} + +/** + * A simple XML Document implementation + * @class + */ +class Document { + constructor() { + /** @type {Element|null} */ + this.documentElement = null; + } +} + +/** + * A minimal DOMParser implementation that supports the basic features needed + * @class + */ +class SimpleDOMParser { + /** + * Parse XML string into a Document + * @param {string} xmlString - The XML string to parse + * @returns {Document} The parsed document + */ + parseFromString(xmlString) { + const doc = new Document(); + + // Remove XML declaration if present + xmlString = xmlString.replace(/<\?xml[^?]*\?>/, "").trim(); + + // Parse the document + doc.documentElement = this.parseElement(xmlString); + + return doc; + } + + /** + * Parse an XML element + * @param {string} xmlString - The XML string to parse + * @returns {Element|null} The parsed element or null + */ + parseElement(xmlString) { + // Regular expressions for parsing XML + const startTagRegex = /<([^\s/>]+)([^>]*)>/; + const attributeRegex = /([^\s=]+)=(?:"([^"]*)"|'([^']*)')/g; + + // Find the start tag + const startMatch = xmlString.match(startTagRegex); + if (!startMatch) { + return null; + } + + const tagName = startMatch[1]; + const attributeString = startMatch[2]; + + // Create the element + const element = new Element(tagName); + + // Parse attributes + let attributeMatch; + while ((attributeMatch = attributeRegex.exec(attributeString)) !== null) { + const attrName = attributeMatch[1]; + const attrValue = attributeMatch[2] || attributeMatch[3]; // Use whichever capture group matched + element.attributes[attrName] = attrValue; + } + + // Find the content between start and end tags + const startTagEnd = startMatch[0].length; + const endTagSearch = new RegExp(``); + const endMatch = xmlString.slice(startTagEnd).search(endTagSearch); + + if (endMatch === -1) { + // Self-closing or malformed tag + return element; + } + + const contentString = xmlString.slice(startTagEnd, startTagEnd + endMatch); + + // Parse child elements + let remainingContent = contentString.trim(); + while (remainingContent.length > 0) { + // Check if there's a child element + if (remainingContent.startsWith("<") && !remainingContent.startsWith("`); + const childEndIndex = remainingContent.search(childEndTagSearch); + + if (childEndIndex !== -1) { + // Extract the complete child element string (including its end tag) + const childEndTagLength = childTagName.length + 3; // "" + const childXmlString = remainingContent.slice(0, childEndIndex + childEndTagLength); + + // Parse the child element and add it to parent + const childElement = this.parseElement(childXmlString); + if (childElement) { + element.childNodes.push(childElement); + } + + // Remove the processed child from remaining content + remainingContent = remainingContent.slice(childXmlString.length).trim(); + continue; + } + } + } + + // Handle text content + const nextTagIndex = remainingContent.indexOf("<"); + if (nextTagIndex === -1) { + // The rest is all text + element.textContent += remainingContent.trim(); + break; + } else if (nextTagIndex > 0) { + // There's some text before the next tag + element.textContent += remainingContent.slice(0, nextTagIndex).trim(); + remainingContent = remainingContent.slice(nextTagIndex).trim(); + } else { + // Can't parse further, just break + break; + } + } + + return element; + } +} + +// Export DOMParser as a class +export { SimpleDOMParser as DOMParser }; diff --git a/package.json b/package.json index b8cc0423..b469077f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdf2json", - "version": "3.1.5", + "version": "3.1.6", "description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js", "keywords": [ "pdf", @@ -68,10 +68,8 @@ "pdf2json": "./bin/pdf2json.js" }, "dependencies": { - "@xmldom/xmldom": "^0.9.6" }, "bundleDependencies": [ - "@xmldom/xmldom" ], "devDependencies": { "@rollup/plugin-commonjs": "^28.0.2", diff --git a/rollup.config.js b/rollup.config.js index 12897812..aed05be5 100644 --- a/rollup.config.js +++ b/rollup.config.js @@ -18,7 +18,6 @@ const external = [ "url", "buffer", "stream", - "@xmldom/xmldom", ]; export default [ diff --git a/rollup/bundle-pdfjs-base.js b/rollup/bundle-pdfjs-base.js index f6e983dd..8053d20b 100644 --- a/rollup/bundle-pdfjs-base.js +++ b/rollup/bundle-pdfjs-base.js @@ -64,7 +64,7 @@ const _baseCode = _pdfjsFiles.reduce( fs.writeFileSync(path.join(__dirname, "../lib/pdfjs-code.js"), ` - ${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from '@xmldom/xmldom';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"} + ${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from './simpleXmlParser.js';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"} ${"export const PDFJS = {};"} ${"const globalScope = { console };"} ${_baseCode} From a2fcd7f9c3689233592554ca7869c019b764843c Mon Sep 17 00:00:00 2001 From: modesty Date: Fri, 23 May 2025 16:49:05 -0700 Subject: [PATCH 2/2] doc: update readme for zero dependency update --- readme.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/readme.md b/readme.md index 3af884ff..3647504b 100644 --- a/readme.md +++ b/readme.md @@ -8,15 +8,16 @@ ![GitHub top language](https://img.shields.io/github/languages/top/modesty/pdf2json) ![GitHub last commit](https://img.shields.io/github/last-commit/modesty/pdf2json?color=red) -pdf2json is a [node.js](http://nodejs.org/) module converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use. +pdf2json is a [node.js](http://nodejs.org/) module that converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use. ## Features - **PDF text extraction**: extracts textual content of PDF documents into structured JSON. - **Form element handling**: parses interactive form fields within PDFs for flexible data capture. - **Server-side and command-line versatility**: Integrate with web services for remote PDF processing or use as a standalone command-line tool for local file conversion. -- **Swift Performance**: fast performance with minimal depdendencies +- **Swift Performance**: fast performance with zero dependencies (since v3.1.6) - **Community driven**: decade+ long community driven development ensures continuous improvement. +- **Zero dependencies**: completely dependency-free since v3.1.6, only pure JavaScript code. ## Install