Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 34 additions & 12 deletions lib/ptixmlinject.js
Original file line number Diff line number Diff line change
@@ -1,29 +1,43 @@
import fs from "fs";
import { DOMParser } from "@xmldom/xmldom";
import { DOMParser } from "./simpleXmlParser.js";

/**
* XML Parser for PTI format
* @class
*/
export default class PTIXmlParser {
/** @type {string|null} */
xmlData = null;
/** @type {Array<any>} */
ptiPageArray = [];

// constructor
/**
* Create a new PTIXmlParser
*/
constructor() {
this.xmlData = null;
this.ptiPageArray = [];
}

/**
* Parse an XML file
* @param {string} filePath - The path to the XML file
* @param {Function} callback - The callback function
*/
parseXml(filePath, callback) {
fs.readFile(filePath, 'utf8', (err, data) => {
if (err) {
callback(err);
}
else {
/** @type {string} */
this.xmlData = data;

var parser = new DOMParser();
var dom = parser.parseFromString(this.xmlData);
var root = dom.documentElement;

var xmlFields = root.getElementsByTagName("field");
var xmlFields = root ? root.getElementsByTagName("field") : [];
var fields = [];

for (var i = 0; i < xmlFields.length; i++) {
Expand All @@ -37,38 +51,46 @@ export default class PTIXmlParser {
var fontName = xmlFields[i].getAttribute('fontName');
var fontSize = xmlFields[i].getAttribute('fontSize');

/** @type {Record<string, any>} */
var item = {};

var rectLeft = parseInt(xPos) - 21; //was 23.5
var rectTop = parseInt(yPos) - 20;//was 23
var rectRight = parseInt(rectLeft) + parseInt(width) - 4;
var rectBottom = parseInt(rectTop) + parseInt(height) - 4;
var rectLeft = parseInt(xPos || '0') - 21; //was 23.5
var rectTop = parseInt(yPos || '0') - 20;//was 23
var rectRight = parseInt(String(rectLeft)) + parseInt(width || '0') - 4;
var rectBottom = parseInt(String(rectTop)) + parseInt(height || '0') - 4;

item.fieldType = "Tx";
if (type === "Boolean") {
item.fieldType="Btn";
}
else if (type === "SSN" || type === "Phone" || type === "zip") {
item.TName = type.toLowerCase();
item.TName = type ? type.toLowerCase() : '';
}
item.alternativeText = "";
item.fullName = id;
item.fontSize = fontSize;
item.fontName = fontName;
item.fullName = id || '';
item.fontSize = fontSize || '';
item.fontName = fontName || '';
item.subtype = "Widget";

item.rect = [rectLeft, rectTop, rectRight, rectBottom];

fields.push(item);

this.ptiPageArray[parseInt(page)]=fields;
if (page) {
this.ptiPageArray[parseInt(page)] = fields;
}
}

}
callback();
});
}

/**
* Get fields for a specific page
* @param {number} pageNum - The page number
* @returns {Array<any>|undefined} The fields for the page
*/
getFields(pageNum) {
return this.ptiPageArray[pageNum];
}
Expand Down
190 changes: 190 additions & 0 deletions lib/simpleXmlParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
// A simple XML parser to replace @xmldom/xmldom dependency
// This implements just enough functionality to support the existing code

/**
 * A minimal XML element node: a tag name, an attribute map, an ordered list
 * of child elements and a text-content string.
 * @class
 */
class Element {
  /**
   * Create a new Element
   * @param {string} nodeName - The name of the node/tag
   */
  constructor(nodeName) {
    /** @type {string} */
    this.nodeName = nodeName;
    /** @type {Array<Element>} */
    this.childNodes = [];
    /** @type {Object.<string, string>} */
    this.attributes = {};
    /** @type {string} */
    this.textContent = "";
  }

  /**
   * Get attribute value by name
   * @param {string} name - The attribute name
   * @returns {string|null} The attribute value (possibly an empty string),
   *   or null when the element has no such attribute
   */
  getAttribute(name) {
    // Only own keys count: `attributes` is a plain object, so a bare lookup
    // would also find inherited members such as "toString" or "constructor".
    if (!Object.prototype.hasOwnProperty.call(this.attributes, name)) {
      return null;
    }

    // An attribute that is present but empty must read as "", not null.
    const value = this.attributes[name];
    return typeof value === "string" ? value : null;
  }

  /**
   * Get descendant elements by tag name, in document order
   * @param {string} tagName - The tag name to search for, or "*" for all
   * @returns {Array<Element>} The matching descendants; this element itself
   *   is never included and each match appears exactly once
   */
  getElementsByTagName(tagName) {
    /** @type {Array<Element>} */
    const matches = [];
    const matchAll = tagName === "*";

    /** @param {Element} node */
    const collect = (node) => {
      for (const child of node.childNodes) {
        if (!(child instanceof Element)) {
          continue;
        }
        if (matchAll || child.nodeName === tagName) {
          matches.push(child);
        }
        // Descend after recording the child so results stay in document order.
        collect(child);
      }
    };

    collect(this);
    return matches;
  }
}

/**
 * Minimal XML document: a holder for the root element produced by parsing.
 * @class
 */
class Document {
  /**
   * Root element of the document; null until one has been assigned.
   * @type {Element|null}
   */
  documentElement = null;
}

/**
 * The five entity names predefined by XML, mapped to their replacement text.
 * @type {Map<string, string>}
 */
const XML_NAMED_ENTITIES = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Replace predefined and numeric character references in a string.
 * Unknown or out-of-range references are left untouched.
 * @param {string} text - Raw text or attribute value
 * @returns {string} The text with references decoded
 */
function decodeXmlEntities(text) {
  if (!text.includes("&")) {
    return text;
  }

  return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|[A-Za-z]+);/g, (whole, body) => {
    if (body[0] !== "#") {
      const replacement = XML_NAMED_ENTITIES.get(body);
      return replacement === undefined ? whole : replacement;
    }

    const codePoint =
      body[1] === "x"
        ? Number.parseInt(body.slice(2), 16)
        : Number.parseInt(body.slice(1), 10);

    // String.fromCodePoint throws outside the Unicode range, so guard first.
    const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
    return isValid ? String.fromCodePoint(codePoint) : whole;
  });
}

/**
 * A minimal DOMParser implementation that supports the basic features needed.
 *
 * The input is scanned once, token by token, while a stack of currently open
 * elements is maintained. This handles self-closing tags, nested elements
 * that share a tag name, empty attribute values, comments, processing
 * instructions, DOCTYPE declarations and CDATA sections.
 * @class
 */
class SimpleDOMParser {
  /**
   * Parse XML string into a Document
   * @param {string} xmlString - The XML string to parse
   * @returns {Document} The parsed document; its documentElement is null when
   *   the input contains no element
   */
  parseFromString(xmlString) {
    const doc = new Document();

    // The XML declaration needs no special stripping here: parseElement
    // skips processing instructions and ignores text outside the root.
    doc.documentElement = this.parseElement(xmlString);

    return doc;
  }

  /**
   * Parse the first element found in an XML string, including its subtree
   * @param {string} xmlString - The XML string to parse
   * @returns {Element|null} The parsed element, or null when the input is not
   *   a string or contains no start tag
   */
  parseElement(xmlString) {
    if (typeof xmlString !== "string") {
      return null;
    }

    // One alternative per markup kind, tried in this order at each "<":
    //   comment | CDATA (group 1) | processing instruction | <!...> declaration
    //   | end tag (group 2) | start tag (groups 3 = name, 4 = attribute text).
    // Quoted attribute values are consumed whole so a ">" inside them does
    // not terminate the tag.
    const tokenRegex =
      /<!--[\s\S]*?-->|<!\[CDATA\[([\s\S]*?)\]\]>|<\?[\s\S]*?\?>|<![^>]*>|<\/([^\s>]+)\s*>|<([^\s/>!?][^\s/>]*)((?:"[^"]*"|'[^']*'|[^>"'])*)>/g;
    const attributeRegex = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;

    /** @type {Element|null} */
    let root = null;
    /** @type {Array<Element>} */
    const openElements = [];
    let cursor = 0;
    let token;

    while ((token = tokenRegex.exec(xmlString)) !== null) {
      const parent = openElements[openElements.length - 1];
      const cdata = token[1];
      const closingName = token[2];
      const openingName = token[3];
      const attributeString = token[4];

      // Text between the previous token and this one belongs to the innermost
      // open element; text outside any element is ignored.
      if (parent) {
        const text = xmlString.slice(cursor, token.index).trim();
        if (text.length > 0) {
          parent.textContent += decodeXmlEntities(text);
        }
      }
      cursor = tokenRegex.lastIndex;

      if (openingName !== undefined) {
        const element = new Element(openingName);

        attributeRegex.lastIndex = 0;
        let attribute;
        while ((attribute = attributeRegex.exec(attributeString)) !== null) {
          // Pick the group that actually matched; an empty value ("") is valid.
          const rawValue = attribute[2] !== undefined ? attribute[2] : attribute[3];
          element.attributes[attribute[1]] = decodeXmlEntities(rawValue);
        }

        if (parent) {
          parent.childNodes.push(element);
        } else {
          root = element;
        }

        if (!attributeString.endsWith("/")) {
          openElements.push(element);
        } else if (!parent) {
          // A self-closing root is the whole document.
          break;
        }
      } else if (closingName !== undefined) {
        // Close the nearest open element with this name; anything opened
        // after it is treated as implicitly closed. Stray end tags are ignored.
        let depth = openElements.length - 1;
        while (depth >= 0 && openElements[depth].nodeName !== closingName) {
          depth--;
        }
        if (depth >= 0) {
          openElements.length = depth;
          if (depth === 0) {
            // The root element has been closed; nothing after it matters.
            break;
          }
        }
      } else if (cdata !== undefined && parent) {
        // CDATA content is literal: no trimming and no entity decoding.
        parent.textContent += cdata;
      }
      // Comments, processing instructions and declarations are skipped.
    }

    return root;
  }
}

// Export DOMParser as a class
export { SimpleDOMParser as DOMParser };
4 changes: 1 addition & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "pdf2json",
"version": "3.1.5",
"version": "3.1.6",
"description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js",
"keywords": [
"pdf",
Expand Down Expand Up @@ -68,10 +68,8 @@
"pdf2json": "./bin/pdf2json.js"
},
"dependencies": {
"@xmldom/xmldom": "^0.9.6"
},
"bundleDependencies": [
"@xmldom/xmldom"
],
"devDependencies": {
"@rollup/plugin-commonjs": "^28.0.2",
Expand Down
5 changes: 3 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@
![GitHub top language](https://img.shields.io/github/languages/top/modesty/pdf2json)
![GitHub last commit](https://img.shields.io/github/last-commit/modesty/pdf2json?color=red)

pdf2json is a [node.js](http://nodejs.org/) module converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use.
pdf2json is a [node.js](http://nodejs.org/) module that converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use.

## Features

- **PDF text extraction**: extracts textual content of PDF documents into structured JSON.
- **Form element handling**: parses interactive form fields within PDFs for flexible data capture.
- **Server-side and command-line versatility**: Integrate with web services for remote PDF processing or use as a standalone command-line tool for local file conversion.
- **Swift Performance**: fast performance with minimal dependencies
- **Swift Performance**: fast performance with zero dependencies (since v3.1.6)
- **Community driven**: decade+ long community driven development ensures continuous improvement.
- **Zero dependencies**: completely dependency-free since v3.1.6, only pure JavaScript code.

## Install

Expand Down
1 change: 0 additions & 1 deletion rollup.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ const external = [
"url",
"buffer",
"stream",
"@xmldom/xmldom",
];

export default [
Expand Down
2 changes: 1 addition & 1 deletion rollup/bundle-pdfjs-base.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ const _baseCode = _pdfjsFiles.reduce(

fs.writeFileSync(path.join(__dirname, "../lib/pdfjs-code.js"),
`
${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from '@xmldom/xmldom';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"}
${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from './simpleXmlParser.js';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"}
${"export const PDFJS = {};"}
${"const globalScope = { console };"}
${_baseCode}
Expand Down