Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: Node.js CI/CD

on:
pull_request:
branches: [ "*" ]
branches: ["*"]
workflow_dispatch: # Allow manual triggering

jobs:
Expand All @@ -11,31 +11,31 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Setup Deno 2.4.3
uses: denoland/setup-deno@v1
with:
deno-version: 2.4.3

- name: Setup Bun 1.2.19
uses: oven-sh/setup-bun@v1
with:
bun-version: 1.2.19

- name: Setup Node.js 20.18.0
uses: actions/setup-node@v3
uses: actions/setup-node@v4
with:
node-version: 20.18.0

- name: Clean install dependencies
run: rm -rf node_modules package-lock.json && npm i

# - name: Install dependencies
# run: npm ci
- name: Clean install dependencies
run: rm -rf node_modules && npm i

- name: Build project
run: npm run build

- name: Run tests
run: npm run test
- name: Type check
run: npx --no-install tsc --noEmit

- name: Run tests (with coverage)
run: npm run test
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
## generic files to ignore
*~
*.lock
*.DS_Store
*.swp
*.out
Expand Down
3 changes: 2 additions & 1 deletion .prettierrc.cjs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
module.exports = {
...require('prettier-config-standard'),
semi: false,
singleQuote: true,
trailingComma: 'es5',
}
2 changes: 0 additions & 2 deletions eslint.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,6 @@ export default [
destructuredArrayIgnorePattern: "^_?"
}
],
"@typescript-eslint/no-extra-semi": "off",
"@typescript-eslint/no-var-requires": "off",
"arrow-body-style": ["error", "as-needed"],
"dot-notation": ["error"],
"eqeqeq": ["error", "always"],
Expand Down
7 changes: 5 additions & 2 deletions jest.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
"bail": false,
"testFailureExitCode": 1,
"transform": {},
"testTimeout": 6000,
"testTimeout": 15000,
"moduleFileExtensions": ["js", "cjs", "mjs", "json"],
"verbose": true,
"forceExit": true,
"maxWorkers": "50%"
"maxWorkers": "50%",
"coverageDirectory": "coverage",
"coverageReporters": ["text", "lcov"],
"coveragePathIgnorePatterns": ["/node_modules/", "/test/", "/base/", "/lib/pdfjs-code.js"]
}
1,979 changes: 940 additions & 1,039 deletions package-lock.json

Large diffs are not rendered by default.

18 changes: 12 additions & 6 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@
},
"main": "./dist/pdfparser.cjs",
"module": "./dist/pdfparser.js",
"typings": "./dist/pdfparser.d.ts",
"types": "./dist/pdfparser.d.ts",
"scripts": {
"test:jest": "jest --config ./jest.config.json --detectOpenHandles",
"test:coverage": "jest --config ./jest.config.json --coverage --detectOpenHandles",
"test": "npm run test:jest && npm run parse-r && npm run parse-fd && npm run test:deno && npm run test:bun",
"test:forms": "cd ./test && sh p2j.forms.sh",
"test:misc": "cd ./test && sh p2j.one.sh misc . \"Expected: 16 success, 6 fail exception with stack trace\" ",
Expand All @@ -50,7 +51,7 @@
"parse-e": "./bin/pdf2json.js -f ./test/pdf/misc/i418_precompilato_fake.pdf -o ./test/target/misc -c",
"build:rollup": "npx rollup -c ./rollup.config.js",
"build:bundle-pdfjs-base": "node rollup/bundle-pdfjs-base.js",
"build": "npm run build:bundle-pdfjs-base && npm run build:rollup",
"build": "npm run build:bundle-pdfjs-base && npm run build:rollup && cp dist/pdfparser.d.ts dist/pdfparser.d.cts",
"build:clean": "rm -rf node_modules && rm -f package-lock.json && npm i && npm run build",
"test:deno": "deno --allow-read --allow-write --allow-net --allow-env --no-check ./bin/pdf2json.js -f ./test/pdf/fd/form/ -o ./test/target/fd/form -t -c -m -r",
"test:bun": "bun ./bin/pdf2json.js -f ./test/pdf/fd/form/ -o ./test/target/fd/form -t -c -m -r"
Expand All @@ -76,7 +77,7 @@
"@rollup/plugin-eslint": "^9.1.0",
"@rollup/plugin-json": "^6.1.0",
"@rollup/plugin-node-resolve": "^16.0.2",
"@rollup/plugin-terser": "^0.4.4",
"@rollup/plugin-terser": "^1.0.0",
"@rollup/plugin-typescript": "^12.1.4",
"@types/node": "^25.3.3",
"@typescript-eslint/eslint-plugin": "^8.46.0",
Expand Down Expand Up @@ -105,9 +106,14 @@
"readme": "https://github.com/modesty/pdf2json/blob/master/readme.md",
"exports": {
".": {
"types": "./dist/pdfparser.d.ts",
"import": "./dist/pdfparser.js",
"require": "./dist/pdfparser.cjs"
"import": {
"types": "./dist/pdfparser.d.ts",
"default": "./dist/pdfparser.js"
},
"require": {
"types": "./dist/pdfparser.d.cts",
"default": "./dist/pdfparser.cjs"
}
}
},
"publishConfig": {
Expand Down
60 changes: 55 additions & 5 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ After install, run command line:

> npm test

`pretest` step builds bundles and source maps for both ES Module and CommonJS, output to `./dist` directory. The Jest test suit is defined in `./test/_test_.cjs` with commonJS, test run will also cover `parse-r` and `parse-fd` with ES Modules via command line.
The `pretest` step builds bundles and source maps for both ES Module and CommonJS, output to the `./dist` directory. The Jest test suites (7 suites, 74+ tests) are defined in `./test/_test_*.cjs` with CommonJS; the test run also covers `parse-r` and `parse-fd` with ES Modules via the command line.

The default Jest test suites are essential tests for all PRs, but they cover only a portion of the test PDFs. For broader coverage, run:

Expand Down Expand Up @@ -920,15 +920,15 @@ To use the command line utility to transcode a folder or a file:
node pdf2json.js -f [input directory or pdf file]
```

When -f is a PDF file, it'll be converted to json file with the same name and saved in the same directory. If -f is a directory, it'll scan all ".pdf" files within the specified directory to transcode them one by one.
When -f is a PDF file, it'll be converted to a JSON file with the same name and saved in the same directory. If -f is a directory, it'll scan all ".pdf" files within the specified directory and transcode them one by one (dotfiles are skipped).

Optionally, you can specify the output directory: -o:

```javascript
node pdf2json.js -f [input directory or pdf file] -o [output directory]
```

The output directory must exist, otherwise, it'll exit with an error.
The output directory will be created automatically if it does not exist.

Additionally, you can use -v or --version to show the version number, or -h to display more help info.

Expand All @@ -952,7 +952,57 @@ or
pdf2json -f [input directory or pdf file] -o [output directory]
```

v0.5.4 added "-s" or "--silent" command line argument to suppress informative logging output. When using pdf2json as a command line tool, the default verbosity is 5 (INFOS). While when running as a web service, default verbosity is 9 (ERRORS).
### CLI Flags

| Flag | Long | Description |
|------|------|-------------|
| `-f` | `--file` | (required) Path to a PDF file or a directory of PDF files to parse |
| `-o` | `--output` | Output directory (created automatically if it doesn't exist; defaults to input directory) |
| `-s` | `--silent` | Suppress informational output; only errors are printed |
| `-t` | `--fieldTypes` | Generate a `.fields.json` file with form field ids and types |
| `-c` | `--content` | Generate a `.content.txt` file with extracted text content |
| `-m` | `--merge` | Generate a `.merged.json` file with auto-merged broken text blocks |
| `-r` | `--stream` | Use stream-based parsing instead of loading the entire file into memory |
| `-si` | `--singleton` | Reuse a single PDFParser instance across all files in a directory (reduces memory for batch processing) |
| `-j` | `--json` | Output a structured JSON summary to stdout (version, file paths, stats, errors). Implies `-s` |
| `-q` | `--quiet` | Suppress all non-error output including timer and status messages. Stricter than `-s` |
| `-v` | `--version` | Print the version number and exit |
| `-h` | `--help` | Print help message and exit |

### Exit Codes

| Code | Meaning |
|------|---------|
| `0` | All files parsed successfully |
| `1` | One or more files failed to parse |
| `2` | Invalid arguments or usage error (e.g. missing `-f`) |
| `3` | I/O error (file not found, permission denied) |

### `--json` Output Schema

When using the `--json` flag, pdf2json outputs a structured JSON summary to stdout:

```json
{
"version": "4.0.3",
"input": "/path/to/input.pdf",
"outputs": [
{ "type": "json", "path": "/path/to/output.json" },
{ "type": "fields", "path": "/path/to/output.fields.json" },
{ "type": "content", "path": "/path/to/output.content.txt" },
{ "type": "merged", "path": "/path/to/output.merged.json" }
],
"stats": { "input": 1, "success": 1, "failed": 0 },
"errors": [],
"elapsedMs": 234
}
```

Note: The PDF engine may print warnings to stdout (e.g. `Warning: Setting up fake worker.`). Pipe through `grep '^{'` to isolate the JSON line.

### Verbosity

When using pdf2json as a command line tool, the default verbosity is 5 (INFOS). When running as a web service, the default verbosity is 9 (ERRORS).
Examples to suppress logging info from command line:

```javascript
Expand All @@ -973,7 +1023,7 @@ var pdfParser = new PFParser();
pdfParser.loadPDF(pdfFilePath, 5);
```

v0.5.7 added the capability to skip input PDF files if filename begins with any one of "!@#$%^&\*()+=[]\\\';,/{}|\":<>?~`.-\_ ", usually these files are created by PDF authoring tools as backup files.
When scanning a directory, pdf2json only skips hidden files (dotfiles). All other `.pdf` files are processed.

v0.6.2 added the "-t" command line argument to generate a fields JSON file in addition to the parsed JSON. The fields JSON file contains one array holding a fieldInfo object for each field, and each fieldInfo object has 4 fields:

Expand Down
41 changes: 20 additions & 21 deletions rollup.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ const external = [
];

export default [
// Build 1: Main library bundle (pdfparser.js -> dist/)
// Must complete before Build 2, as the CLI imports from dist/pdfparser.js
{
input: "./pdfparser.js",
external,
Expand All @@ -34,46 +36,43 @@ export default [
sourcemap: true,
},
],
treeshake: false,
treeshake: false, // Required: PDF.js base has global side effects that tree-shaking would break
plugins: [
json(),
eslint({
throwOnError: true
throwOnError: true,
}),
nodeResolve({
preferBuiltins: true, // Prefer Node.js built-in modules
browser: false // Set to true only if targeting browsers
}),
terser()
]
preferBuiltins: true,
browser: false,
}),
terser(),
],
},
// Build 2: CLI bundle (src/cli/ -> bin/cli/)
// Depends on Build 1: imports dist/pdfparser.js as external at runtime
{
input: "./src/cli/p2jcli.ts",
external: [...external, "../../dist/pdfparser.js"],
output: [
// {
// file: "dist/pdfparser_cli.cjs",
// format: "cjs",
// sourcemap: true,
// },
{
file: "bin/cli/pdfparser_cli.js",
format: "es",
sourcemap: true,
},
],
treeshake: false,
treeshake: true,
plugins: [
typescript({ tsconfig: './tsconfig.json' }),
typescript({ tsconfig: "./tsconfig.json" }),
json(),
eslint({
throwOnError: true
throwOnError: true,
}),
nodeResolve({
preferBuiltins: true, // Prefer Node.js built-in modules
browser: false // Set to true only if targeting browsers
}),
terser()
]
}
preferBuiltins: true,
browser: false,
}),
terser(),
],
},
];
Loading
Loading