diff --git a/src/input/sources/localInputSource.ts b/src/input/sources/localInputSource.ts index 36cf07c1..40c8f00f 100644 --- a/src/input/sources/localInputSource.ts +++ b/src/input/sources/localInputSource.ts @@ -1,7 +1,7 @@ import { errorHandler } from "../../errors/handler"; import { logger } from "../../logger"; import { compressImage } from "../../imageOperations"; -import { compressPdf } from "../../pdf"; +import { compressPdf, countPages } from "../../pdf"; import path from "path"; import * as fileType from "file-type"; import { PageOptions } from "../pageOptions"; @@ -83,6 +83,22 @@ export abstract class LocalInputSource extends InputSource { return mimeType; } + /** + * Returns the file object as a Buffer. + * @returns Buffer representation of the file object + * @protected + */ + protected getBuffer(): Buffer { + if (typeof this.fileObject === "string") { + return Buffer.from(this.fileObject); + } + return this.fileObject; + } + + /** + * Determines whether the current file is a PDF. + * @returns {boolean} Returns true if the file is a PDF; otherwise, returns false. 
+ */ isPdf(): boolean { if (!this.initialized) { throw new Error( @@ -97,15 +113,9 @@ export abstract class LocalInputSource extends InputSource { * @param pageOptions */ public async applyPageOptions(pageOptions: PageOptions) { - if (!this.initialized) { - await this.init(); - } - if (!(this.fileObject instanceof Buffer)) { - throw new Error( - `Cannot modify an input source of type ${this.inputType}.` - ); - } - const processedPdf = await extractPages(this.fileObject, pageOptions); + await this.init(); + const buffer = this.getBuffer(); + const processedPdf = await extractPages(buffer, pageOptions); this.fileObject = processedPdf.file; } @@ -137,15 +147,8 @@ export abstract class LocalInputSource extends InputSource { forceSourceText: boolean = false, disableSourceText: boolean = true ) { - if (!this.initialized) { - await this.init(); - } - let buffer: Buffer; - if (typeof this.fileObject === "string") { - buffer = Buffer.from(this.fileObject); - } else { - buffer = this.fileObject; - } + await this.init(); + const buffer = this.getBuffer(); if (this.isPdf()){ this.fileObject = await compressPdf(buffer, quality, forceSourceText, disableSourceText); } else { @@ -158,13 +161,25 @@ export abstract class LocalInputSource extends InputSource { * @return boolean */ public async hasSourceText() { - if (!this.initialized) { - await this.init(); - } + await this.init(); if (!this.isPdf()){ return false; } - const buffer = typeof this.fileObject === "string" ? Buffer.from(this.fileObject) : this.fileObject; + const buffer = this.getBuffer(); return hasSourceText(buffer); } + + /** + * Returns the number of pages in the input source. + * For PDFs, returns the actual page count. For images, returns 1. 
+ * @return Promise<number> The number of pages + */ + public async getPageCount(): Promise<number> { + await this.init(); + if (!this.isPdf()) { + return 1; + } + const buffer = this.getBuffer(); + return countPages(buffer); + } } diff --git a/src/pdf/pdfOperation.ts b/src/pdf/pdfOperation.ts index b1831cf1..50999812 100644 --- a/src/pdf/pdfOperation.ts +++ b/src/pdf/pdfOperation.ts @@ -84,6 +84,11 @@ export async function extractPages( return { file: fileBuffer, totalPagesRemoved: sumRemovedPages }; } +/** + * Count the number of pages in a pdf file. + * @param file + * @returns the number of pages in the file. + */ export async function countPages(file: Buffer): Promise<number> { const currentPdf = await PDFDocument.load(file, { ignoreEncryption: true, diff --git a/src/pdf/pdfUtils.ts b/src/pdf/pdfUtils.ts index bf7e7d27..3618d9f9 100644 --- a/src/pdf/pdfUtils.ts +++ b/src/pdf/pdfUtils.ts @@ -64,10 +64,6 @@ export async function extractTextFromPdf(pdfBuffer: Buffer): Promise { // don't provide an extension to see if we can detect MIME // type based on contents const filename = "receipt"; - const input = new Base64Input({ + const inputSource = new Base64Input({ inputString: b64String, filename: filename, }); - await input.init(); - expect(input.inputType).to.equals(INPUT_TYPE_BASE64); - expect(input.filename).to.equals(filename); - expect(input.mimeType).to.equals("image/jpeg"); + await inputSource.init(); + expect(inputSource.inputType).to.equals(INPUT_TYPE_BASE64); + expect(inputSource.filename).to.equals(filename); + expect(inputSource.mimeType).to.equals("image/jpeg"); + expect(inputSource.isPdf()).to.false; + expect(await inputSource.getPageCount()).to.equals(1); // we need to insert a newline very 76 chars to match the format // of the input file. 
- const expectedString = input.fileObject + const expectedString = inputSource.fileObject .toString("base64") .replace(/(.{76})/gm, "$1\n"); expect(expectedString).to.eqls(b64String); }); it("should accept JPEG files from a path", async () => { - const input = new PathInput({ + const inputSource = new PathInput({ inputPath: path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg"), }); - await input.init(); + await inputSource.init(); const expectedResult = await fs.promises.readFile( path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg") ); - expect(input.inputType).to.equals(INPUT_TYPE_PATH); - expect(input.filename).to.equals("default_sample.jpg"); - expect(input.mimeType).to.equals("image/jpeg"); - expect(input.fileObject).to.eqls(expectedResult); + expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH); + expect(inputSource.filename).to.equals("default_sample.jpg"); + expect(inputSource.mimeType).to.equals("image/jpeg"); + expect(inputSource.isPdf()).to.false; + expect(await inputSource.getPageCount()).to.equals(1); + expect(inputSource.fileObject).to.eqls(expectedResult); }); it("should accept TIFF from a path", async () => { - const input = new PathInput({ + const inputSource = new PathInput({ inputPath: path.join(__dirname, "../data/file_types/receipt.tif"), }); - await input.init(); + await inputSource.init(); const expectedResult = await fs.promises.readFile( path.join(__dirname, "../data/file_types/receipt.tif") ); - expect(input.inputType).to.equals(INPUT_TYPE_PATH); - expect(input.filename).to.equals("receipt.tif"); - expect(input.mimeType).to.equals("image/tiff"); - expect(input.fileObject).to.eqls(expectedResult); + expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH); + expect(inputSource.filename).to.equals("receipt.tif"); + expect(inputSource.mimeType).to.equals("image/tiff"); + expect(inputSource.isPdf()).to.false; + expect(await inputSource.getPageCount()).to.equals(1); + 
expect(inputSource.fileObject).to.eqls(expectedResult); }); it("should accept HEIC from a path", async () => { - const input = new PathInput({ + const inputSource = new PathInput({ inputPath: path.join(__dirname, "../data/file_types/receipt.heic"), }); - await input.init(); + await inputSource.init(); const expectedResult = await fs.promises.readFile( path.join(__dirname, "../data/file_types/receipt.heic") ); - expect(input.inputType).to.equals(INPUT_TYPE_PATH); - expect(input.filename).to.equals("receipt.heic"); - expect(input.mimeType).to.equals("image/heic"); - expect(input.fileObject).to.eqls(expectedResult); + expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH); + expect(inputSource.filename).to.equals("receipt.heic"); + expect(inputSource.mimeType).to.equals("image/heic"); + expect(inputSource.isPdf()).to.false; + expect(await inputSource.getPageCount()).to.equals(1); + expect(inputSource.fileObject).to.eqls(expectedResult); }); it("should accept read streams", async () => { const filePath = path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg"); const stream = fs.createReadStream(filePath); const filename = "default_sample.jpg"; - const input = new StreamInput({ + const inputSource = new StreamInput({ inputStream: stream, filename: filename, }); - await input.init(); - expect(input.inputType).to.equals(INPUT_TYPE_STREAM); - expect(input.filename).to.equals(filename); - expect(input.mimeType).to.equals("image/jpeg"); + await inputSource.init(); + expect(inputSource.inputType).to.equals(INPUT_TYPE_STREAM); + expect(inputSource.filename).to.equals(filename); + expect(inputSource.mimeType).to.equals("image/jpeg"); + expect(inputSource.isPdf()).to.false; + expect(await inputSource.getPageCount()).to.equals(1); const expectedResult = await fs.promises.readFile(filePath); - expect(input.fileObject.toString()).to.eqls(expectedResult.toString()); + expect(inputSource.fileObject.toString()).to.eqls(expectedResult.toString()); }); it("should 
accept raw bytes", async () => { @@ -116,16 +126,18 @@ describe("Test different types of input", () => { // don't provide an extension to see if we can detect MIME // type based on contents const filename = "receipt"; - const input = new BytesInput({ + const inputSource = new BytesInput({ inputBytes: inputBytes, filename: filename, }); - await input.init(); - expect(input.inputType).to.equal(INPUT_TYPE_BYTES); - expect(input.filename).to.equal(filename); - expect(input.mimeType).to.equal("image/jpeg"); + await inputSource.init(); + expect(inputSource.inputType).to.equal(INPUT_TYPE_BYTES); + expect(inputSource.filename).to.equal(filename); + expect(inputSource.mimeType).to.equal("image/jpeg"); + expect(inputSource.isPdf()).to.false; + expect(await inputSource.getPageCount()).to.equals(1); const expectedResult = await fs.promises.readFile(filePath); - expect(Buffer.compare(input.fileObject, expectedResult)).to.equal(0); + expect(Buffer.compare(inputSource.fileObject, expectedResult)).to.equal(0); }); it("should accept a Buffer", async () => { @@ -135,15 +147,16 @@ describe("Test different types of input", () => { path.join(__dirname, "../data/products/invoices/invoice_10p.pdf") ) ); - const input = new BufferInput({ + const inputSource = new BufferInput({ buffer: buffer, filename: filename, }); - await input.init(); - expect(input.inputType).to.equals(INPUT_TYPE_BUFFER); - expect(input.filename).to.equals(filename); - expect(input.isPdf()).to.be.true; - expect(input.fileObject).to.be.instanceOf(Buffer); + await inputSource.init(); + expect(inputSource.inputType).to.equals(INPUT_TYPE_BUFFER); + expect(inputSource.filename).to.equals(filename); + expect(inputSource.isPdf()).to.be.true; + expect(await inputSource.getPageCount()).to.equals(10); + expect(inputSource.fileObject).to.be.instanceOf(Buffer); });