Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 38 additions & 23 deletions src/input/sources/localInputSource.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { errorHandler } from "../../errors/handler";
import { logger } from "../../logger";
import { compressImage } from "../../imageOperations";
import { compressPdf } from "../../pdf";
import { compressPdf, countPages } from "../../pdf";
import path from "path";
import * as fileType from "file-type";
import { PageOptions } from "../pageOptions";
Expand Down Expand Up @@ -83,6 +83,22 @@ export abstract class LocalInputSource extends InputSource {
return mimeType;
}

/**
 * Provides the file object as a Buffer.
 * A string file object is converted with `Buffer.from`; anything else is
 * returned unchanged.
 * @returns Buffer representation of the file object
 * @protected
 */
protected getBuffer(): Buffer {
  return typeof this.fileObject === "string"
    ? Buffer.from(this.fileObject)
    : this.fileObject;
}

/**
* Determines whether the current file is a PDF.
* @returns {boolean} Returns true if the file is a PDF; otherwise, returns false.
*/
isPdf(): boolean {
if (!this.initialized) {
throw new Error(
Expand All @@ -97,15 +113,9 @@ export abstract class LocalInputSource extends InputSource {
* @param pageOptions
*/
public async applyPageOptions(pageOptions: PageOptions) {
// NOTE(review): this span looks like a diff hunk rendered without +/- markers:
// it holds two initialization sequences and two `processedPdf` declarations.
// Presumably the first group is the previous implementation and the second
// group is its replacement — confirm against the merged file.
// Guarded initialization: init() is awaited only when not yet initialized.
if (!this.initialized) {
await this.init();
}
// Refuse to process a file object that is not a Buffer.
if (!(this.fileObject instanceof Buffer)) {
throw new Error(
`Cannot modify an input source of type ${this.inputType}.`
);
}
// Run extractPages directly on the file object with the given page options.
const processedPdf = await extractPages(this.fileObject, pageOptions);
// init() awaited unconditionally, then the content is read through
// getBuffer() before being handed to extractPages.
await this.init();
const buffer = this.getBuffer();
const processedPdf = await extractPages(buffer, pageOptions);
// Replace the stored file object with the file returned by extractPages.
this.fileObject = processedPdf.file;
}

Expand Down Expand Up @@ -137,15 +147,8 @@ export abstract class LocalInputSource extends InputSource {
forceSourceText: boolean = false,
disableSourceText: boolean = true
) {
if (!this.initialized) {
await this.init();
}
let buffer: Buffer;
if (typeof this.fileObject === "string") {
buffer = Buffer.from(this.fileObject);
} else {
buffer = this.fileObject;
}
await this.init();
const buffer = this.getBuffer();
if (this.isPdf()){
this.fileObject = await compressPdf(buffer, quality, forceSourceText, disableSourceText);
} else {
Expand All @@ -158,13 +161,25 @@ export abstract class LocalInputSource extends InputSource {
* @return boolean
*/
public async hasSourceText() {
// NOTE(review): this span looks like a diff hunk rendered without +/- markers:
// it holds two init sequences and two `buffer` declarations. Presumably each
// pair is a previous line followed by its replacement — confirm against the
// merged file.
// Guarded initialization: init() is awaited only when not yet initialized.
if (!this.initialized) {
await this.init();
}
// init() awaited unconditionally.
await this.init();
// Anything that is not a PDF is reported as having no source text.
if (!this.isPdf()){
return false;
}
// Inline conversion: a string file object is wrapped with Buffer.from,
// otherwise the file object is used as-is.
const buffer = typeof this.fileObject === "string" ? Buffer.from(this.fileObject) : this.fileObject;
// The content obtained through the getBuffer() helper instead.
const buffer = this.getBuffer();
// Delegate to the standalone `hasSourceText` function (called without `this`).
return hasSourceText(buffer);
}

/**
 * Counts the pages of this input source.
 * The source is initialized first. A PDF is counted with `countPages`;
 * any non-PDF input is reported as a single page.
 * @returns The number of pages
 */
public async getPageCount(): Promise<number> {
  await this.init();
  if (this.isPdf()) {
    return countPages(this.getBuffer());
  }
  return 1;
}
}
5 changes: 5 additions & 0 deletions src/pdf/pdfOperation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ export async function extractPages(
return { file: fileBuffer, totalPagesRemoved: sumRemovedPages };
}

/**
* Count the number of pages in a pdf file.
* @param file
* @returns the number of pages in the file.
*/
export async function countPages(file: Buffer): Promise<number> {
const currentPdf = await PDFDocument.load(file, {
ignoreEncryption: true,
Expand Down
4 changes: 0 additions & 4 deletions src/pdf/pdfUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,6 @@ export async function extractTextFromPdf(pdfBuffer: Buffer): Promise<ExtractedPd
};
}





/**
* Checks if a PDF contains source text.
*
Expand Down
97 changes: 55 additions & 42 deletions tests/inputs/sources.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,79 +35,89 @@ describe("Test different types of input", () => {
// don't provide an extension to see if we can detect MIME
// type based on contents
const filename = "receipt";
const input = new Base64Input({
const inputSource = new Base64Input({
inputString: b64String,
filename: filename,
});
await input.init();
expect(input.inputType).to.equals(INPUT_TYPE_BASE64);
expect(input.filename).to.equals(filename);
expect(input.mimeType).to.equals("image/jpeg");
await inputSource.init();
expect(inputSource.inputType).to.equals(INPUT_TYPE_BASE64);
expect(inputSource.filename).to.equals(filename);
expect(inputSource.mimeType).to.equals("image/jpeg");
expect(inputSource.isPdf()).to.false;
expect(await inputSource.getPageCount()).to.equals(1);
// we need to insert a newline every 76 chars to match the format
// of the input file.
const expectedString = input.fileObject
const expectedString = inputSource.fileObject
.toString("base64")
.replace(/(.{76})/gm, "$1\n");
expect(expectedString).to.eqls(b64String);
});

it("should accept JPEG files from a path", async () => {
const input = new PathInput({
const inputSource = new PathInput({
inputPath: path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg"),
});
await input.init();
await inputSource.init();

const expectedResult = await fs.promises.readFile(
path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg")
);
expect(input.inputType).to.equals(INPUT_TYPE_PATH);
expect(input.filename).to.equals("default_sample.jpg");
expect(input.mimeType).to.equals("image/jpeg");
expect(input.fileObject).to.eqls(expectedResult);
expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH);
expect(inputSource.filename).to.equals("default_sample.jpg");
expect(inputSource.mimeType).to.equals("image/jpeg");
expect(inputSource.isPdf()).to.false;
expect(await inputSource.getPageCount()).to.equals(1);
expect(inputSource.fileObject).to.eqls(expectedResult);
});

it("should accept TIFF from a path", async () => {
const input = new PathInput({
const inputSource = new PathInput({
inputPath: path.join(__dirname, "../data/file_types/receipt.tif"),
});
await input.init();
await inputSource.init();
const expectedResult = await fs.promises.readFile(
path.join(__dirname, "../data/file_types/receipt.tif")
);
expect(input.inputType).to.equals(INPUT_TYPE_PATH);
expect(input.filename).to.equals("receipt.tif");
expect(input.mimeType).to.equals("image/tiff");
expect(input.fileObject).to.eqls(expectedResult);
expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH);
expect(inputSource.filename).to.equals("receipt.tif");
expect(inputSource.mimeType).to.equals("image/tiff");
expect(inputSource.isPdf()).to.false;
expect(await inputSource.getPageCount()).to.equals(1);
expect(inputSource.fileObject).to.eqls(expectedResult);
});

it("should accept HEIC from a path", async () => {
const input = new PathInput({
const inputSource = new PathInput({
inputPath: path.join(__dirname, "../data/file_types/receipt.heic"),
});
await input.init();
await inputSource.init();
const expectedResult = await fs.promises.readFile(
path.join(__dirname, "../data/file_types/receipt.heic")
);
expect(input.inputType).to.equals(INPUT_TYPE_PATH);
expect(input.filename).to.equals("receipt.heic");
expect(input.mimeType).to.equals("image/heic");
expect(input.fileObject).to.eqls(expectedResult);
expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH);
expect(inputSource.filename).to.equals("receipt.heic");
expect(inputSource.mimeType).to.equals("image/heic");
expect(inputSource.isPdf()).to.false;
expect(await inputSource.getPageCount()).to.equals(1);
expect(inputSource.fileObject).to.eqls(expectedResult);
});

it("should accept read streams", async () => {
const filePath = path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg");
const stream = fs.createReadStream(filePath);
const filename = "default_sample.jpg";
const input = new StreamInput({
const inputSource = new StreamInput({
inputStream: stream,
filename: filename,
});
await input.init();
expect(input.inputType).to.equals(INPUT_TYPE_STREAM);
expect(input.filename).to.equals(filename);
expect(input.mimeType).to.equals("image/jpeg");
await inputSource.init();
expect(inputSource.inputType).to.equals(INPUT_TYPE_STREAM);
expect(inputSource.filename).to.equals(filename);
expect(inputSource.mimeType).to.equals("image/jpeg");
expect(inputSource.isPdf()).to.false;
expect(await inputSource.getPageCount()).to.equals(1);
const expectedResult = await fs.promises.readFile(filePath);
expect(input.fileObject.toString()).to.eqls(expectedResult.toString());
expect(inputSource.fileObject.toString()).to.eqls(expectedResult.toString());
});

it("should accept raw bytes", async () => {
Expand All @@ -116,16 +126,18 @@ describe("Test different types of input", () => {
// don't provide an extension to see if we can detect MIME
// type based on contents
const filename = "receipt";
const input = new BytesInput({
const inputSource = new BytesInput({
inputBytes: inputBytes,
filename: filename,
});
await input.init();
expect(input.inputType).to.equal(INPUT_TYPE_BYTES);
expect(input.filename).to.equal(filename);
expect(input.mimeType).to.equal("image/jpeg");
await inputSource.init();
expect(inputSource.inputType).to.equal(INPUT_TYPE_BYTES);
expect(inputSource.filename).to.equal(filename);
expect(inputSource.mimeType).to.equal("image/jpeg");
expect(inputSource.isPdf()).to.false;
expect(await inputSource.getPageCount()).to.equals(1);
const expectedResult = await fs.promises.readFile(filePath);
expect(Buffer.compare(input.fileObject, expectedResult)).to.equal(0);
expect(Buffer.compare(inputSource.fileObject, expectedResult)).to.equal(0);
});

it("should accept a Buffer", async () => {
Expand All @@ -135,15 +147,16 @@ describe("Test different types of input", () => {
path.join(__dirname, "../data/products/invoices/invoice_10p.pdf")
)
);
const input = new BufferInput({
const inputSource = new BufferInput({
buffer: buffer,
filename: filename,
});
await input.init();
expect(input.inputType).to.equals(INPUT_TYPE_BUFFER);
expect(input.filename).to.equals(filename);
expect(input.isPdf()).to.be.true;
expect(input.fileObject).to.be.instanceOf(Buffer);
await inputSource.init();
expect(inputSource.inputType).to.equals(INPUT_TYPE_BUFFER);
expect(inputSource.filename).to.equals(filename);
expect(inputSource.isPdf()).to.be.true;
expect(await inputSource.getPageCount()).to.equals(10);
expect(inputSource.fileObject).to.be.instanceOf(Buffer);
});


Expand Down