Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[api-minor] Fix issues in text selection #13424

Merged
merged 1 commit into from Oct 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
292 changes: 154 additions & 138 deletions src/core/evaluator.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/display/text_layer.js
Expand Up @@ -188,7 +188,7 @@ function appendText(task, geom, styles, ctx) {
(task._enhanceTextSelection && AllWhitespaceRegexp.test(geom.str))
) {
shouldScaleText = true;
} else if (geom.transform[0] !== geom.transform[3]) {
} else if (geom.str !== " " && geom.transform[0] !== geom.transform[3]) {
const absScaleX = Math.abs(geom.transform[0]),
absScaleY = Math.abs(geom.transform[3]);
// When the horizontal/vertical scaling differs significantly, also scale
Expand Down
5 changes: 5 additions & 0 deletions test/pdfs/.gitignore
Expand Up @@ -13,6 +13,7 @@
!issue1155r.pdf
!issue2017r.pdf
!bug1727053.pdf
!issue11913.pdf
!issue2391-1.pdf
!issue2391-2.pdf
!issue14046.pdf
Expand Down Expand Up @@ -182,6 +183,7 @@
!issue11931.pdf
!issue1655r.pdf
!issue6541.pdf
!issue10640.pdf
!issue2948.pdf
!issue6231_1.pdf
!issue10402.pdf
Expand Down Expand Up @@ -285,6 +287,7 @@
!issue2840.pdf
!issue4061.pdf
!issue4668.pdf
!issue13226.pdf
!PDFJS-7562-reduced.pdf
!issue11768_reduced.pdf
!issue5039.pdf
Expand Down Expand Up @@ -440,6 +443,7 @@
!annotation-fileattachment.pdf
!annotation-text-widget.pdf
!annotation-choice-widget.pdf
!issue10900.pdf
!annotation-button-widget.pdf
!annotation-polyline-polygon.pdf
!annotation-polyline-polygon-without-appearance.pdf
Expand All @@ -462,6 +466,7 @@
!issue9972-3.pdf
!tiling-pattern-box.pdf
!tiling-pattern-large-steps.pdf
!issue13201.pdf
!issue11555.pdf
!issue12337.pdf
!pr12564.pdf
Expand Down
Binary file added test/pdfs/issue10640.pdf
Binary file not shown.
Binary file added test/pdfs/issue10900.pdf
Binary file not shown.
Binary file added test/pdfs/issue11913.pdf
Binary file not shown.
Binary file added test/pdfs/issue13201.pdf
Binary file not shown.
Binary file added test/pdfs/issue13226.pdf
Binary file not shown.
115 changes: 113 additions & 2 deletions test/unit/api_spec.js
Expand Up @@ -73,6 +73,10 @@ describe("api", function () {
}, WAIT_TIMEOUT);
}

/**
 * Flattens a list of text chunks into a single string.
 *
 * Each chunk contributes its `str`; a newline is appended right after any
 * chunk whose `hasEOL` property is truthy.
 *
 * @param {Array<Object>} items - Chunks, each read for `str` and `hasEOL`.
 * @returns {string} The concatenated text.
 */
function mergeText(items) {
  return items.reduce((merged, item) => {
    const lineBreak = item.hasEOL ? "\n" : "";
    return merged + item.str + lineBreak;
  }, "");
}

describe("getDocument", function () {
it("creates pdf doc from URL-string", async function () {
const urlStr = TEST_PDFS_PATH + basicApiFileName;
Expand Down Expand Up @@ -1604,11 +1608,17 @@ describe("api", function () {
const data = await Promise.all([defaultPromise, parametersPromise]);

expect(!!data[0].items).toEqual(true);
expect(data[0].items.length).toEqual(12);
expect(data[0].items.length).toEqual(11);
expect(!!data[0].styles).toEqual(true);

const page1 = mergeText(data[0].items);
expect(page1).toEqual(`Table Of Content
Chapter 1 .......................................................... 2
Paragraph 1.1 ...................................................... 3
page 1 / 3`);

expect(!!data[1].items).toEqual(true);
expect(data[1].items.length).toEqual(7);
expect(data[1].items.length).toEqual(6);
expect(!!data[1].styles).toEqual(true);
});

Expand Down Expand Up @@ -1643,6 +1653,107 @@ describe("api", function () {
await loadingTask.destroy();
});

// Regression test (issue 13226): the merged text of page 1 must match the
// expected sentence exactly.
it("gets text content, with no extra spaces (issue 13226)", async function () {
  const params = buildGetDocumentParams("issue13226.pdf");
  const loadingTask = getDocument(params);
  const doc = await loadingTask.promise;
  const firstPage = await doc.getPage(1);
  const textContent = await firstPage.getTextContent();

  expect(mergeText(textContent.items)).toEqual(
    "Mitarbeiterinnen und Mitarbeiter arbeiten in über 100 Ländern engagiert im Dienste"
  );

  await loadingTask.destroy();
});

// Regression test (issue 13201): each expected line must appear verbatim in
// the merged text of page 1.
it("gets text content, with merged spaces (issue 13201)", async function () {
  const params = buildGetDocumentParams("issue13201.pdf");
  const loadingTask = getDocument(params);
  const doc = await loadingTask.promise;
  const firstPage = await doc.getPage(1);
  const textContent = await firstPage.getTextContent();
  const pageText = mergeText(textContent.items);

  const expectedSnippets = [
    "Abstract. A purely peer-to-peer version of electronic cash would allow online",
    "avoid mediating disputes. The cost of mediation increases transaction costs, limiting the",
    "system is secure as long as honest nodes collectively control more CPU power than any",
  ];
  for (const snippet of expectedSnippets) {
    expect(pageText.includes(snippet)).toEqual(true);
  }

  await loadingTask.destroy();
});

// Regression test (issue 11913): each expected sentence must appear verbatim
// in the merged text of page 1.
it("gets text content, with no spaces between letters of words (issue 11913)", async function () {
  const params = buildGetDocumentParams("issue11913.pdf");
  const loadingTask = getDocument(params);
  const doc = await loadingTask.promise;
  const firstPage = await doc.getPage(1);
  const textContent = await firstPage.getTextContent();
  const pageText = mergeText(textContent.items);

  const expectedSnippets = [
    "1. The first of these cases arises from the tragic handicap which has blighted the life of the Plaintiff, and from the response of the",
    "argued in this Court the appeal raises narrower, but important, issues which may be summarised as follows:-",
  ];
  for (const snippet of expectedSnippets) {
    expect(pageText.includes(snippet)).toEqual(true);
  }

  await loadingTask.destroy();
});

// Regression test (issue 10900): the expected multi-line block must appear
// verbatim (including its line breaks) in the merged text of page 1.
it("gets text content, with merged spaces (issue 10900)", async function () {
  const params = buildGetDocumentParams("issue10900.pdf");
  const loadingTask = getDocument(params);
  const doc = await loadingTask.promise;
  const firstPage = await doc.getPage(1);
  const textContent = await firstPage.getTextContent();
  const pageText = mergeText(textContent.items);

  // The continuation lines below are part of the string; do not indent them.
  const expectedBlock = `3 3 3 3
851.5 854.9 839.3 837.5
633.6 727.8 789.9 796.2
1,485.1 1,582.7 1,629.2 1,633.7
114.2 121.7 125.3 130.7
13.0x 13.0x 13.0x 12.5x`;
  expect(pageText.includes(expectedBlock)).toEqual(true);

  await loadingTask.destroy();
});

// Regression test (issue 10640): the expected paragraph must appear verbatim
// (including its line breaks) in the merged text of page 1.
it("gets text content, with spaces (issue 10640)", async function () {
  const params = buildGetDocumentParams("issue10640.pdf");
  const loadingTask = getDocument(params);
  const doc = await loadingTask.promise;
  const firstPage = await doc.getPage(1);
  const textContent = await firstPage.getTextContent();
  const pageText = mergeText(textContent.items);

  // The continuation lines below are part of the string; do not indent them.
  const expectedParagraph = `Open Sans is a humanist sans serif typeface designed by Steve Matteson.
Open Sans was designed with an upright stress, open forms and a neu-
tral, yet friendly appearance. It was optimized for print, web, and mobile
interfaces, and has excellent legibility characteristics in its letterforms (see
figure \x81 on the following page). This font is available from the Google Font
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
This package provides support for this font in LATEX. It includes Type \x81
versions of the fonts, converted for this package using FontForge from its
sources, for full support with Dvips.`;
  expect(pageText.includes(expectedParagraph)).toEqual(true);

  await loadingTask.destroy();
});

it("gets empty structure tree", async function () {
const tree = await page.getStructTree();

Expand Down
2 changes: 1 addition & 1 deletion test/unit/pdf_find_controller_spec.js
Expand Up @@ -268,7 +268,7 @@ describe("pdf_find_controller", function () {
pageIndex: 0,
matchIndex: 0,
},
pageMatches: [[19, 48, 66]],
pageMatches: [[19, 46, 62]],
pageMatchesLength: [[8, 8, 8]],
});
});
Expand Down