run example/node/getinfo.js could not get the right character #7712

wanghaisheng · 2016-10-12T07:12:32Z

I just cloned the latest source and ran gulp dist.
When I run node getinfo.js against the following PDF, it outputs this:


[wanghs@db2 node]$ node getinfo.js
# Document Loaded
Number of Pages: 1

# Metadata Is Loaded
## Info
{
  "PDFFormatVersion": "1.4",
  "IsAcroFormPresent": false,
  "IsXFAPresent": false,
  "Creator": "Online2PDF.com",
  "Producer": "Online2PDF.com",
  "CreationDate": "D:20160813074220"
}

# Page 1
Size: 594.75x419.25

## Text Content
                    # # $$$        $     BBB B$  F F  F F F KK  QQ SS SB$SSSSSS  \\\\\\\\\  eee hhhhh  p p p p p p
                                                                                                                      p p p p p p  p  p p p p p p p p p p p p p  p p   p p p p
#  e    p  p p            h �     �

# End of Document

Should I change the encoding afterwards, or is it just that pdf.js cannot deal with this PDF?

        var strings = content.items.map(function (item) {

11.pdf
When debugging with Adobe Reader you see:

At first I assumed it was caused by the embedded fonts,
so I tested another PDF whose content is the same but whose fonts are not embedded:
3.pdf

and
广西壮族自治区人民医院检验报告单1.xps.pdf

The text was updated successfully, but these errors were encountered:

wanghaisheng · 2016-10-12T09:26:36Z

Further investigation using the xpdf tools:


root@1f079ec22e77:/tmp/clear-pdf# pdffonts   original-file/11.pdf
name                                 type              encoding         emb sub uni object ID
------------------------------------ ----------------- ---------------- --- --- --- ---------
SRPUEP+SimSun                        TrueType          WinAnsi          yes yes yes     13  0

edit getinfo.js

var iconv = require('iconv-lite');
.......
            item.str =   iconv.decode(item.str, '1252');

It is still not working.

wanghaisheng · 2016-10-12T09:49:27Z

         console.log(   JSON.stringify(strings));

will output


## Text Content
["\u0000","\u0001","\u0002","\u0003","\u0004","\u0005","\u0006","\u0007","\b","\t","\u0002","\n","\u000b","\f","\r","\u000e","\u000f","\u0010","\u0010","\u0010","\u0010","\u0010","\u0010","\u0010","\u0010","\u0010","\u0018","\u0018","\u001a","\u0010","\u001a","\u001a","\u001a","\u001e\u001e\u001e\u001e\u001e","#","#","$\u001e$\u001e$","\u001e","\u0010\u0010\u0010\u0017\u0010\u0017\u001a\u0017\u001a\u0010   \u0017\u0017\u0002\u0017\u001e\u0017\u0017\u0017\u0017\u001a\u001a\u0017\u0017","\u0017\u0017\u0017\u0017\u001e\u001a\u0010\u0010\u0017\u0010\u0010\u0010\u0010\u0010\u0010\u001e\u0010\u001a\u0017\u0017\u001a","\u001f\u001e\u001e\u001f\u001e\u001f","\u001f\u001f\u001f$\u001e\u0010\u0017\u0010\u001a\u001a\u001a\u0010\u0017\u0010\u0010   \u001f\u001e\u001f\u001e\u001f\u001e\u001f\u001f\u001f\u0017\u001f\u0010\u0010\u0010\u0017","\u001f\u001f\u0017\u0017\u001e\u001a\u0010\u0010\u0017\u0010\u0010\u0010\u0010\u0010\u0010\u001e\u0010\u0010\u0017\u0017\u0017","B\u001e\u001eB\u001e\u0010\u0010B","B\u001e\u001e\u001e\u001e$\u001e","\u0003\u001e\u001e\u001e\u001e\u001f\u001e","F","F","\u0017","F","F","F","KK\u0017\u0017\u001e\u001a\u0010\u0010\u0017\u0010\u0010\u0010\u0010\u0010\u0010\u001e\u0010\u0010\u0017\u0017\u0010","\u001f\u001e\u001e\u001f\u001e\u001f\u001f","\u001f\u001fQQ\u001e","SS","SB$S\u001e\u001f\u001fSSS\u001fSS","\u001e\u0010","\\\\\\\\\\\\\\\u001f\\\\","\u0014\u0010\u0010\u0010\u0010","e\u0017ee","h\u001a\u0010\u0010\u0010\u001e\u001e\u001eh\u0005hhh","\u0010\u0010\u0010\u0017\u0015\u0015\u0015","p","p","p","p","p","p","\u000b","\u001f","\u001f","p","p","p","p","p","p","\u0017","p","\u0010","p","p","p","p","p","p","p","p","p","p","p","p","p","\u0017","p","p","p","p","p","p","p","\r\u000e\u0017\u0017\u001e","\u001a\u0010\u0010\u0017\u0010\u0010\u0010\u0010\u0010\u0010\u001e\u0010\u0017\u0017\u0017\u001a \u000b\f\u001f\u001e#\u001b   
\u001bp\u001f\u001e\u001b\u001b\\","\u0018","e","\u0018","\u001e","\u0018","p","\u001e","p","p","\u0018","\u0018","\u0017","\u000f","\u001e","\u0018","\u0018","\u001e","\u0018","\u0018","\u001e","h","�","\u001e","\u001e","\u001e","\u001e","�","\u001e"]

but

         console.log(   JSON.stringify(iconv.encode(strings, 'win1250')));

output


## Text Content
{"type":"Buffer","data":[0,44,1,44,2,44,3,44,4,44,5,44,6,44,7,44,8,44,9,44,2,44,10,44,11,44,12,44,13,44,14,44,15,44,16,44,16,44,16,44,16,44,16,44,16,44,16,44,16,44,16,44,24,44,24,44,26,44,16,44,26,44,26,44,26,44,30,30,30,30,30,44,35,44,35,44,36,30,36,30,36,44,30,44,16,16,16,23,16,23,26,23,26,16,32,32,32,23,23,2,23,30,23,23,23,23,26,26,23,23,44,23,23,23,23,30,26,16,16,23,16,16,16,16,16,16,30,16,26,23,23,26,44,31,30,30,31,30,31,44,31,31,31,36,30,16,23,16,26,26,26,16,23,16,16,32,32,32,31,30,31,30,31,30,31,31,31,23,31,16,16,16,23,44,31,31,23,23,30,26,16,16,23,16,16,16,16,16,16,30,16,16,23,23,23,44,66,30,30,66,30,16,16,66,44,66,30,30,30,30,36,30,44,3,30,30,30,30,31,30,44,70,44,70,44,23,44,70,44,70,44,70,44,75,75,23,23,30,26,16,16,23,16,16,16,16,16,16,30,16,16,23,23,16,44,31,30,30,31,30,31,31,44,31,31,81,81,30,44,83,83,44,83,66,36,83,30,31,31,83,83,83,31,83,83,44,30,16,44,92,92,92,92,92,92,92,31,92,92,44,20,16,16,16,16,44,101,23,101,101,44,104,26,16,16,16,30,30,30,104,5,104,104,104,44,16,16,16,23,21,21,21,44,112,44,112,44,112,44,112,44,112,44,112,44,11,44,31,44,31,44,112,44,112,44,112,44,112,44,112,44,112,44,23,44,112,44,16,44,112,44,112,44,112,44,112,44,112,44,112,44,112,44,112,44,112,44,112,44,112,44,112,44,112,44,23,44,112,44,112,44,112,44,112,44,112,44,112,44,112,44,13,14,23,23,30,44,26,16,16,23,16,16,16,16,16,16,30,16,23,23,23,26,32,11,12,31,30,35,27,32,32,32,27,112,31,30,27,27,92,44,24,44,101,44,24,44,30,44,24,44,112,44,30,44,112,44,112,44,24,44,24,44,23,44,15,44,30,44,24,44,24,44,30,44,24,44,24,44,30,44,104,44,63,44,30,44,30,44,30,44,30,44,63,44,30]}

yurydelendik · 2016-10-12T13:50:31Z

It looks like the character encoding was removed from the PDF -- I cannot extract any text using Mac Preview or Adobe Reader. There is nothing that can be done -- the PDF must contain proper text encoding information in its fonts. Closing as won't fix.

wanghaisheng · 2016-10-12T15:27:19Z

How did you find out that the character encoding was removed from this file? Can you suggest any workaround I could use to regenerate the PDF?

yurydelendik · 2016-10-12T15:37:31Z

How did you find out that the character encoding was removed from this file?

Mostly by testing the PDF file in other viewers -- if even Adobe Reader cannot do it, there is a really low chance that any other reader can.

Using OCR to recognize glyphs is out of scope of this project.

wanghaisheng · 2016-10-12T15:45:49Z

You mean right-clicking to copy the text somewhere else and then finding that it comes out garbled? Then can I say that this PDF should be left to an OCR engine to deal with?

yurydelendik · 2016-10-12T15:48:09Z

@wanghaisheng make sure your PDF documents are published in the PDF/A standard to avoid such issues.

wanghaisheng · 2016-10-12T15:55:04Z

@yurydelendik The world is cruel; all these PDF files come from clients, which is out of my control.

yurydelendik closed this as completed Oct 12, 2016

wanghaisheng mentioned this issue Oct 12, 2016

how to regenerate a pdf file without embedding fonts #7710

Closed

wanghaisheng mentioned this issue Oct 12, 2016

xps/pdf/png/json转换 clear-datacenter/plan#18

Open

wanghaisheng mentioned this issue Oct 12, 2016

PDF Font encoding angea/PDF101#2

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

run example/node/getinfo.js could not get the right character #7712

run example/node/getinfo.js could not get the right character #7712

wanghaisheng commented Oct 12, 2016

wanghaisheng commented Oct 12, 2016

wanghaisheng commented Oct 12, 2016

yurydelendik commented Oct 12, 2016

wanghaisheng commented Oct 12, 2016

yurydelendik commented Oct 12, 2016 •

edited

wanghaisheng commented Oct 12, 2016

yurydelendik commented Oct 12, 2016

wanghaisheng commented Oct 12, 2016

run example/node/getinfo.js could not get the right character #7712

run example/node/getinfo.js could not get the right character #7712

Comments

wanghaisheng commented Oct 12, 2016

wanghaisheng commented Oct 12, 2016

wanghaisheng commented Oct 12, 2016

yurydelendik commented Oct 12, 2016

wanghaisheng commented Oct 12, 2016

yurydelendik commented Oct 12, 2016 • edited

wanghaisheng commented Oct 12, 2016

yurydelendik commented Oct 12, 2016

wanghaisheng commented Oct 12, 2016

yurydelendik commented Oct 12, 2016 •

edited