/
repairs.go
282 lines (240 loc) · 7.25 KB
/
repairs.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
// Routines related to repairing malformed pdf files.
package core
import (
"errors"
"fmt"
"os"
"regexp"
"bufio"
"io"
"strconv"
"github.com/nareshrajaram/unipdf/common"
)
// repairReXrefTable matches the "xref" keyword standing alone on a line: a CR
// or LF, optional whitespace, the keyword (captured), optional whitespace and
// another CR or LF. The match therefore starts at the line break that precedes
// the keyword, not at the keyword itself.
var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`)
// repairLocateXref locates a standard xref table by looking for the "xref"
// keyword in the bytes leading up to the current position of the underlying
// stream. Xref object streams are not supported.
//
// The search window starts 1000 bytes before the current position (clamped to
// the beginning of the file) and is at most 1000 bytes long. When several
// matches are present the last one wins.
//
// Returns the absolute offset of the match, which begins at the CR/LF that
// precedes the keyword. An error is returned when seeking or reading fails or
// when no "xref" keyword is found in the window.
func (parser *PdfParser) repairLocateXref() (int64, error) {
	const readBuf = int64(1000)

	// Find out where we are so that the look-back can be clamped; seeking to a
	// negative absolute position fails and would leave the stream in place.
	pos, err := parser.rs.Seek(0, os.SEEK_CUR)
	if err != nil {
		return 0, err
	}
	lookBack := readBuf
	if pos < lookBack {
		lookBack = pos
	}

	curOffset, err := parser.rs.Seek(-lookBack, os.SEEK_CUR)
	if err != nil {
		return 0, err
	}

	// A short read near the end of the file is fine: only the bytes actually
	// read are searched.
	b2 := make([]byte, readBuf)
	n, err := io.ReadFull(parser.rs, b2)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return 0, err
	}

	results := repairReXrefTable.FindAllStringIndex(string(b2[:n]), -1)
	if len(results) < 1 {
		common.Log.Debug("ERROR: Repair: xref not found!")
		return 0, errors.New("repair: xref not found")
	}

	// Translate the position within the window into an absolute file offset.
	localOffset := int64(results[len(results)-1][0])
	xrefOffset := curOffset + localOffset
	return xrefOffset, nil
}
// rebuildXrefTable renumbers the xref table.
//
// Every entry of the current table is looked up and re-registered under the
// object and generation numbers reported by getObjectNumber for the object
// found there. This helps when a cross reference points at an object carrying
// a different number than the table claims.
//
// If any lookup fails the table is considered beyond renumbering and is
// replaced by the result of repairRebuildXrefsTopDown instead. An error is
// returned when that fallback fails or when the numbers of a looked-up object
// cannot be determined; in both cases parser.xrefs is left unmodified by this
// function.
func (parser *PdfParser) rebuildXrefTable() error {
	rebuilt := XrefTable{ObjectMap: map[int]XrefObject{}}

	for num, entry := range parser.xrefs.ObjectMap {
		obj, _, err := parser.lookupByNumberWrapper(num, false)
		if err != nil {
			common.Log.Debug("ERROR: Unable to look up object (%s)", err)
			common.Log.Debug("ERROR: Xref table completely broken - attempting to repair ")

			// Fall back to scanning the whole file for object signatures.
			repaired, repairErr := parser.repairRebuildXrefsTopDown()
			if repairErr != nil {
				common.Log.Debug("ERROR: Failed xref rebuild repair (%s)", repairErr)
				return repairErr
			}
			parser.xrefs = *repaired
			common.Log.Debug("Repaired xref table built")
			return nil
		}

		realNum, realGen, err := getObjectNumber(obj)
		if err != nil {
			return err
		}

		// entry is a copy of the map value; patch it and file it under the
		// number the object actually carries.
		entry.ObjectNumber, entry.Generation = int(realNum), int(realGen)
		rebuilt.ObjectMap[int(realNum)] = entry
	}

	parser.xrefs = rebuilt
	common.Log.Debug("New xref table built")
	printXrefTable(parser.xrefs)
	return nil
}
// parseObjectNumberFromString parses and returns the object and generation
// number from a string such as "12 0 obj" -> (12, 0, nil).
//
// The string is matched against the package-level reIndirectObject pattern,
// whose first two capture groups are taken as the object and generation
// numbers. An error is returned when the pattern does not match or when either
// number cannot be converted to an int (for example because it is out of
// range); previously conversion failures were silently ignored and a bogus
// value was returned with a nil error.
func parseObjectNumberFromString(str string) (int, int, error) {
	result := reIndirectObject.FindStringSubmatch(str)
	if len(result) < 3 {
		return 0, 0, errors.New("unable to detect indirect object signature")
	}

	on, err := strconv.Atoi(result[1])
	if err != nil {
		return 0, 0, err
	}
	gn, err := strconv.Atoi(result[2])
	if err != nil {
		return 0, 0, err
	}

	return on, gn, nil
}
// repairFindObjSignature inspects win, the bytes read immediately before a 'j',
// for the tail of an indirect object header: "<objnum> <gennum> ob", where the
// separators are one or more whitespace bytes. It returns the index of the
// byte just before the object number, or -1 when win does not end in such a
// header or when the header reaches back to the first byte of win (in which
// case it may be truncated).
func repairFindObjSignature(win []byte) int {
	n := len(win)
	if n < 4 || win[n-1] != 'b' || win[n-2] != 'o' || !IsWhiteSpace(win[n-3]) {
		return -1
	}

	i := n - 4
	// Go past whitespace.
	for IsWhiteSpace(win[i]) && i > 0 {
		i--
	}
	if i == 0 || !IsDecimalDigit(win[i]) {
		return -1
	}
	// Go past generation number.
	for IsDecimalDigit(win[i]) && i > 0 {
		i--
	}
	if i == 0 || !IsWhiteSpace(win[i]) {
		return -1
	}
	// Go past whitespace.
	for IsWhiteSpace(win[i]) && i > 0 {
		i--
	}
	if i == 0 || !IsDecimalDigit(win[i]) {
		return -1
	}
	// Go past object number.
	for IsDecimalDigit(win[i]) && i > 0 {
		i--
	}
	if i == 0 {
		return -1 // Probably too long to be a valid object...
	}
	return i
}

// repairRebuildXrefsTopDown parses the entire file from the top down and
// builds a fresh xref table from what it finds.
//
// It goes through the file byte-by-byte looking for "<num> <generation> obj"
// patterns and records the offset of each. When an object number occurs more
// than once, the first occurrence is kept unless a later one has a higher
// generation number.
// N.B. This collects the XrefTypeTableEntry data only.
//
// The repair is attempted at most once per parser: subsequent calls fail
// immediately. An error is also returned when the stream cannot be rewound or
// read, or when a detected header cannot be parsed.
func (parser *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
	if parser.repairsAttempted {
		// Avoid multiple repairs (only try once).
		return nil, fmt.Errorf("repair failed")
	}
	parser.repairsAttempted = true

	// Go to beginning, reset reader.
	if _, err := parser.rs.Seek(0, os.SEEK_SET); err != nil {
		return nil, err
	}
	parser.reader = bufio.NewReader(parser.rs)

	// Keep a running buffer of the last bufLen bytes read (excluding the
	// current one).
	const bufLen = 20
	last := make([]byte, bufLen)

	xrefTable := XrefTable{}
	xrefTable.ObjectMap = make(map[int]XrefObject)
	for {
		b, err := parser.reader.ReadByte()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}

		// Format:
		// object number - whitespace - generation number - obj
		// e.g. "12 0 obj"
		if b == 'j' {
			if i := repairFindObjSignature(last); i >= 0 {
				// The header spans last[i+1:] plus the current byte, i.e.
				// bufLen-i bytes ending at the current read position.
				// NOTE(review): assumes GetFileOffset reports the position
				// just past the byte returned by ReadByte - confirm.
				objOffset := parser.GetFileOffset() - int64(bufLen-i)
				objstr := string(last[i+1:]) + "j"

				objNum, genNum, err := parseObjectNumberFromString(objstr)
				if err != nil {
					common.Log.Debug("Unable to parse object number: %v", err)
					return nil, err
				}

				// Create and insert the XREF entry if not existing, or the generation number is higher.
				if curXref, has := xrefTable.ObjectMap[objNum]; !has || curXref.Generation < genNum {
					// Make the entry for the cross ref table.
					xrefEntry := XrefObject{}
					xrefEntry.XType = XrefTypeTableEntry
					xrefEntry.ObjectNumber = int(objNum)
					xrefEntry.Generation = int(genNum)
					xrefEntry.Offset = objOffset
					xrefTable.ObjectMap[objNum] = xrefEntry
				}
			}
		}

		// Always slide the window, including after a rejected candidate, so
		// that it stays contiguous with the stream.
		copy(last, last[1:])
		last[bufLen-1] = b
	}

	return &xrefTable, nil
}
// repairReXrefMarker matches the "xref" keyword preceded by one whitespace
// character and followed by any amount of whitespace.
var repairReXrefMarker = regexp.MustCompile(`\sxref\s*`)

// repairSeekXrefMarker looks for the first sign of an xref table, searching
// backwards from the end of the file.
//
// The file is examined in windows of up to 1000 bytes, starting with the last
// bytes of the file and moving towards the beginning. Consecutive windows
// overlap slightly so that a marker lying across a window boundary is still
// found. Within a window the last match is used.
//
// On success the underlying stream and a fresh buffered reader are positioned
// on the first non-whitespace byte of the match (the 'x' of "xref") and nil is
// returned. An error is returned when seeking or reading fails or when the
// whole file was searched without finding the marker.
func (parser *PdfParser) repairSeekXrefMarker() error {
	// Get the file size.
	fSize, err := parser.rs.Seek(0, os.SEEK_END)
	if err != nil {
		return err
	}

	// The shortest possible match is 5 bytes (one whitespace byte + "xref"),
	// so 4 bytes of overlap guarantee that it fits entirely in some window.
	const overlap = int64(4)

	// Define the starting point (from the end of the file) to search from.
	var offset int64

	// Define a buffer length in terms of how many bytes to read from the end of the file.
	var buflen int64 = 1000

	for offset < fSize {
		if fSize <= (buflen + offset) {
			buflen = fSize - offset
		}

		// Move back enough (as we need to read forward).
		windowStart := -offset - buflen
		if _, err := parser.rs.Seek(windowStart, os.SEEK_END); err != nil {
			return err
		}

		// Read the data; on a short read only the bytes obtained are searched.
		b1 := make([]byte, buflen)
		n, err := io.ReadFull(parser.rs, b1)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return err
		}
		chunk := string(b1[:n])
		common.Log.Trace("Looking for xref : \"%s\"", chunk)

		ind := repairReXrefMarker.FindAllStringIndex(chunk, -1)
		if ind != nil {
			// Found it.
			lastInd := ind[len(ind)-1]
			common.Log.Trace("Ind: % d", ind)

			if _, err := parser.rs.Seek(windowStart+int64(lastInd[0]), os.SEEK_END); err != nil {
				return err
			}
			parser.reader = bufio.NewReader(parser.rs)

			// Go past whitespace, finish at 'x'.
			for {
				bb, err := parser.reader.Peek(1)
				if err != nil {
					return err
				}
				common.Log.Trace("B: %d %c", bb[0], bb[0])
				if !IsWhiteSpace(bb[0]) {
					break
				}
				if _, err := parser.reader.Discard(1); err != nil {
					return err
				}
			}

			return nil
		}

		common.Log.Debug("Warning: EOF marker not found! - continue seeking")

		// This window reached the beginning of the file: nothing left to scan.
		if offset+buflen >= fSize {
			break
		}
		offset += buflen - overlap
	}

	common.Log.Debug("Error: Xref table marker was not found.")
	return errors.New("xref not found ")
}
// seekPdfVersionTopDown is called when the PDF version was not found normally.
// It looks for the version by scanning the file from the top down for a
// header such as
// %PDF-1.7
//
// Only the "PDF-<digit>.<digit>" part is checked (the leading '%' is not
// required), and both major and minor versions are single digits. The first
// occurrence wins.
//
// Returns (major, minor, nil) on success. An error is returned when the stream
// cannot be rewound or read, or when the end of the file is reached without
// finding a version.
func (parser *PdfParser) seekPdfVersionTopDown() (int, int, error) {
	// Go to beginning, reset reader.
	if _, err := parser.rs.Seek(0, os.SEEK_SET); err != nil {
		return 0, 0, err
	}
	parser.reader = bufio.NewReader(parser.rs)

	// Keep a running buffer of the last bufLen bytes read (excluding the
	// current one).
	const bufLen = 20
	last := make([]byte, bufLen)

	for {
		b, err := parser.reader.ReadByte()
		if err == io.EOF {
			break
		}
		if err != nil {
			return 0, 0, err
		}

		// Format:
		// "PDF-" - major digit - '.' - minor digit
		// e.g. "PDF-1.7", where b is the minor digit.
		if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' &&
			last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' {
			major := int(last[bufLen-2] - '0')
			minor := int(b - '0')
			return major, minor, nil
		}

		// Slide the window by one byte in place.
		copy(last, last[1:])
		last[bufLen-1] = b
	}

	return 0, 0, errors.New("version not found")
}