Skip to content

Commit 86b3445

Browse files
committed
Fix #82, #86 repairs corrupt xref sections
1 parent 33da2dd commit 86b3445

File tree

2 files changed

+199
-57
lines changed

2 files changed

+199
-57
lines changed

pkg/pdfcpu/context.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ type ReadContext struct {
156156
FileName string // The input PDF-File.
157157
FileSize int64
158158
rs io.ReadSeeker
159+
EolCount int // 1 or 2 characters used for eol.
159160
BinaryTotalSize int64 // total stream data
160161
BinaryImageSize int64 // total image stream data
161162
BinaryFontSize int64 // total font stream data (fontfiles)

pkg/pdfcpu/read.go

Lines changed: 198 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -764,7 +764,38 @@ func parseTrailerDict(trailerDict Dict, ctx *Context) (*int64, error) {
764764
return offset, nil
765765
}
766766

767-
func scanLine(s *bufio.Scanner) (string, error) {
767+
// scanLineRaw advances s by one token and returns that token's text unmodified.
//
// If the scanner cannot advance, the scanner's own error is returned when it
// has one. Otherwise (for example at end of input) a generic
// "scanner: returning nothing" error is returned.
func scanLineRaw(s *bufio.Scanner) (string, error) {
	if s.Scan() {
		return s.Text(), nil
	}
	if err := s.Err(); err != nil {
		return "", err
	}
	return "", errors.New("scanner: returning nothing")
}
776+
777+
func scanLine(s *bufio.Scanner) (s1 string, err error) {
778+
for i := 0; i <= 1; i++ {
779+
s1, err = scanLineRaw(s)
780+
if err != nil {
781+
return "", err
782+
}
783+
if len(s1) > 0 {
784+
break
785+
}
786+
}
787+
788+
// Remove comment.
789+
i := strings.Index(s1, "%")
790+
if i >= 0 {
791+
s1 = s1[:i]
792+
}
793+
794+
return s1, nil
795+
}
796+
797+
// scanLineOrig is the original scanLine (renamed); scanLine now ignores comments and empty lines.
798+
func scanLineOrig(s *bufio.Scanner) (string, error) {
768799
for i := 0; i <= 1; i++ {
769800
if ok := s.Scan(); !ok {
770801
err := s.Err()
@@ -906,6 +937,39 @@ func scanTrailerDict(s *bufio.Scanner, startTag bool) (string, error) {
906937
return buf.String(), nil
907938
}
908939

940+
func processTrailer(ctx *Context, s *bufio.Scanner, line string) (*int64, error) {
941+
942+
var trailerString string
943+
944+
if line != "trailer" {
945+
trailerString = line[7:]
946+
log.Read.Printf("processTrailer: trailer leftover: <%s>\n", trailerString)
947+
} else {
948+
log.Read.Printf("line (len %d) <%s>\n", len(line), line)
949+
}
950+
951+
trailerString, err := scanTrailer(s, trailerString)
952+
if err != nil {
953+
return nil, err
954+
}
955+
956+
log.Read.Printf("processTrailer: trailerString: (len:%d) <%s>\n", len(trailerString), trailerString)
957+
958+
o, err := parseObject(&trailerString)
959+
if err != nil {
960+
return nil, err
961+
}
962+
963+
trailerDict, ok := o.(Dict)
964+
if !ok {
965+
return nil, errors.New("processTrailer: corrupt trailer dict")
966+
}
967+
968+
log.Read.Printf("processTrailer: trailerDict:\n%s\n", trailerDict)
969+
970+
return parseTrailerDict(trailerDict, ctx)
971+
}
972+
909973
// Parse xRef section into corresponding number of xRef table entries.
910974
func parseXRefSection(s *bufio.Scanner, ctx *Context) (*int64, error) {
911975

@@ -950,42 +1014,7 @@ func parseXRefSection(s *bufio.Scanner, ctx *Context) (*int64, error) {
9501014

9511015
log.Read.Println("parseXRefSection: parsing trailer dict..")
9521016

953-
var trailerString string
954-
955-
if line != "trailer" {
956-
trailerString = line[7:]
957-
log.Read.Printf("parseXRefSection: trailer leftover: <%s>\n", trailerString)
958-
} else {
959-
log.Read.Printf("line (len %d) <%s>\n", len(line), line)
960-
}
961-
962-
trailerString, err = scanTrailer(s, trailerString)
963-
if err != nil {
964-
return nil, err
965-
}
966-
967-
log.Read.Printf("parseXRefSection: trailerString: (len:%d) <%s>\n", len(trailerString), trailerString)
968-
969-
o, err := parseObject(&trailerString)
970-
if err != nil {
971-
return nil, err
972-
}
973-
974-
trailerDict, ok := o.(Dict)
975-
if !ok {
976-
return nil, errors.New("parseXRefSection: corrupt trailer dict")
977-
}
978-
979-
log.Read.Printf("parseXRefSection: trailerDict:\n%s\n", trailerDict)
980-
981-
offset, err := parseTrailerDict(trailerDict, ctx)
982-
if err != nil {
983-
return nil, err
984-
}
985-
986-
log.Read.Println("parseXRefSection end")
987-
988-
return offset, nil
1017+
return processTrailer(ctx, s, line)
9891018
}
9901019

9911020
// Get version from first line of file.
@@ -994,22 +1023,40 @@ func parseXRefSection(s *bufio.Scanner, ctx *Context) (*int64, error) {
9941023
// if present, shall be used instead of the version specified in the Header.
9951024
// Save PDF Version from header to xRefTable.
9961025
// The header version comes as the first line of the file.
997-
func headerVersion(rs io.ReadSeeker) (*Version, error) {
1026+
// eolCount is the number of characters used for eol (1 or 2).
1027+
func headerVersion(rs io.ReadSeeker) (v *Version, eolCount int, err error) {
9981028

9991029
log.Read.Println("headerVersion begin")
10001030

1031+
var errCorruptHeader = errors.New("headerVersion: corrupt pfd file - no header version available")
1032+
10011033
// Get first line of file which holds the version of this PDFFile.
10021034
// We call this the header version.
10031035

1004-
_, err := rs.Seek(0, io.SeekStart)
1036+
_, err = rs.Seek(0, io.SeekStart)
10051037
if err != nil {
1006-
return nil, err
1038+
return nil, 0, err
10071039
}
10081040

1009-
buf := make([]byte, 10)
1041+
buf := make([]byte, 20)
10101042
_, err = rs.Read(buf)
10111043
if err != nil {
1012-
return nil, err
1044+
return nil, 0, err
1045+
}
1046+
1047+
b := string(buf)
1048+
1049+
// Detect the used eol which should be 1 (0x0A or 0x0D) or 2 chars (0x0D0A) long.
1050+
// %PDF-1.x{eol}
1051+
if b[8] == 0x0A {
1052+
eolCount = 1
1053+
} else if b[8] == 0x0D {
1054+
eolCount = 1
1055+
if b[9] == 0x0A {
1056+
eolCount = 2
1057+
}
1058+
} else {
1059+
return nil, 0, errCorruptHeader
10131060
}
10141061

10151062
// Parse the PDF-Version.
@@ -1019,17 +1066,113 @@ func headerVersion(rs io.ReadSeeker) (*Version, error) {
10191066
s := strings.TrimSpace(string(buf))
10201067

10211068
if len(s) < 8 || !strings.HasPrefix(s, prefix) {
1022-
return nil, errors.New("headerVersion: corrupt pfd file - no header version available")
1069+
return nil, 0, errCorruptHeader
10231070
}
10241071

10251072
pdfVersion, err := PDFVersion(s[len(prefix) : len(prefix)+3])
10261073
if err != nil {
1027-
return nil, errors.Wrapf(err, "headerVersion: unknown PDF Header Version")
1074+
return nil, 0, errors.Wrapf(err, "headerVersion: unknown PDF Header Version")
10281075
}
10291076

10301077
log.Read.Printf("headerVersion: end, found header version: %s\n", pdfVersion)
10311078

1032-
return &pdfVersion, nil
1079+
return &pdfVersion, eolCount, nil
1080+
}
1081+
1082+
// bypassXrefSection is a hack for digesting corrupt xref sections.
1083+
// It populates the xRefTable by reading in all indirect objects line by line
1084+
// and works on the assumption of a single xref section - meaning no incremental updates have been made.
1085+
func bypassXrefSection(ctx *Context) error {
1086+
var z int64
1087+
g := FreeHeadGeneration
1088+
ctx.Table[0] = &XRefTableEntry{
1089+
Free: true,
1090+
Offset: &z,
1091+
Generation: &g}
1092+
1093+
rs := ctx.Read.rs
1094+
eolCount := ctx.Read.EolCount
1095+
var off, offset int64
1096+
1097+
rd, err := newPositionedReader(rs, &offset)
1098+
if err != nil {
1099+
return err
1100+
}
1101+
1102+
s := bufio.NewScanner(rd)
1103+
s.Split(scanLines)
1104+
1105+
bb := []byte{}
1106+
var (
1107+
withinObj bool
1108+
withinXref bool
1109+
withinTrailer bool
1110+
)
1111+
1112+
for {
1113+
line, err := scanLineRaw(s)
1114+
if err != nil {
1115+
break
1116+
}
1117+
if withinXref {
1118+
offset += int64(len(line) + eolCount)
1119+
if withinTrailer {
1120+
bb = append(bb, ' ')
1121+
bb = append(bb, line...)
1122+
i := strings.Index(line, "startxref")
1123+
if i >= 0 {
1124+
// Parse trailer.
1125+
_, err = processTrailer(ctx, s, string(bb))
1126+
return err
1127+
}
1128+
continue
1129+
}
1130+
// Ignore all until "trailer".
1131+
i := strings.Index(line, "trailer")
1132+
if i >= 0 {
1133+
bb = append(bb, line...)
1134+
withinTrailer = true
1135+
}
1136+
continue
1137+
}
1138+
i := strings.Index(line, "xref")
1139+
if i >= 0 {
1140+
offset += int64(len(line) + eolCount)
1141+
withinXref = true
1142+
continue
1143+
}
1144+
if !withinObj {
1145+
i := strings.Index(line, "obj")
1146+
if i >= 0 {
1147+
withinObj = true
1148+
off = offset
1149+
bb = append(bb, line[:i+3]...)
1150+
}
1151+
offset += int64(len(line) + eolCount)
1152+
continue
1153+
}
1154+
1155+
// within obj
1156+
offset += int64(len(line) + eolCount)
1157+
bb = append(bb, ' ')
1158+
bb = append(bb, line...)
1159+
i = strings.Index(line, "endobj")
1160+
if i >= 0 {
1161+
l := string(bb)
1162+
objNr, generation, err := parseObjectAttributes(&l)
1163+
if err != nil {
1164+
return err
1165+
}
1166+
of := off
1167+
ctx.Table[*objNr] = &XRefTableEntry{
1168+
Free: false,
1169+
Offset: &of,
1170+
Generation: generation}
1171+
bb = nil
1172+
withinObj = false
1173+
}
1174+
}
1175+
return nil
10331176
}
10341177

10351178
// Build XRefTable by reading XRef streams or XRef sections.
@@ -1039,12 +1182,13 @@ func buildXRefTableStartingAt(ctx *Context, offset *int64) error {
10391182

10401183
rs := ctx.Read.rs
10411184

1042-
hv, err := headerVersion(rs)
1185+
hv, eolCount, err := headerVersion(rs)
10431186
if err != nil {
10441187
return err
10451188
}
10461189

10471190
ctx.HeaderVersion = hv
1191+
ctx.Read.EolCount = eolCount
10481192

10491193
for offset != nil {
10501194

@@ -1063,26 +1207,23 @@ func buildXRefTableStartingAt(ctx *Context, offset *int64) error {
10631207

10641208
log.Read.Printf("line: <%s>\n", line)
10651209

1066-
if strings.TrimSpace(line) != "xref" {
1210+
if strings.TrimSpace(line) == "xref" {
1211+
log.Read.Println("buildXRefTableStartingAt: found xref section")
1212+
if offset, err = parseXRefSection(s, ctx); err != nil {
1213+
return err
1214+
}
1215+
} else {
10671216

10681217
log.Read.Println("buildXRefTableStartingAt: found xref stream")
10691218
ctx.Read.UsingXRefStreams = true
10701219
rd, err = newPositionedReader(rs, offset)
10711220
if err != nil {
10721221
return err
10731222
}
1074-
10751223
if offset, err = parseXRefStream(rd, offset, ctx); err != nil {
1076-
return err
1224+
// Try fix for corrupt single xref section.
1225+
return bypassXrefSection(ctx)
10771226
}
1078-
1079-
} else {
1080-
1081-
log.Read.Println("buildXRefTableStartingAt: found xref section")
1082-
if offset, err = parseXRefSection(s, ctx); err != nil {
1083-
return err
1084-
}
1085-
10861227
}
10871228
}
10881229

0 commit comments

Comments
 (0)