@@ -764,7 +764,38 @@ func parseTrailerDict(trailerDict Dict, ctx *Context) (*int64, error) {
764
764
return offset , nil
765
765
}
766
766
767
- func scanLine (s * bufio.Scanner ) (string , error ) {
767
// scanLineRaw advances s by one token and returns that token's text unmodified.
//
// When the scanner cannot produce another token, the scanner's own error is
// returned if it has one; otherwise a generic "returning nothing" error
// reports that the input is exhausted.
func scanLineRaw(s *bufio.Scanner) (string, error) {
	if s.Scan() {
		return s.Text(), nil
	}
	if err := s.Err(); err != nil {
		return "", err
	}
	return "", errors.New("scanner: returning nothing")
}
776
+
777
+ func scanLine (s * bufio.Scanner ) (s1 string , err error ) {
778
+ for i := 0 ; i <= 1 ; i ++ {
779
+ s1 , err = scanLineRaw (s )
780
+ if err != nil {
781
+ return "" , err
782
+ }
783
+ if len (s1 ) > 0 {
784
+ break
785
+ }
786
+ }
787
+
788
+ // Remove comment.
789
+ i := strings .Index (s1 , "%" )
790
+ if i >= 0 {
791
+ s1 = s1 [:i ]
792
+ }
793
+
794
+ return s1 , nil
795
+ }
796
+
797
+ // scanLineOrig ignores comments and empty lines.
798
+ func scanLineOrig (s * bufio.Scanner ) (string , error ) {
768
799
for i := 0 ; i <= 1 ; i ++ {
769
800
if ok := s .Scan (); ! ok {
770
801
err := s .Err ()
@@ -906,6 +937,39 @@ func scanTrailerDict(s *bufio.Scanner, startTag bool) (string, error) {
906
937
return buf .String (), nil
907
938
}
908
939
940
+ func processTrailer (ctx * Context , s * bufio.Scanner , line string ) (* int64 , error ) {
941
+
942
+ var trailerString string
943
+
944
+ if line != "trailer" {
945
+ trailerString = line [7 :]
946
+ log .Read .Printf ("processTrailer: trailer leftover: <%s>\n " , trailerString )
947
+ } else {
948
+ log .Read .Printf ("line (len %d) <%s>\n " , len (line ), line )
949
+ }
950
+
951
+ trailerString , err := scanTrailer (s , trailerString )
952
+ if err != nil {
953
+ return nil , err
954
+ }
955
+
956
+ log .Read .Printf ("processTrailer: trailerString: (len:%d) <%s>\n " , len (trailerString ), trailerString )
957
+
958
+ o , err := parseObject (& trailerString )
959
+ if err != nil {
960
+ return nil , err
961
+ }
962
+
963
+ trailerDict , ok := o .(Dict )
964
+ if ! ok {
965
+ return nil , errors .New ("processTrailer: corrupt trailer dict" )
966
+ }
967
+
968
+ log .Read .Printf ("processTrailer: trailerDict:\n %s\n " , trailerDict )
969
+
970
+ return parseTrailerDict (trailerDict , ctx )
971
+ }
972
+
909
973
// Parse xRef section into corresponding number of xRef table entries.
910
974
func parseXRefSection (s * bufio.Scanner , ctx * Context ) (* int64 , error ) {
911
975
@@ -950,42 +1014,7 @@ func parseXRefSection(s *bufio.Scanner, ctx *Context) (*int64, error) {
950
1014
951
1015
log .Read .Println ("parseXRefSection: parsing trailer dict.." )
952
1016
953
- var trailerString string
954
-
955
- if line != "trailer" {
956
- trailerString = line [7 :]
957
- log .Read .Printf ("parseXRefSection: trailer leftover: <%s>\n " , trailerString )
958
- } else {
959
- log .Read .Printf ("line (len %d) <%s>\n " , len (line ), line )
960
- }
961
-
962
- trailerString , err = scanTrailer (s , trailerString )
963
- if err != nil {
964
- return nil , err
965
- }
966
-
967
- log .Read .Printf ("parseXRefSection: trailerString: (len:%d) <%s>\n " , len (trailerString ), trailerString )
968
-
969
- o , err := parseObject (& trailerString )
970
- if err != nil {
971
- return nil , err
972
- }
973
-
974
- trailerDict , ok := o .(Dict )
975
- if ! ok {
976
- return nil , errors .New ("parseXRefSection: corrupt trailer dict" )
977
- }
978
-
979
- log .Read .Printf ("parseXRefSection: trailerDict:\n %s\n " , trailerDict )
980
-
981
- offset , err := parseTrailerDict (trailerDict , ctx )
982
- if err != nil {
983
- return nil , err
984
- }
985
-
986
- log .Read .Println ("parseXRefSection end" )
987
-
988
- return offset , nil
1017
+ return processTrailer (ctx , s , line )
989
1018
}
990
1019
991
1020
// Get version from first line of file.
@@ -994,22 +1023,40 @@ func parseXRefSection(s *bufio.Scanner, ctx *Context) (*int64, error) {
994
1023
// if present, shall be used instead of the version specified in the Header.
995
1024
// Save PDF Version from header to xRefTable.
996
1025
// The header version comes as the first line of the file.
997
- func headerVersion (rs io.ReadSeeker ) (* Version , error ) {
1026
+ // eolCount is the number of characters used for eol (1 or 2).
1027
+ func headerVersion (rs io.ReadSeeker ) (v * Version , eolCount int , err error ) {
998
1028
999
1029
log .Read .Println ("headerVersion begin" )
1000
1030
1031
+ var errCorruptHeader = errors .New ("headerVersion: corrupt pfd file - no header version available" )
1032
+
1001
1033
// Get first line of file which holds the version of this PDFFile.
1002
1034
// We call this the header version.
1003
1035
1004
- _ , err : = rs .Seek (0 , io .SeekStart )
1036
+ _ , err = rs .Seek (0 , io .SeekStart )
1005
1037
if err != nil {
1006
- return nil , err
1038
+ return nil , 0 , err
1007
1039
}
1008
1040
1009
- buf := make ([]byte , 10 )
1041
+ buf := make ([]byte , 20 )
1010
1042
_ , err = rs .Read (buf )
1011
1043
if err != nil {
1012
- return nil , err
1044
+ return nil , 0 , err
1045
+ }
1046
+
1047
+ b := string (buf )
1048
+
1049
+ // Detect the used eol which should be 1 char (0x0A or 0x0D) or 2 chars (0x0D0A) long.
1050
+ // %PDF-1.x{eol}
1051
+ if b [8 ] == 0x0A {
1052
+ eolCount = 1
1053
+ } else if b [8 ] == 0x0D {
1054
+ eolCount = 1
1055
+ if b [9 ] == 0x0A {
1056
+ eolCount = 2
1057
+ }
1058
+ } else {
1059
+ return nil , 0 , errCorruptHeader
1013
1060
}
1014
1061
1015
1062
// Parse the PDF-Version.
@@ -1019,17 +1066,113 @@ func headerVersion(rs io.ReadSeeker) (*Version, error) {
1019
1066
s := strings .TrimSpace (string (buf ))
1020
1067
1021
1068
if len (s ) < 8 || ! strings .HasPrefix (s , prefix ) {
1022
- return nil , errors . New ( "headerVersion: corrupt pfd file - no header version available" )
1069
+ return nil , 0 , errCorruptHeader
1023
1070
}
1024
1071
1025
1072
pdfVersion , err := PDFVersion (s [len (prefix ) : len (prefix )+ 3 ])
1026
1073
if err != nil {
1027
- return nil , errors .Wrapf (err , "headerVersion: unknown PDF Header Version" )
1074
+ return nil , 0 , errors .Wrapf (err , "headerVersion: unknown PDF Header Version" )
1028
1075
}
1029
1076
1030
1077
log .Read .Printf ("headerVersion: end, found header version: %s\n " , pdfVersion )
1031
1078
1032
- return & pdfVersion , nil
1079
+ return & pdfVersion , eolCount , nil
1080
+ }
1081
+
1082
+ // bypassXrefSection is a hack for digesting corrupt xref sections.
1083
+ // It populates the xRefTable by reading in all indirect objects line by line
1084
+ // and works on the assumption of a single xref section - meaning no incremental updates have been made.
1085
+ func bypassXrefSection (ctx * Context ) error {
1086
+ var z int64
1087
+ g := FreeHeadGeneration
1088
+ ctx .Table [0 ] = & XRefTableEntry {
1089
+ Free : true ,
1090
+ Offset : & z ,
1091
+ Generation : & g }
1092
+
1093
+ rs := ctx .Read .rs
1094
+ eolCount := ctx .Read .EolCount
1095
+ var off , offset int64
1096
+
1097
+ rd , err := newPositionedReader (rs , & offset )
1098
+ if err != nil {
1099
+ return err
1100
+ }
1101
+
1102
+ s := bufio .NewScanner (rd )
1103
+ s .Split (scanLines )
1104
+
1105
+ bb := []byte {}
1106
+ var (
1107
+ withinObj bool
1108
+ withinXref bool
1109
+ withinTrailer bool
1110
+ )
1111
+
1112
+ for {
1113
+ line , err := scanLineRaw (s )
1114
+ if err != nil {
1115
+ break
1116
+ }
1117
+ if withinXref {
1118
+ offset += int64 (len (line ) + eolCount )
1119
+ if withinTrailer {
1120
+ bb = append (bb , ' ' )
1121
+ bb = append (bb , line ... )
1122
+ i := strings .Index (line , "startxref" )
1123
+ if i >= 0 {
1124
+ // Parse trailer.
1125
+ _ , err = processTrailer (ctx , s , string (bb ))
1126
+ return err
1127
+ }
1128
+ continue
1129
+ }
1130
+ // Ignore all until "trailer".
1131
+ i := strings .Index (line , "trailer" )
1132
+ if i >= 0 {
1133
+ bb = append (bb , line ... )
1134
+ withinTrailer = true
1135
+ }
1136
+ continue
1137
+ }
1138
+ i := strings .Index (line , "xref" )
1139
+ if i >= 0 {
1140
+ offset += int64 (len (line ) + eolCount )
1141
+ withinXref = true
1142
+ continue
1143
+ }
1144
+ if ! withinObj {
1145
+ i := strings .Index (line , "obj" )
1146
+ if i >= 0 {
1147
+ withinObj = true
1148
+ off = offset
1149
+ bb = append (bb , line [:i + 3 ]... )
1150
+ }
1151
+ offset += int64 (len (line ) + eolCount )
1152
+ continue
1153
+ }
1154
+
1155
+ // within obj
1156
+ offset += int64 (len (line ) + eolCount )
1157
+ bb = append (bb , ' ' )
1158
+ bb = append (bb , line ... )
1159
+ i = strings .Index (line , "endobj" )
1160
+ if i >= 0 {
1161
+ l := string (bb )
1162
+ objNr , generation , err := parseObjectAttributes (& l )
1163
+ if err != nil {
1164
+ return err
1165
+ }
1166
+ of := off
1167
+ ctx .Table [* objNr ] = & XRefTableEntry {
1168
+ Free : false ,
1169
+ Offset : & of ,
1170
+ Generation : generation }
1171
+ bb = nil
1172
+ withinObj = false
1173
+ }
1174
+ }
1175
+ return nil
1033
1176
}
1034
1177
1035
1178
// Build XRefTable by reading XRef streams or XRef sections.
@@ -1039,12 +1182,13 @@ func buildXRefTableStartingAt(ctx *Context, offset *int64) error {
1039
1182
1040
1183
rs := ctx .Read .rs
1041
1184
1042
- hv , err := headerVersion (rs )
1185
+ hv , eolCount , err := headerVersion (rs )
1043
1186
if err != nil {
1044
1187
return err
1045
1188
}
1046
1189
1047
1190
ctx .HeaderVersion = hv
1191
+ ctx .Read .EolCount = eolCount
1048
1192
1049
1193
for offset != nil {
1050
1194
@@ -1063,26 +1207,23 @@ func buildXRefTableStartingAt(ctx *Context, offset *int64) error {
1063
1207
1064
1208
log .Read .Printf ("line: <%s>\n " , line )
1065
1209
1066
- if strings .TrimSpace (line ) != "xref" {
1210
+ if strings .TrimSpace (line ) == "xref" {
1211
+ log .Read .Println ("buildXRefTableStartingAt: found xref section" )
1212
+ if offset , err = parseXRefSection (s , ctx ); err != nil {
1213
+ return err
1214
+ }
1215
+ } else {
1067
1216
1068
1217
log .Read .Println ("buildXRefTableStartingAt: found xref stream" )
1069
1218
ctx .Read .UsingXRefStreams = true
1070
1219
rd , err = newPositionedReader (rs , offset )
1071
1220
if err != nil {
1072
1221
return err
1073
1222
}
1074
-
1075
1223
if offset , err = parseXRefStream (rd , offset , ctx ); err != nil {
1076
- return err
1224
+ // Try fix for corrupt single xref section.
1225
+ return bypassXrefSection (ctx )
1077
1226
}
1078
-
1079
- } else {
1080
-
1081
- log .Read .Println ("buildXRefTableStartingAt: found xref section" )
1082
- if offset , err = parseXRefSection (s , ctx ); err != nil {
1083
- return err
1084
- }
1085
-
1086
1227
}
1087
1228
}
1088
1229
0 commit comments