From d52fbbb0409df15dd633a81fd871aaac08162321 Mon Sep 17 00:00:00 2001 From: Trenton H Date: Mon, 24 Oct 2022 13:16:14 -0700 Subject: [PATCH] More smoothly handle the case of a password protected PDF for barcodes --- src/documents/barcodes.py | 49 ++++++++++++++---- .../tests/samples/password-is-test.pdf | Bin 0 -> 8398 bytes src/documents/tests/test_barcodes.py | 39 +++++++++++++- 3 files changed, 76 insertions(+), 12 deletions(-) create mode 100755 src/documents/tests/samples/password-is-test.pdf diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 13e78e1813b..1f5e33d376c 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -10,9 +10,12 @@ import magic from django.conf import settings from pdf2image import convert_from_path +from pdf2image.exceptions import PDFPageCountError from pikepdf import Page +from pikepdf import PasswordError from pikepdf import Pdf from pikepdf import PdfImage +from pikepdf.models.image import HifiPrintImageNotTranscodableError from PIL import Image from PIL import ImageSequence from pyzbar import pyzbar @@ -120,7 +123,9 @@ def _pikepdf_barcode_scan(pdf_filepath: str): pdfimage = PdfImage(page.images[image_key]) if "/CCITTFaxDecode" in pdfimage.filters: - raise BarcodeImageFormatError() + raise BarcodeImageFormatError( + "Unable to decode CCITTFaxDecode images", + ) # Not all images can be transcoded to a PIL image, which # is what pyzbar expects to receive @@ -132,7 +137,7 @@ def _pikepdf_barcode_scan(pdf_filepath: str): separator_page_numbers.append(page_num) def _pdf2image_barcode_scan(pdf_filepath: str): - # use a temporary directory in case the file os too big to handle in memory + # use a temporary directory in case the file is too big to handle in memory with tempfile.TemporaryDirectory() as path: pages_from_path = convert_from_path(pdf_filepath, output_folder=path) for current_page_number, page in enumerate(pages_from_path): @@ -150,20 +155,42 @@ def _pdf2image_barcode_scan(pdf_filepath: str): if mime_type == "image/tiff": pdf_filepath = convert_from_tiff_to_pdf(filepath) + # Chose the scanner if settings.CONSUMER_USE_LEGACY_DETECTION: - _pdf2image_barcode_scan(pdf_filepath) + logger.debug("Using pdf2image for barcodes") + scanner_function = _pdf2image_barcode_scan else: - try: - _pikepdf_barcode_scan(pdf_filepath) - except Exception as e: + logger.debug("Using pikepdf for barcodes") + scanner_function = _pikepdf_barcode_scan - logger.warning( - f"Exception using pikepdf for barcodes," - f" falling back to pdf2image: {e}", - ) - # Reset this incase pikepdf got part way through + # Run the scanner + try: + scanner_function(pdf_filepath) + # Neither method can handle password protected PDFs without it being + # provided. Log it and continue + except (PasswordError, PDFPageCountError) as e: + logger.warning( + f"File is likely password protected, not splitting: {e}", + ) + # Handle pikepdf related image decoding issues with a fallback + except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: + logger.warning( + f"Falling back to pdf2image because: {e}", + ) + try: separator_page_numbers = [] _pdf2image_barcode_scan(pdf_filepath) + # This file is really borked, allow the consumption to continue + # but it may fail further on + except Exception as e: # pragma: no cover + logger.warning( + f"Exception during barcode scanning: {e}", + ) + # We're not sure what happened, but allow the consumption to continue + except Exception as e: # pragma: no cover + logger.warning( + f"Exception during barcode scanning: {e}", + ) else: logger.warning( diff --git a/src/documents/tests/samples/password-is-test.pdf b/src/documents/tests/samples/password-is-test.pdf new file mode 100755 index 0000000000000000000000000000000000000000..b16b023c33cd3caa086db4f624ebd681a3dc62fb GIT binary patch literal 8398 zcma)i2UL^G);1jp9i<8y0RbO=q5CLIx}(tD92O+Wz=1p#R) zNEN9fO*(>f|KL66o^$T~zW@Jctt9jA*|YcTJ_?v?mcE=u_qpx#Eh5Nz$6+mEjw_z7TE}y&EGJY_dNa zd~{X}m2+NVW0aw?Wzs*N6X#tyoGgvd!1lM#^;)Vomz~-W(-TG~g{M-{ZPy7>%Z{9P zjOKDKnTUAX6J-D4dLxzEDBBt|8LUb(XzNoHs zDPcBN{<8z^M~8LE}2Ry=*Mq|ysY9<0p&N}0D?(g|v5fd}K%QH4fLnc>Nx0Njk`kB*+AE{~`NLyML zj49_1n`*nt3M}tIe)2f!%3ALCg0J5%Tx(VXh;(h6*eWRxZksiRO*bJZN|4vL6F^s^ z@;;Pj?|%hZa^}t9@AfvENBvbC zHWK5^Zr)~Dl@n%euGdKERq6uJ-A*a^Vy3Zu+sWDS z=b5JCgZ11?AM7Zf?P^jlr7W?v;y0)C_pMLMczPLCwQ%MM8ld;9{RUhER#P^AFgmTC z=8|TPF^{06IP6_r9?GsQ)_i8%! z);W79mdjaO_NohIkl7$Dq?jDt*Ed-p&^~M?d*j>I`#2K@c8|2$-4$NM#-~vu7V50- z5O)qxnGu@)?aU?2gV%;RGfW*8Re5xiKA1HUx6@HgQT1X~444ahUU38Ywt8J_e#m&o zRI2LpT1Wj-A$9~BPTUuL!VR7TtPKCwxEIVgELz6H97?x^%{IVYg=O_Hf9t?8$>Bane*WMbnshk-U_q@PY} zIWxfdxe`_U@r@VRm#`MJa@R7nllx;=B^)rJ?5&fYp=IjKM|K}kwhnbahUzrT!_3W6 zZcDvIa4v@&#;=_HkO6ujqFz1z2GfhA$7opi^wveMm2$*>i^;OA&(AJ(=>ZbvyFjV; zRIK$20p$t4^&J|%XFR4h4l^GcLKWN}$?Dxy|MG@@za(Q%|G8k2>J#K$q+qoOHO~#y zAmmHh`eLc=b1N!rfXSHrfwtGcfTpE#-8~Z zNs}|aRibFrbGr6RRh1d#=k>Q60tv0FIs=%bz4?kj`8_plIKBKMe`gs#@J15+EYA*~ zdp1gP%!^HEp~I?P>0Lp0F&(<*D|p6o=gTW(ujlhoB{Oy7D#(Ul>F{{tqeV&otz@*= zLopVnh&lF(hh;Yz)RLJ(u;>+lFs^R2?nRRS{5g!WP+;7>Yd6Xf?3%jjR1mqD0AnpN z@BVYuHfzA|Uq&E3)ZKejpGQ?sd4HzD%=*ety#cwb$9=hZX|k?kTY+%uEF#f-#l3cN zXFy0;aLVe{NbsVN=u3)6;RYAJ(wkUpjD}OEd~^Gf3+dQAMh<7kIT)(Qegp8cc5(5r z$H(k&-@ZsM-?Bd?Zdi7nlZA2Lr=Iyy_WNRtYVU`HE8orGGgIy0P>A8!k@HUujnC>Z zj`GE%yU7XBkKV)y1?3!IQS${902sj=M~g&pUZ7z18!{@G4;jnzeOWP#k^S4!>P?p- zG-&y+B+i|)l3w|$e*Jmd*s*5~RB8t^I)Y#R;*=CEfA>(4w_EkrftX1?zBtwP%lLEU z)Z+x^;$tqanDW)wCKfYD0qL9jkBUL@p0dsc!DAJN_+uEL%0uwcV(;_Ek};zZH=5(` zzr?2h%#4eRWR|STj+ErSuCuFea!yi!_0EO$;MW?oqd`$T{BMd}2&rW{y9buy4&JmZ z6a1=K%Y#6|Sy_4A)BT8RnvV|r+oSG{JRwId%MM zjfN}j7t}*xO(i^Fm4Z6YgypxvLlKp&8Asmd*VI!2<@#N5Kc^rs70%)v=*}2lvI;|f zXmV^jQ?r(xMlHVdOYr4-u^sKekpthR1g#6dApS{hrPmLUo^t6iz%8#=HNowPf~i7; z%WO4Y#O=EC@+(AQ zJS2pDpYh>qii1n}B$q`mo!ym}il?;?YI|OKxt=L^9Z6h%SCVQ9f3TudJXJ5CrsBn8 z?bqa7nRkw|;KnH@s>@kd()t5RD_aiMr*&k?0KGyNl$v^jBk@P^ zFkKf7z%koomxjkiepD_N!l;9NE@bTUdiQ?l}3||2~wZtyaD0H@Hx6R@1p_uQ{UEWUO4B4YuAxfGv*}jW<7P+c+N(E2#Lo5YIms)Ba(aek zY_;Cxz3BTZWXq9W`&ZEci0d(R7Pb z`5*6!n12tF8+^9IC@*%k_(<8varDRfK*Ue)yl2w(_wIBK_3&ogx`4dHv;h@MWtjcJ zA($3U^JPy1`tht$uQbHNU#!H*fAkq&av9{D#Ot_x|1AYdLL-mIWfKiVjf2zr+#Suw z)*IU1@tc{;V^6}-+JQIjUk#R@<(3{=PStpFSGc)b=#vq19az!nicbP}+x_uN^6?6% zBXnQGuY#awpwYRmD)Z8^tEMlF8u;lAKPbK*n6P`2ZdXy}7e4KtAh1r2HMXC=J`v>R z{pQ8Mt*PR!(KL6Y_#67gnHA%$HIACj#|;kCy7B2z##^>?_?zF;01wjB6mIl}9hUCt z4fMZaW?7gS9TUHlxDEBwzjd_W7Zw8Fkzptw8;&@$Ir1|jWykmcpdj6Pi129kPS_Q( zx0-%dWVsN>{}`8V!!&6>QEJ>|B#XGJEw-zUOp|nwSWG`E*zB8+5+u>6eP)n(`xTnK z9=z-#Avtx|z&Mb39`$aS;|*ff3~}*W=fnEWuSzfKQ3_8(+zuYA*5mCK)fKJnw!~sv z-&YG*f4cPTvQTu*+A$5C60Adr(Jhnel3c0}vswSGj@>l@3e4AU14&s}?*jkzTAr+~ zp;ZwKt6!;=XPR0d^Cu^>JnE+Hx)?9^rO9T&kjW`v>;9$V`0|xU6hY>$VYRW5N}BYW zRXC32ZWQii!bQW)QTj#aUu8T_n>(}8n-Z;>7g24oLU}3$>et)@Io)Hhdc$BOWdYsx%#M>?53~50O3Owz8`czyUS{;ct2K7;mdvP6Z^uwfry&;`= zmlj~*Y7@Kh7ODRLbRp_Gby?YURA`cgW*2Xm=;x{per}C;L7WgR}%O;>bW z(#%6>&wP4Z(dRg|eK$Q%qoV}%G}xW{m!?o;pGA#}Xn|ZqVIwo~wp(a=ZRd_Lm#f0Z z{_%<7IMMLV*p<7p6}vcr0n?qVEM~+)gD$Bj?M*J)zo17ldVW}|MOV3X5lm6PdS+Bw z7eXD6QD(RK>ie{3@j|)_q#W?Hr5O9LMUwg6*o)E3A0qfSYgezlO=bGdGs5bI`7o4J z?e^$mhLFGG!(D17Oqz3iO1w{TNuhY_K3m9IrMiWL;oyXV?z*f=0#np7V32dQ(JNZv z$*XfK$HRj8od;1b5ys~Jc@JPvTw$>Vr&Fr520wR`-bI>3U;LO;dlj{&dUu3pO-FMl zbbI7l-AwwR4uLiAd27K@DQyj-bok_^@HvWy25y09*6QrzBg(VeEhgh5Slol_eXM9( zgCADjaWqB6dvvheuaW>3n5YP6Y}WF*Gadr?O^xS;c*DSr#n`teNNcTDv?&`K_Rn zPc)QDV?x4KcHDxtvN^+{R;IjcMsD2!{xWDmccadzs#Ix9+y`K-P@>#gLHi*t5`MWQ zQ}ao%cx9a`R~8R^r_d%TtV7A*+zX$I*KJY(N5y7-@5_^sWdZgmj_>SHi74DT-vGv$ z@9A}to=&ydc7dVe38KMlyZ8#aHws(1R!?6$Z^~B0D6nzT#&}-RjaK`j^Q$&z1I-Ud6+^n9dd&AXTdU-l7dwr0Cf`(mGwD!eQ zNB%JGuUD$A^&;#|M;TW{Un5dCj~<8SN||ewMR>ty!SSyq#2!e?=O^_67CQzqrT0u zRDyE%(vM4GB!%+|%&0HspBoa)zc~f$Dv{5;4+s$5du7vdIZN2hnEGkcMmKY^{do*2 zcVME=IJ^(OyWTYzzZI!y?p?3FQRt_)bX%)nw=3k`4Deo~;#Y_bn>yPOA=e$ZkQ!BkyN37odv0X8yBA-d~J;?lv zkm!YuwHve@b0k89V@otoNkzXD=SMAtXm6YA#i{QfJXXrcfBE+K!#n)ZtpTyA6(i4Q z67Sc%fc%a{d~comYAnY(D_H=ZCe7l3{UW634l)0Ll{pIM?uvISwz8dh4KLNIxh@PQ z_wF!#vj3^&UZ7Q0gjf_*y6oZi39rRAHC8Nac=>y}ZN>Miho_UZy~ZI=O=q%MQjTn! z!i8JxsfyKGA$}CEk%_9mri8LIx@M1z)zQ(SnA@=3E3H_|+xb2Y{N)Oirw#k&WORY^ zF8E`ywU8bACy!#v?^&l9##_g{(jWUqB*4y=k%k|s^Lj)(SI|jpa*ei|uJ^5gQdU?1 zZtp6gmp|2QQb^PJutFq{wa-Z9Vvq9VubC#9LVPPF){A*c=hFexB@7?c+=acmH`1kq zG-r)^{3qTFtRL~Wcl2A8u@Bt2pTU!o`d0L2!*pd1xH_`FwP21klGK&z+SGZ^Vz zxaVCRQT(ZZz4qF}B7qvI4A)__{>~#Yl9B%Vd#29$e9wzWm}+87(S+B9TNU;_ZUHYd zBW6!CU#1``$}k~BwO(BW#%YA|d3MPdduzo z4#tvf6s)F8b)M)7WlpEv0?kT^^dvXlRq2Us`W_tIUm$hVSiX0eHL{(aH@mRH@yGECGD{yjsvcHcmy~OY|^A}~#pwDA#t_mZ?RdzmrrNymKY3C^x zBtihlMltq=Cj0^mn3cK#y)ovsi0X$;dw8j5PI@RGFFLwWoNsu@+?R9n9)I;tY_357 zLsitr`c49{mp}y*Zq2xE2ywc1lb&z-k2h0eHYWInfUsPbw;$m=A)%D!;uvjB zgnGFV0c3RMbW8K1enwwtI!)%Sjaua-Oi80Ng)iE2>iivnnV-w2hHvg_r(jfv7wG?o z+x?S!`;#kuBp8hN_w_#Nuj_pSZ)XAs=j2Gj65K3ZiOwV^GND0!yoPrmfRr#N?TRMR zg5ZiL5uF_M39dwIzzO4F;|7G2J7i5IB_}Tv2?QJllz@R@KnN6t0!pDFW*{{Z-kxZQ zakQ}~0Kotd#?6x8NIL1AxcZ~3jCWQg5N&KpCsP0*eGUL|Cvl!^tRSIb;GtE6YSz)=s=X)w%y}&>(yaF>E@!~@_?2YSH?k5?S7gU@IHC ztE9{Y#ur6YG6^l$ztMflwIXj$JVkNT_xQ8V53ki!U-5$(#-S`3%`7;L$NE*zA~3K& ztGr#68=krvQ^p36Kh=ryJpM9ORO%I*hZR@B!ClS6kP$OO8+o#Q-G`#z8hwlZ2i!>g zmE?az1f=h7K{~;UfvY>=w^Rx5MmX82{~akrS2vQfE#CEH-!<`n^dV&E7!s{Wwr(b1 zAOe8|!pZ1?p{0N*D0#oZU?26g@XW3g8b(=}qoq(t7z!nYfTGPn%H-9M&A9lL9A`lmGVZ2KX;RCq zGrjKsj97#J3-jDf1ak;*6}6sL@Z%L7if%F5l6@TVRM0*5N0$$Aj79{68+DpFvG ziWCGY1;#;fN+<{z4ncvH;ZU3uLJFy*gn^)yq+n>Wt=}%m>&B89UQu9Uzp1P9*X4MA zx*hrVj}fn=+FNTehG4P3C**J2@drJuPr^G|;a#mjH-OMT`C|wa4blU`K-!`WpZgZ9 zl!yr;><;o~dM+;ma=p$R-U`^|TwC3hLTH+cfv$@xhFjTSWphNvS)bTyw&)#E{6`Z5 z=>R1lVCV_``sX3(nJt-!A;#~qceIPw2Co<+wa85p2R2(f`y`9O= ze;`}U(b@?J`TcJI07wlBGy%$zbEO0Z!*EC(42s35U|?vZ5(