From cbbfa044d76720cac561ab88bda61466dcb989b0 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Thu, 13 Jun 2024 08:55:32 -0400 Subject: [PATCH 1/4] . --- learning_resources/etl/utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index 8ba96a966e..c7257827af 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -6,6 +6,7 @@ import mimetypes import os import re +import tarfile import uuid from collections import Counter from collections.abc import Generator @@ -509,11 +510,8 @@ def calc_checksum(filename) -> str: Returns: str: The md5 checksum of the file """ - hash_md5 = md5() # noqa: S324 - with Path.open(Path(filename), "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() + with tarfile.open(filename, "r") as tgz_file: + return str(hash(tuple(ti.chksum for ti in tgz_file.getmembers()))) def get_content_type(file_type: str) -> str: From bea33f9527ff0d034d39c373f35a4b00824bc811 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Thu, 13 Jun 2024 11:52:18 -0400 Subject: [PATCH 2/4] Better contentfile archive comparison --- learning_resources/etl/edx_shared.py | 1 + learning_resources/etl/utils_test.py | 19 ++++++++++++++++++ .../course-v1:MITxT+8.01.3x+3T2022.tar.gz | Bin 2361 -> 2272 bytes ...1:MITxT+8.01.3x+3T2022_minor_change.tar.gz | Bin 0 -> 3070 bytes ...e-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz | Bin 0 -> 2272 bytes 5 files changed, 20 insertions(+) create mode 100644 test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz create mode 100644 test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz diff --git a/learning_resources/etl/edx_shared.py b/learning_resources/etl/edx_shared.py index 847c908e15..3d3845eb81 100644 --- a/learning_resources/etl/edx_shared.py +++ b/learning_resources/etl/edx_shared.py @@ -135,6 +135,7 @@ def sync_edx_course_files( bucket.download_file(key, course_tarpath) checksum = calc_checksum(course_tarpath) if run.checksum == checksum: + log.info("Checksums match for %s, skipping", key) continue try: load_content_files(run, transform_content_files(course_tarpath, run)) diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 67620f3154..a99c7b7229 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -462,3 +462,22 @@ def test_parse_certification(offered_by, availability, has_cert): def test_clean_data(input_text, output_text): """clean_data function should return expected output""" assert utils.clean_data(input_text) == output_text + + +@pytest.mark.parametrize( + ("previous_archive", "identical"), + [ + ("test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz", True), + ("test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz", False), + ], +) +def test_calc_checksum(previous_archive, identical): + """ + calc_checksum should be able to accurately identify identical vs different archives. + All 3 tar.gz files created on OSX from the same directory. One archive has a minor + text change in one file, "400" to "500". + """ + reference_checksum = utils.calc_checksum( + "test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz" + ) + assert (utils.calc_checksum(previous_archive) == reference_checksum) is identical diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz index 5ed563400ce41dc541e220c0151474619961db70..9475e6870145304e13d8f966e65f3956d827c3a4 100644 GIT binary patch literal 2272 zcmV<62p{(!iwFQj4Qplq1MOVPa^tuaby77+#hck>jbf^@$XMb_l$?&Fl>6balXi8- zcGA=7>QW#Qk}xI-HUPGy>hfP?KkH0t2eQpRRkn6&){jn^Y8Ff z-~aV-RUTblo@XCbiNC`0q>$xDi|b#yEU4os@<~0wt`8S~viHx!f0)1j)ydU&-~H`B z{4f9g_xEpRh1Gb8Cnp4ZgzLNW1Dou>VOjemc3BMyJ^MYVdCG zyXotfc678Q?3;#zE`Y%E*Q*I^%USGiSFWE5QmfZ@dXG)^uk58T{@V?E4>>z!mEkkD z|4jco?FM((f1_FHzZ_(W$9)#dU3r0f=)d7OmHx{>rvHff)TKo1qzl|#{|?LmEB%*& zO#d_-u$@f;?qUB{t6Ay44CKcD`Xumoa1W42v(Z?i|JDurx0_bA{!<3pspe74{a%P6 z4%F;R%djjBF+S{QXQ!8wOAV!&I@)2~Fzfb2)4LDNyTW!boBs(x`Clo> z>py+l_As!~|C?q(|IKF8to(l&XlI(Q9Qx;p<@tXcc~}xrAIXQef2*?pQqX38x2?m= zLldd+ASkZ?WHW$o1q-)!kB#Dm)~L_Mb-o+YJQo9{)M5 zMz#J|3U1MV@_o9cVPJ#*w;tDTA{=XD_*jJ~$w5%IsYk!YWdO1x3q4ZQp(0fnG zxJN?V_lXC2B#+6#BA-Wv8w){M*pn=xuETof#1=mwY|{cAdXp zM}GGAQo3L-cupfO%W_IDEYj3yRvCM4u>%wCe0$@a{WHR zJfz`pR{NW`ONoN`gIUiO)-6GB@V6bQZaG$cc*H zh*K{oz>eSY-x*?G?I7X~hmvq=3`AXKA>mG9PCK*M>OSxgio z1~UjB_lX+$wUbu`qnI1S2@A-W@vCC`n9$)!Rc+KwXVrkYHL5cv&vAdNB#d<=){E2)Xlr$!kwgkMuE5=KNdP3)sB=XB7Rv*)%HuUj}mehfs&(&?P+r_Bo66 zzJC|EhyI&pqtbsF$m!pUc@j5oc?a;$`ZrsqQ~mz06y)?Tsr1R#w}0=U|7OFf=KrN2 zr~h;!=Mmv@8g0)J+{6DHExXcx8OZ6My(e5P@|NrWA^dlW{@<#`|8me4#8sA#TpuE+ zj+S%;tqVa^dt`Nw)HifFq5`FBw{0D+s>G;!8oGY$5j2Ga5E~$Qd)0MxYAr zwq*1~JSbR{i4u|MnM~5Y_^AoV`X~We_YjbEQ$Xwb-~hi%yI$csCn92@$jm0~n#anR zpwIhLbR@2z*Z(0VLKT{u(M?Uv>RE^yTJ&m!B^r}H>T@<0z&dj{rzjTWw~Iz+PKKZt z5eX6TzCoZ5sZ3P)qAP*=aDOzWa)jKNbD)M`v{ZW^LdodKh_iqkpl4vO2k01Ve!wON zh;ej*2bj}(CaUa47la_?H`^GEAZte(Ng0Vpx=y@FofGVVFYuTv>WmNdh_i1&VnSy? ztp+)2$u=N#lVMCf0%5!EtBoP44>-8do*K#s*Ruwv=Q*{mPk}!$6>${7G!H~N%K+-@ zB4}yifC2{D;;1HBP3<-%Vjh zOTP*39{(Fnvs(Wv1#9#l(2xdk&=W3%P`${N_w5dw^Pfp$7&VV|IMaT*?$?B{r-RTDc_E!4%}&I`K)bc(@9%ILMXW6bnREkt4hxw ztnI8ilsd~?I8octKK$}s%;ayD4$`I<)M40b4b9HZcGSsPlshlm(cZk((BhQrtaTen uB$1{%JBWlI&;Avmp8Xj@Jv%h5s}MAmW)*=}s8Hb(!2bXUdpfHCS^xm;@RscW literal 2361 zcmZ|PX*ksV8V7I{4_UH>4r~P!f_mroxn37BrwQJL(A{- zRc)U@AC2Hkfjo<@xPVbTi5?nP5XC?CFrgyMut)jr{F+x?gVZSQC4F4#mWkD??nI?h zB?m**Eoy*j=P&2ULO1h~0fc~`xx8KTq0O%IQ|EE-ae%r1F}KU7YpO9e8PkWOCjI1R zQfJ=H|3L~nRm3lHo^&GP`nId%3L!b{v|TqQbS~7udZR;x_~m(T&9vZXSj8PIS z$>QX19Q#R0WK^!I7;r}+LG}ja7xfG~ zfET>ioq&odOrtn(vI#N}i;q@STjsD}!G;=>$iLZnvar$XQpAy%&)h$YEk+|3JbZ$0 zPjFd`wzyvc-7PvM54cVBeW{_2KcEu$qWs#Rv*9NsO>fHl>JKOyO zIy76^2+6U--kY?Wvdw>kP;V{zwHIE}|T$ zp99scOo(C%AmTEN0>MPU4UWt&;LWzdxAiM5mYqF2L{_Y?(0?Nb>ZRDC zA9yH`9I3ZatC}3gL_Pu)?ND&U6Gp+V>VS1@Ek?I;r`>Y-;O(Uc?nZKg*_rbdg#+OiWkNdBcniBc0<#GgO_Sm- z1NlCxr1UX?vc#!xK82<0yt8VDvtw_Ov0h+y^P^k8$*TuxQD`)%VqmlWEZL^!944+HOCJLVKXxTqiGV47&0R05aOIk@XE zT(Gcm2TkPg-tG1eU$r}qtyuLOZ<+*RKBBlg&l{gwo%!QBbo@s+`YFm%EHS3m-b|sO zK*sa1UFcc2#+xUVsxEbjwyq`%SQz&n%J|?~R{F7zzwKUQx1?oTL-@7C+MdA|xR*D- zWlkjQl5SD3!yPgY)o5kpYk3KYRq)AI#Mx$>=>AK}$om+s&ct4fe84iM&p773_t#3b za!X(T$I9|k{1y%=G}N{8vk!LUFdm#bXT@X5S1pCN67pOcRzP>pR~soTjYzzdcA4{i zro;8MH$iCMn#oSDK=3Hh1f3=Xr_>WW&TZ%bm$16{Jo?ZNwkQo6C4=jn03;1^mwmRx zh?HST&qL952~6bgj{@zD&={!CV_}6)PRQ4{5o=BQzHkQtO251vN5RFSbV5!lnAf95n2V5`#_N-TYcZy2u{ zmSe+GRylImEI&q<*3VqG9lbHiedQ`5`$fmOR}E)O+E4cZecS?$`8GCQd}(-@+Rrr{ zuieBJ$8IBKYe78v+PqsXLSd@w>&1}t*%F6bBO#a!P>oxMErM@caz2H*^A(7=47yk8 zjcz;Nw{8Q2q`PG3-%0<@I;ROJ=&Be$f6UM0_Y$->bCBI#;B;W9M>GE(zDI!c`z(=$ zWE617XPOjy4#0O_PT(MK5kT`M?4N!QVqq(|!-gF#8rNX z9^|?b7KkE=B4pq~-5B}~=s&qcGrBLnsB*1;o2H4auS@DI-m-T?eRKmy^gRx*7_Vlc zcRdVy^C)X&!JymqT)A}ynjWK^?8jmEIbG^?)K|lt{@iu(OWCYso4F_ERexynd||-d z_c9^y&QG)M#VEbpxZ(23I-?Zl|mRVT~Jnfp%9 zh5n2q^XV~I*;?bDc;wg+Zk}<3c>%{<9MNrAFa8OBY>EkH21UdzfwQ;s9!x2-yiMTf^hLr2 zC&xzAaO$tun*QbA)bDhTrFbPwYqlaX_|%np6my51Y2USbgYz?IcFaGSL=k5nYR1hb zsK339HqB8DebZy;`*d#c#E^(QVx#&<2x|J8j9#Pw{=0=%D!SyH#5Cf`A5-|Nuc>|# zMPqsW_mn8H_yWD4O8#_088@q4OE_-OtscH^9^I+C+s}iQbDpZ7J9kU$p`08Bix|Hh zDkFmS>ILQiiUL=LnLAg(MjxoMi5X4&2_XT*;ZXhSB=Z6F0U(k)lmG8T@7qEKWtesI Ibnx)}1#{i;y8r+H diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..65ea0fa01887c9a1dd326f34bdba383ffcc447e7 GIT binary patch literal 3070 zcmVWiwFRP25V*j1MOYSkK9NV?+IB5g9JGt!~qpvfe@Q<`=jli3DZL+nVHS7 z2??1Y+3af7TWwc&d&li^s_b<4Mp@;;iR%h+TW$zWX#W6i9FX`2koX6d0|yRpLZZs9 z{;;P%CNtfhWT$>gcTKxK{OY}WuU?hwIcnX(F`=lmL};2;wJeaNcA9ENB~3X@vjX&r zZ5p;_7*!o;x@qZp324hCYJiwVkT6m@Fp3(8MAOQ#V$J4^cG9kZVfwn@jcW#Te9Atzb46B0*T3c9jtaxjM`{N; zKD+qU(qGm;*WdZ^?W4~=`@_GzFxIfb<+fQ_HS-KxOIE~;jY?aJR)jV zUu!p8JA3!;?Dr1(;Xbt9YCdWHtpE7F8E!8KtGZ@^14h7|$CC+Il(|@4EN5R2q*`BI z>OJP^Ki8Ms{BKt560nv^mB0npe6ni&Cu9fz&>`xl=3eELhx!`-8rZq zM3^kW17Q5HE0yW;*Q{}+Ih`2TyQ)c;Q$vO$R(SEn^O=KuEwwIp6m2>l-| z1D0S1=IOs_MQd%(by2XoZp~U1?0Eqj zNc%w(*Y&X8(EmuU9nf*!o{HH97wm zgq54ItCvb#qpp@d|?52O!cLB7KpjDm{ zUKJ@2=?#&-NK;|7R>b0CMcjGiib%yC!3q%RjhE`Ogv5oxSJLDvpZ|ADlh^+e5)xl- ztX$qX*tuC6E(nenV4n}i{(e!K?gU&)Hx$1!Ru_o$hDcwesdNM+BqSt?#_;?PY~Xn6 z!>_Yp-ubWe{D1cRmm#136@ypu{J-((=fC;nS3m#P@4xrwUq^rY(QnV*O^L}kcK&Ax zc3__VS9Cq6|F&sq^88N`Fsg&|Kenw^*L7=MQ%!qg!!S;G_}X+c1z(0`*U<9`Z4CjLk5b^ImVWA^z^e)X9Co3e??$3l{|=vUk@Pj z0~Pbo&E$$wF{RFz1ofvL>ef*J z8$NPbnTW*b+NhlCQ75L)-*7V8jT4KG^BAd``?*O%x=zVqG=Yw91-p9_sdWO=JJw%N8Tp z_d6(v>U@kuqK>hq>uY*-avd+Aj4e)tNu4jLjfahe0cbt)yEh7;oQ4 zkOW@P8npiY2VDWz&Wd|a< zl?r!4iKriM~vQDf54eK$n}D zh4=w;=6~HZbL&4tw@vx{uMmj&|Lq6cJG&1uc0V_9xdey&UEu+rMU3pQRytlV%+vFk zw2heu;ll>vGrw~CQBEm_3UM2EP#2Sstu}93Q*~?7gt;(M2bA1_jfE1S%_3qp zyY6~!gh*!^&qOxZ?ZIPZ#z-~`qf^OZgXpjQk@n^3lQ{|DHnrcEytZ ze<8@oKXv3>L>=OVi!%hLu>V@sl=3eE8TqSq{1g$s&AU(|Fi-x5J$nChrJC6PiYe#+ zVsMi9&reEe{O6xe*5p|HC$I6ll6Vo+7*qY6s@(ziDUnpRND6{g?5d%SZnIf2tZ4)v`AX!>($^s=YBQ z0yL+twL9An$$Kpxwz>0UgC5-zA)MC6vSLziAp)e*QNM zIsX@eO^O`u=>vx;BdsaPKv1?;0oZICTig5>IBa_qq|a_Q%j_!847c_J$B$hE`YZ$Z zIB1C|AuzUn0BuCj_gMpIY6-UHh<%sk1QGNCpj+CS#rW67#HH$HIow)JZ~{1QR@5CI+I9Vd)G!6%($zXc>*Px?ZSJl|(j z1}-=XaMwp}i(QNuqge|H<($}vQyy`Sh`_zM5m)bYeg1 zt6|Vm0Q9|9P*Z%=j1<<@zK?232a=W-T-Si1fv^WOUMi|7;{=0!-Y$4kACOjNv?Zg3 zch1~O3*AK>rq!$z>z^qKhFdp1oXED%htTLqMmb>*=#1)w2}{a?C+S#x-56w?h(X3F z7-Sr0(5x}ogiq3;m%C074RJt)+N48sk1G=!^L`&}(<3(Oe-R_fD-G4?xFiPcj9ATj z^r#IZ&_xZ3kFSt^rKI0c=9zse)DFMh6H1ZZ|g}XtSa< zr5!~fy3gEL#;Vuz}jw8|mDA2E*6GG?T$u@Y;^F(5WfTCwM%G78E*KNu2~ zn}nIsI-kmz4=*~L#&alJzQ^cil+rj1F^L#U2L+h;wgBsCVt_iB6^p~Q2(NLsY}`zZ z`jFL#6G~zr!OU1oxbc{?0tU3p#M4cr(_of7J>SLX0yu^Ibz9265ajlMvrmA%1w8yMopk>L2`~St@ z72p58Dz=2QzyH@fQ$HmpV#@ySLhZo({omaCKMiIPwP3Pm6-u=z1x^&|Uz>M=>3!42;^ZZ}4b$R}$5X_K&#|ykp+^JIs6I2hK=(NLO zzWnp&|IKPep8qTaRv}E5cvX>k|DTBdDf<0SmQl^0|I=+t>VFX!y#H@Y#8gqPC_y`{9!qM?V=0q)jjIVA$sh7{t!j zc;qa|#LL!{_dig;C`xwFySYRpJIy?H5KuoJycJOuFGEyC&~%w^L8si5JJ1pm60ZUN M4|;Gwhya!V0HfzE_y7O^ literal 0 HcmV?d00001 diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..794f3dfec71caa4f5b66d90d3e092609f1957e26 GIT binary patch literal 2272 zcmV<62p{(!iwFR#25V*j1MOVPa^tuaby77+#hck>jbf^@$XMb_l$?&Fl>6balXi8- zcGA=7>QW#Qk}xI-HUPGy>hfP?KkH0t2eQpRRkn6&){jn^Y8Ff z-~aV-RUTblo@XCbiNC`0q>$xDi|b#yEU4os@<~0wt`8S~viHx!f0)1j)ydU&-~H`B z{4f9g_xEpRh1Gb8Cnp4ZgzLNW1Dou>VOjemc3BMyJ^MYVdCG zyXotfc678Q?3;#zE`Y%E*Q*I^%USGiSFWE5QmfZ@dXG)^uk58T{@V?E4>>z!mEkkD z|4jco?FM((f1_FHzZ_(W$9)#dU3r0f=)d7OmHx{>rvHff)TKo1qzl|#{|?LmEB%*& zO#d_-u$@f;?qUB{t6Ay44CKcD`Xumoa1W42v(Z?i|JDurx0_bA{!<3pspe74{a%P6 z4%F;R%djjBF+S{QXQ!8wOAV!&I@)2~Fzfb2)4LDNyTW!boBs(x`Clo> z>py+l_As!~|C?q(|IKF8to(l&XlI(Q9Qx;p<@tXcc~}xrAIXQef2*?pQqX38x2?m= zLldd+ASkZ?WHW$o1q-)!kB#Dm)~L_Mb-o+YJQo9{)M5 zMz#J|3U1MV@_o9cVPJ#*w;tDTA{=XD_*jJ~$w5%IsYk!YWdO1x3q4ZQp(0fnG zxJN?V_lXC2B#+6#BA-Wv8w){M*pn=xuETof#1=mwY|{cAdXp zM}GGAQo3L-cupfO%W_IDEYj3yRvCM4u>%wCe0$@a{WHR zJfz`pR{NW`ONoN`gIUiO)-6GB@V6bQZaG$cc*H zh*K{oz>eSY-x*?G?I7X~hmvq=3`AXKA>mG9PCK*M>OSxgio z1~UjB_lX+$wUbu`qnI1S2@A-W@vCC`n9$)!Rc+KwXVrkYHL5cv&vAdNB#d<=){E2)Xlr$!kwgkMuE5=KNdP3)sB=XB7Rv*)%HuUj}mehfs&(&?P+r_Bo66 zzJC|EhyI&pqtbsF$m!pUc@j5oc?a;$`ZrsqQ~mz06y)?Tsr1R#w}0=U|7OFf=KrN2 zr~h;!=Mmv@8g0)J+{6DHExXcx8OZ6My(e5P@|NrWA^dlW{@<#`|8me4#8sA#TpuE+ zj+S%;tqVa^dt`Nw)HifFq5`FBw{0D+s>G;!8oGY$5j2Ga5E~$Qd)0MxYAr zwq*1~JSbR{i4u|MnM~5Y_^AoV`X~We_YjbEQ$Xwb-~hi%yI$csCn92@$jm0~n#anR zpwIhLbR@2z*Z(0VLKT{u(M?Uv>RE^yTJ&m!B^r}H>T@<0z&dj{rzjTWw~Iz+PKKZt z5eX6TzCoZ5sZ3P)qAP*=aDOzWa)jKNbD)M`v{ZW^LdodKh_iqkpl4vO2k01Ve!wON zh;ej*2bj}(CaUa47la_?H`^GEAZte(Ng0Vpx=y@FofGVVFYuTv>WmNdh_i1&VnSy? ztp+)2$u=N#lVMCf0%5!EtBoP44>-8do*K#s*Ruwv=Q*{mPk}!$6>${7G!H~N%K+-@ zB4}yifC2{D;;1HBP3<-%Vjh zOTP*39{(Fnvs(Wv1#9#l(2xdk&=W3%P`${N_w5dw^Pfp$7&VV|IMaT*?$?B{r-RTDc_E!4%}&I`K)bc(@9%ILMXW6bnREkt4hxw ztnI8ilsd~?I8octKK$}s%;ayD4$`I<)M40b4b9HZcGSsPlshlm(cZk((BhQrtaTen uB$1{%JBWlI&;Avmp8Xj@Jv%h5s}MAmW)*=}s8Hb(!2bXUdpfHCS^xk*vzJW( literal 0 HcmV?d00001 From f8cfee4ef4ba1f17b0defa171a0ec5798d5a7d8d Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Thu, 13 Jun 2024 12:41:07 -0400 Subject: [PATCH 3/4] Try generating tgz files again --- .../course-v1:MITxT+8.01.3x+3T2022.tar.gz | Bin 2272 -> 2307 bytes ...1:MITxT+8.01.3x+3T2022_minor_change.tar.gz | Bin 3070 -> 2882 bytes ...e-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz | Bin 2272 -> 2307 bytes 3 files changed, 0 insertions(+), 0 deletions(-) diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz index 9475e6870145304e13d8f966e65f3956d827c3a4..4bdd5548eaa0964bcdae505bea7847cc9644e7e8 100644 GIT binary patch delta 54 zcmaDL*es+g-_60%sF0n;z?__4T2!2>TV`nG>lsoJqHUpPV5nzYp=}&uWME{ZSCUx7 Ju#xK;2LP`(4~YN( delta 19 acmZn`dLYOp-_60`!keAOz`RlS8V3L~1O+4j diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz index 65ea0fa01887c9a1dd326f34bdba383ffcc447e7..2143c33a2da211742133e5418e19c20c96deacfd 100644 GIT binary patch literal 2882 zcmV-I3%&FoiwFpN9&2U*17mM>a&u)Zb}>3lNmO`LD>yDNF)lNBD>GCwFfuYGaUSaHl)c zKD>wb@Bjj^+H`PCC@QTGnx@q(3nZbPhMG}LLk`od0=;URhOO6h(*T-oTDn;R+A4_} zA*K-|jFcXXq81|2taL0{yFI6zv?|~{{8;eGPkvCkRVp1p2RuFjUjhMsuk=H98SMHS zyZG;aU9QOO)6-+|mFM^~yAE~@(`Tf#A>;KSy_p5`mZ@>NP zf63SX`_KDVQ(`)vz`+56E+UoH*?~p+U)A&aZy8m)1O}@hCXXAg|E30dk=H{F#`?N( z-`1_Fs#WV&yM~8!xVR#6w&)P5B zza2h%VussG!n&?m;DizI=-G4vmSry1m&@6Y1F6#1 z`Y+_a(q?cu`5U#GsTmsU3)p2>O)38(AOx>A)!mckNrcG?JOIWIyIP$sf4!F1|Fr*? z{(k|ujQ_u1O8x)DAseOGx;v}L3IBgE$|dn(Lg@c^7_b66ut@(+L(l1djop?0e-W7O z|KDf*uwj_%OiIG@Znu5dp^dKRx+qw$Tl0nm`(D6W(qYiXjl+lAo%Gt--do`3vtD3^ zn?F(DBI|P_o%r{_sQ+JG@4ra@^{S>_82>YyAp8GP{&QOTUuJ5^ai;>O^6X>gbK|9hqB<9`VWiSIVnZttG#?v&041Q!#q>+6Z{Z%ebC zfLrN=;!h{?0ugSB@NF7OM?gYCLZWDh_kXa3<7ic@0I#tAt7=(t{Z|nX@(;1^IUb@b zX#(#qe@nLvDgPoMW=r`Ofz0|}b=|Q@tXncp@w3f8k%7!0FzEbIXXNYoGKtq)KKcGrmN;ascc?1-4e^e$p51hMXTxk zueknS`hSHWEB`cDb~mul{_A>9{qK7v)6xHO z2ZHZD{Zu)zKsY_Ok|KmU~WzYx4$=esEt(G0}g-vf}|PNNu8 zntVvmeC45j69us4BbTL#NQ~}{)43jXVoE(6G$R~(4$TykB>5Z#h(P~BL=x%oG6f+@ zCi>0!uQ%D}g+3f6vz9#P5pudPV9$L3j&TGdMhNu5?vTZEVkUB;AbAw4s#(^mW!*g0 z?FX91{=Uj4BiQ$QD2ST8jYJ}kQP%acUYnlB3n*iY6JgTiL#m4`7$@NTdH!N*)qEIq zV%SNlg@VcQeFRD11)Wjp&tFVLQslRrA#|om=h8KZU@M_4L-rJLrGjcEYSF}jKV+HO z(5Dj*Jx@Cl(Zf`@Q&L335RJv(ha}$w?yZ8JCj@s0qI4cb;V4Z3^z8EC_dH^f@p&PX zoGbHOa>l7vNRvmK>9+F=t@;AlPrWm59>IMcUjX|~(I9GJd^T5@(Rrd7)-E*;Z`N1o zqR;66I|RDi%q+zZSkV9LrkNZ68M-a^e-{JM|9^hGy}NguvHQ7>%OyDEXN3m57csKU zTIqSgd7Pe)q+QHB2p_f(@A;L3r#YoKSBL}LLw!un^6`D-b-KK0P1UVw9p=(V9Z~WK zww6kSHjjw;>^kz?2$9|_o{4C%H-Hz?Opt6ANb$bM9IsjQCXtJ%t&XWpz6mEMZKchJ zUz|SP-#^KEsZSuKNnWb}qxl^|X8m9C*`%I}T970@|5DxsES~?<^7dc1HEI8gKt}#- zs6$}jpeADaIU4D`e-pTZ{B6A|VQBRpt4wMZl=%&VRLQYTY!9 zx?Wqi@6S8sHK(kNXI+bY)!`?(?l`7V?~7Ait^2c1bFF9tG*U+l>ml%Ok_h!@%)l z7l9#*06q>nVwVt@7(ak6B1mNgpqVMym?QRG7869!3xIBEWsC8zkBLjwjY_z=o<1%i zg8hY%zz%_77Z?DyK{UH)6>PgM=wX7!nJ9o11PHmvRlyNFLx9FCKEeR8>R?s?E3gL` zA>-#nup=nhs0eN`qVkLj39IVM*rOhQMLS~PF~3C+UqnDBc*hCjZ}3SknQs9J*OR)? zGtc)Km4OS+0^IkJ+hI2&#%NYTLOCZk;*>|6BVyy;Lh{md<)js#QOUD>i|lehEkz?6^2tu;%O!Hp@X{EXYNJ0 z4V~DJhH4me6aam%6EqYbwIhX9b?Bpp(u1Vq1rIb}XdoN_ji-tl$|S+yke3TS)kmaN z7;Q;w;gvHt(?mz8$F!QIV)Zjc!f6db#@q(GUkzs7=~5_qj5$Ht!F?Ha%ml{`WDWJkz-vU6jPA zoH46ejh=R41p252S_Jngqgoi8L%@8G-;66Qj0^&gLlgkyehHX7BCev`7M)ShV$TPC zFX{p(CWMg&SZ~SKK2Xl1hh2hu=pNW(`g#xSGo5ea!99QpIDu_QJXNqtJm>@=!0qM+ z1YMT2p>(4tqz@_;(k9G|HhEXZe0WjebU%l(>3fWRMk$TM5R-_pbd-RJZxgVZCI+a7S+Y1RN4U%# zvq>{Gu0vKME-8tT1T$?d;l?x03K-Bn6Hh;pPK#OcbbS}2GwDR8J)8eHZ*CJ`ue($I zh>)HCU-ryjiYv_j+m?L)QwXy2zj137uK=zff8CbyF9dVm|9VS!^_$?`=l?ZZm+$`y z!5sPbyujnuBEIF%wqmljppXut~qR)S@j9UKv zziwMn|BJxr{GaK&d>RU0aIb*Ly*7WiwFRP25V*j1MOYSkK9NV?+IB5g9JGt!~qpvfe@Q<`=jli3DZL+nVHS7 z2??1Y+3af7TWwc&d&li^s_b<4Mp@;;iR%h+TW$zWX#W6i9FX`2koX6d0|yRpLZZs9 z{;;P%CNtfhWT$>gcTKxK{OY}WuU?hwIcnX(F`=lmL};2;wJeaNcA9ENB~3X@vjX&r zZ5p;_7*!o;x@qZp324hCYJiwVkT6m@Fp3(8MAOQ#V$J4^cG9kZVfwn@jcW#Te9Atzb46B0*T3c9jtaxjM`{N; zKD+qU(qGm;*WdZ^?W4~=`@_GzFxIfb<+fQ_HS-KxOIE~;jY?aJR)jV zUu!p8JA3!;?Dr1(;Xbt9YCdWHtpE7F8E!8KtGZ@^14h7|$CC+Il(|@4EN5R2q*`BI z>OJP^Ki8Ms{BKt560nv^mB0npe6ni&Cu9fz&>`xl=3eELhx!`-8rZq zM3^kW17Q5HE0yW;*Q{}+Ih`2TyQ)c;Q$vO$R(SEn^O=KuEwwIp6m2>l-| z1D0S1=IOs_MQd%(by2XoZp~U1?0Eqj zNc%w(*Y&X8(EmuU9nf*!o{HH97wm zgq54ItCvb#qpp@d|?52O!cLB7KpjDm{ zUKJ@2=?#&-NK;|7R>b0CMcjGiib%yC!3q%RjhE`Ogv5oxSJLDvpZ|ADlh^+e5)xl- ztX$qX*tuC6E(nenV4n}i{(e!K?gU&)Hx$1!Ru_o$hDcwesdNM+BqSt?#_;?PY~Xn6 z!>_Yp-ubWe{D1cRmm#136@ypu{J-((=fC;nS3m#P@4xrwUq^rY(QnV*O^L}kcK&Ax zc3__VS9Cq6|F&sq^88N`Fsg&|Kenw^*L7=MQ%!qg!!S;G_}X+c1z(0`*U<9`Z4CjLk5b^ImVWA^z^e)X9Co3e??$3l{|=vUk@Pj z0~Pbo&E$$wF{RFz1ofvL>ef*J z8$NPbnTW*b+NhlCQ75L)-*7V8jT4KG^BAd``?*O%x=zVqG=Yw91-p9_sdWO=JJw%N8Tp z_d6(v>U@kuqK>hq>uY*-avd+Aj4e)tNu4jLjfahe0cbt)yEh7;oQ4 zkOW@P8npiY2VDWz&Wd|a< zl?r!4iKriM~vQDf54eK$n}D zh4=w;=6~HZbL&4tw@vx{uMmj&|Lq6cJG&1uc0V_9xdey&UEu+rMU3pQRytlV%+vFk zw2heu;ll>vGrw~CQBEm_3UM2EP#2Sstu}93Q*~?7gt;(M2bA1_jfE1S%_3qp zyY6~!gh*!^&qOxZ?ZIPZ#z-~`qf^OZgXpjQk@n^3lQ{|DHnrcEytZ ze<8@oKXv3>L>=OVi!%hLu>V@sl=3eE8TqSq{1g$s&AU(|Fi-x5J$nChrJC6PiYe#+ zVsMi9&reEe{O6xe*5p|HC$I6ll6Vo+7*qY6s@(ziDUnpRND6{g?5d%SZnIf2tZ4)v`AX!>($^s=YBQ z0yL+twL9An$$Kpxwz>0UgC5-zA)MC6vSLziAp)e*QNM zIsX@eO^O`u=>vx;BdsaPKv1?;0oZICTig5>IBa_qq|a_Q%j_!847c_J$B$hE`YZ$Z zIB1C|AuzUn0BuCj_gMpIY6-UHh<%sk1QGNCpj+CS#rW67#HH$HIow)JZ~{1QR@5CI+I9Vd)G!6%($zXc>*Px?ZSJl|(j z1}-=XaMwp}i(QNuqge|H<($}vQyy`Sh`_zM5m)bYeg1 zt6|Vm0Q9|9P*Z%=j1<<@zK?232a=W-T-Si1fv^WOUMi|7;{=0!-Y$4kACOjNv?Zg3 zch1~O3*AK>rq!$z>z^qKhFdp1oXED%htTLqMmb>*=#1)w2}{a?C+S#x-56w?h(X3F z7-Sr0(5x}ogiq3;m%C074RJt)+N48sk1G=!^L`&}(<3(Oe-R_fD-G4?xFiPcj9ATj z^r#IZ&_xZ3kFSt^rKI0c=9zse)DFMh6H1ZZ|g}XtSa< zr5!~fy3gEL#;Vuz}jw8|mDA2E*6GG?T$u@Y;^F(5WfTCwM%G78E*KNu2~ zn}nIsI-kmz4=*~L#&alJzQ^cil+rj1F^L#U2L+h;wgBsCVt_iB6^p~Q2(NLsY}`zZ z`jFL#6G~zr!OU1oxbc{?0tU3p#M4cr(_of7J>SLX0yu^Ibz9265ajlMvrmA%1w8yMopk>L2`~St@ z72p58Dz=2QzyH@fQ$HmpV#@ySLhZo({omaCKMiIPwP3Pm6-u=z1x^&|Uz>M=>3!42;^ZZ}4b$R}$5X_K&#|ykp+^JIs6I2hK=(NLO zzWnp&|IKPep8qTaRv}E5cvX>k|DTBdDf<0SmQl^0|I=+t>VFX!y#H@Y#8gqPC_y`{9!qM?V=0q)jjIVA$sh7{t!j zc;qa|#LL!{_dig;C`xwFySYRpJIy?H5KuoJycJOuFGEyC&~%w^L8si5JJ1pm60ZUN M4|;Gwhya!V0HfzE_y7O^ diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz index 794f3dfec71caa4f5b66d90d3e092609f1957e26..0cb0005843fc228cb933632c4ac28b1c9c53d5b7 100644 GIT binary patch delta 54 zcmaDL*es+g-_60HE|;Cgz?__4T2!2>TV`nG>lsoJqHUpPV5nzYp=}&uWME{ZSCUx7 Ju#xK;2LPW94=n%y delta 19 acmZn`dLYOp-_60Wnk_qxfqA3uH4Xqa%LRl0 From eec5d55406b324ae75bbbaac2d76d4531f323672 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Thu, 13 Jun 2024 13:46:41 -0400 Subject: [PATCH 4/4] fix tests --- learning_resources/etl/edx_shared.py | 7 ++++++- learning_resources/etl/edx_shared_test.py | 16 +--------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/learning_resources/etl/edx_shared.py b/learning_resources/etl/edx_shared.py index 3d3845eb81..dfa904a054 100644 --- a/learning_resources/etl/edx_shared.py +++ b/learning_resources/etl/edx_shared.py @@ -3,6 +3,7 @@ import logging import re from pathlib import Path +from tarfile import ReadError from tempfile import TemporaryDirectory from learning_resources.etl.constants import ETLSource @@ -133,7 +134,11 @@ def sync_edx_course_files( course_tarpath = Path(export_tempdir, key.split("/")[-1]) log.info("course tarpath for run %s is %s", run.run_id, course_tarpath) bucket.download_file(key, course_tarpath) - checksum = calc_checksum(course_tarpath) + try: + checksum = calc_checksum(course_tarpath) + except ReadError: + log.exception("Error reading tar file %s, skipping", course_tarpath) + continue if run.checksum == checksum: log.info("Checksums match for %s, skipping", key) continue diff --git a/learning_resources/etl/edx_shared_test.py b/learning_resources/etl/edx_shared_test.py index 20cc53ceff..2f70de60d1 100644 --- a/learning_resources/etl/edx_shared_test.py +++ b/learning_resources/etl/edx_shared_test.py @@ -1,7 +1,6 @@ """ETL utils test""" from pathlib import Path -from subprocess import CalledProcessError import pytest @@ -121,24 +120,11 @@ def test_sync_edx_course_files_invalid_tarfile( "learning_resources.etl.edx_shared.get_learning_course_bucket", return_value=bucket, ) - mock_load_content_files = mocker.patch( - "learning_resources.etl.edx_shared.load_content_files", - autospec=True, - return_value=[], - ) - mocker.patch( - "learning_resources.etl.edx_shared.transform_content_files", - side_effect=CalledProcessError(0, ""), - ) mock_log = mocker.patch("learning_resources.etl.edx_shared.log.exception") sync_edx_course_files(platform, [run.learning_resource.id], [key]) - mock_load_content_files.assert_not_called() mock_log.assert_called_once() - assert ( - mock_log.call_args[0][0].startswith("Error ingesting OLX content data for") - is True - ) + assert mock_log.call_args[0][0].startswith("Error reading tar file") is True @pytest.mark.parametrize(