diff --git a/learning_resources/etl/edx_shared.py b/learning_resources/etl/edx_shared.py index 847c908e15..dfa904a054 100644 --- a/learning_resources/etl/edx_shared.py +++ b/learning_resources/etl/edx_shared.py @@ -3,6 +3,7 @@ import logging import re from pathlib import Path +from tarfile import ReadError from tempfile import TemporaryDirectory from learning_resources.etl.constants import ETLSource @@ -133,8 +134,13 @@ def sync_edx_course_files( course_tarpath = Path(export_tempdir, key.split("/")[-1]) log.info("course tarpath for run %s is %s", run.run_id, course_tarpath) bucket.download_file(key, course_tarpath) - checksum = calc_checksum(course_tarpath) + try: + checksum = calc_checksum(course_tarpath) + except ReadError: + log.exception("Error reading tar file %s, skipping", course_tarpath) + continue if run.checksum == checksum: + log.info("Checksums match for %s, skipping", key) continue try: load_content_files(run, transform_content_files(course_tarpath, run)) diff --git a/learning_resources/etl/edx_shared_test.py b/learning_resources/etl/edx_shared_test.py index 20cc53ceff..2f70de60d1 100644 --- a/learning_resources/etl/edx_shared_test.py +++ b/learning_resources/etl/edx_shared_test.py @@ -1,7 +1,6 @@ """ETL utils test""" from pathlib import Path -from subprocess import CalledProcessError import pytest @@ -121,24 +120,11 @@ def test_sync_edx_course_files_invalid_tarfile( "learning_resources.etl.edx_shared.get_learning_course_bucket", return_value=bucket, ) - mock_load_content_files = mocker.patch( - "learning_resources.etl.edx_shared.load_content_files", - autospec=True, - return_value=[], - ) - mocker.patch( - "learning_resources.etl.edx_shared.transform_content_files", - side_effect=CalledProcessError(0, ""), - ) mock_log = mocker.patch("learning_resources.etl.edx_shared.log.exception") sync_edx_course_files(platform, [run.learning_resource.id], [key]) - mock_load_content_files.assert_not_called() mock_log.assert_called_once() - assert ( - mock_log.call_args[0][0].startswith("Error ingesting OLX content data for") - is True - ) + assert mock_log.call_args[0][0].startswith("Error reading tar file") is True @pytest.mark.parametrize( diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index 8ba96a966e..c7257827af 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -6,6 +6,7 @@ import mimetypes import os import re +import tarfile import uuid from collections import Counter from collections.abc import Generator @@ -509,11 +510,8 @@ def calc_checksum(filename) -> str: Returns: str: The md5 checksum of the file """ - hash_md5 = md5() # noqa: S324 - with Path.open(Path(filename), "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() + with tarfile.open(filename, "r") as tgz_file: + return str(hash(tuple(ti.chksum for ti in tgz_file.getmembers()))) def get_content_type(file_type: str) -> str: diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 67620f3154..a99c7b7229 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -462,3 +462,22 @@ def test_parse_certification(offered_by, availability, has_cert): def test_clean_data(input_text, output_text): """clean_data function should return expected output""" assert utils.clean_data(input_text) == output_text + + +@pytest.mark.parametrize( + ("previous_archive", "identical"), + [ + ("test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz", True), + ("test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz", False), + ], +) +def test_calc_checksum(previous_archive, identical): + """ + calc_checksum should be able to accurately identify identical vs different archives. + All 3 tar.gz files created on OSX from the same directory. One archive has a minor + text change in one file, "400" to "500". + """ + reference_checksum = utils.calc_checksum( + "test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz" + ) + assert (utils.calc_checksum(previous_archive) == reference_checksum) is identical diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz index 5ed563400c..4bdd5548ea 100644 Binary files a/test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz and b/test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz differ diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz new file mode 100644 index 0000000000..2143c33a2d Binary files /dev/null and b/test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz differ diff --git a/test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz b/test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz new file mode 100644 index 0000000000..0cb0005843 Binary files /dev/null and b/test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz differ