Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion learning_resources/etl/edx_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import re
from pathlib import Path
from tarfile import ReadError
from tempfile import TemporaryDirectory

from learning_resources.etl.constants import ETLSource
Expand Down Expand Up @@ -133,8 +134,13 @@ def sync_edx_course_files(
course_tarpath = Path(export_tempdir, key.split("/")[-1])
log.info("course tarpath for run %s is %s", run.run_id, course_tarpath)
bucket.download_file(key, course_tarpath)
checksum = calc_checksum(course_tarpath)
try:
checksum = calc_checksum(course_tarpath)
except ReadError:
log.exception("Error reading tar file %s, skipping", course_tarpath)
continue
if run.checksum == checksum:
log.info("Checksums match for %s, skipping", key)
continue
try:
load_content_files(run, transform_content_files(course_tarpath, run))
Expand Down
16 changes: 1 addition & 15 deletions learning_resources/etl/edx_shared_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""ETL utils test"""

from pathlib import Path
from subprocess import CalledProcessError

import pytest

Expand Down Expand Up @@ -121,24 +120,11 @@ def test_sync_edx_course_files_invalid_tarfile(
"learning_resources.etl.edx_shared.get_learning_course_bucket",
return_value=bucket,
)
mock_load_content_files = mocker.patch(
"learning_resources.etl.edx_shared.load_content_files",
autospec=True,
return_value=[],
)
mocker.patch(
"learning_resources.etl.edx_shared.transform_content_files",
side_effect=CalledProcessError(0, ""),
)
mock_log = mocker.patch("learning_resources.etl.edx_shared.log.exception")

sync_edx_course_files(platform, [run.learning_resource.id], [key])
mock_load_content_files.assert_not_called()
mock_log.assert_called_once()
assert (
mock_log.call_args[0][0].startswith("Error ingesting OLX content data for")
is True
)
assert mock_log.call_args[0][0].startswith("Error reading tar file") is True


@pytest.mark.parametrize(
Expand Down
8 changes: 3 additions & 5 deletions learning_resources/etl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import mimetypes
import os
import re
import tarfile
import uuid
from collections import Counter
from collections.abc import Generator
Expand Down Expand Up @@ -509,11 +510,8 @@ def calc_checksum(filename) -> str:
Returns:
str: The md5 checksum of the file
"""
hash_md5 = md5() # noqa: S324
with Path.open(Path(filename), "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
with tarfile.open(filename, "r") as tgz_file:
return str(hash(tuple(ti.chksum for ti in tgz_file.getmembers())))


def get_content_type(file_type: str) -> str:
Expand Down
19 changes: 19 additions & 0 deletions learning_resources/etl/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,3 +462,22 @@ def test_parse_certification(offered_by, availability, has_cert):
def test_clean_data(input_text, output_text):
"""clean_data function should return expected output"""
assert utils.clean_data(input_text) == output_text


@pytest.mark.parametrize(
("previous_archive", "identical"),
[
("test_json/course-v1:MITxT+8.01.3x+3T2022_no_change.tar.gz", True),
("test_json/course-v1:MITxT+8.01.3x+3T2022_minor_change.tar.gz", False),
],
)
def test_calc_checksum(previous_archive, identical):
"""
calc_checksum should be able to accurately identify identical vs different archives.
All 3 tar.gz files created on OSX from the same directory. One archive has a minor
text change in one file, "400" to "500".
"""
reference_checksum = utils.calc_checksum(
"test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz"
)
assert (utils.calc_checksum(previous_archive) == reference_checksum) is identical
Binary file modified test_json/course-v1:MITxT+8.01.3x+3T2022.tar.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.