Skip to content

Commit

Permalink
Delete WARCs extracted from WACZ for cleanup post-indexing
Browse files Browse the repository at this point in the history
Re:#770
Re:#710
  • Loading branch information
machawk1 committed Jun 2, 2022
1 parent f9447d2 commit 2566b7c
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 0 deletions.
5 changes: 5 additions & 0 deletions ipwb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

from ipwb.util import iso8601_to_digits14, ipfs_client
from ipwb.util import is_wacz, extract_warcs_from_wacz
from ipwb.util import cleanup_warc_files_extracted_from_wacz

import requests
import datetime
Expand Down Expand Up @@ -126,9 +127,11 @@ def index_file_at(warc_paths, encryption_key=None,

warc_paths_to_append = []
warc_paths_to_remove = []
warcs_to_cleanup_post_indexing = []
for warc_path in warc_paths:
if is_wacz(warc_path):
warc_paths_to_append += extract_warcs_from_wacz(warc_path)
warcs_to_cleanup_post_indexing = warc_paths_to_append
warc_paths_to_remove.append(warc_path)

# Manipulate list of WARCs extracted from WACZ
Expand Down Expand Up @@ -184,6 +187,8 @@ def index_file_at(warc_paths, encryption_key=None,
cdxj_metadata_lines = generate_cdxj_metadata(cdxj_lines)
cdxj_lines = cdxj_metadata_lines + cdxj_lines

cleanup_warc_files_extracted_from_wacz(warcs_to_cleanup_post_indexing)

if quiet:
return cdxj_lines

Expand Down
10 changes: 10 additions & 0 deletions ipwb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,3 +365,13 @@ def extract_warcs_from_wacz(wacz_path):
extract_warcs_to_disk(wacz_path, warc_paths)

return glob.glob('archive/*')


def cleanup_warc_files_extracted_from_wacz(warc_paths):
    """Delete the temporary WARC files that were extracted from a WACZ.

    Every path that refers to an existing regular file is removed, and a
    message naming it is printed first. Paths that are not regular files
    (missing paths, directories) are skipped without any output.

    Deletion is best-effort: an ``OSError`` raised for one path is
    reported on stdout and the remaining paths are still processed.

    Args:
        warc_paths: Iterable of file system paths to delete. ``None`` is
            treated the same as an empty iterable.

    Returns:
        None.
    """
    # `or ()` keeps this post-indexing cleanup step from raising when the
    # caller has nothing to clean up and passes None.
    for temporary_warc in warc_paths or ():
        try:
            # The isfile guard keeps directories and already-removed paths
            # silent instead of reporting them as errors.
            if os.path.isfile(temporary_warc):
                print(f'Deleting temporary WARC at {temporary_warc}')
                os.remove(temporary_warc)
        except OSError as e:
            # Report and continue so one undeletable file does not stop
            # the cleanup of the others.
            print(f'Error: {e.filename}, {e.strerror}')

0 comments on commit 2566b7c

Please sign in to comment.