add option to remove scraped and unscraped files
andrea-sottana-MoJ committed Sep 28, 2020
1 parent c7a725e commit 1c90277
Showing 1 changed file with 28 additions and 3 deletions.
31 changes: 28 additions & 3 deletions pdf2embeddings/scraper.py
@@ -27,7 +27,8 @@ def __init__(
json_filename: Optional[str] = None,
from_s3_bucket: bool = False,
folder_to_copy_scraped_files: Optional[str] = None,
folder_to_copy_unscraped_files: Optional[str] = None
folder_to_copy_unscraped_files: Optional[str] = None,
delete_original_pdfs_after_copy: bool = False
) -> None:
"""
:param pdf_folder: path to the folder containing pdf files to be scraped. Can also be an S3 bucket (see below).
@@ -44,12 +45,16 @@ def __init__(
copied over in a specified folder so they can be kept track of. This is because occasionally slate3k
doesn't manage to successfully scrape a file; this is very rare but we want to track what has not been
successfully scraped. This should contain full path to folder, including bucket name if using S3.
:param delete_original_pdfs_after_copy: if set to True, and if folder_to_copy_scraped_files and/or
folder_to_copy_unscraped_files has been specified, the original PDF file is deleted after it has been
copied to the corresponding folder. Default: False.
"""
self.pdf_folder = pdf_folder
self.open_json = self._read_config(json_filename)
self.from_s3_bucket = from_s3_bucket
self.folder_to_copy_scraped_files = folder_to_copy_scraped_files
self.folder_to_copy_unscraped_files = folder_to_copy_unscraped_files
self.delete_original_pdfs_after_copy = delete_original_pdfs_after_copy

if self.from_s3_bucket:
assert Session().get_credentials() is not None, "You do not have any valid credentials to access AWS S3."
@@ -76,11 +81,11 @@ def _read_config(json_filename: Optional[str]) -> Dict[str, str]:
with open(json_filename, 'r') as file:
return json.load(file)

def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
def _text_to_series_of_pages(self, pdf_name: str) -> Optional[Tuple[pd.Series, int]]:
"""
:param pdf_name: full name of pdf (including .pdf extension) to be scraped and converted into a pd.Series
:return: document_series: a pd.Series where each row contains the text of one pdf page.
num_pages: int, the number of pages of the input pdf file
num_pages: int, the number of pages of the input pdf file. If the PDF could not be scraped, None is returned instead.
"""
assert pdf_name.endswith('.pdf'), 'Input file is not in .pdf format. The file cannot be processed.'
document_series = pd.Series()
@@ -100,6 +105,9 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
logger.warning(
f"File {pdf_name} copied to '{os.path.join(self.folder_to_copy_unscraped_files, pdf_name)}'."
)
if self.delete_original_pdfs_after_copy:
os.remove(os.path.join(self.pdf_folder, pdf_name))
logger.warning(f"File {pdf_name} deleted from its original folder.")
elif self.folder_to_copy_unscraped_files is not None and self.from_s3_bucket:
s3_resource = resource('s3')
s3_resource.Object(
@@ -111,6 +119,13 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
f"File {pdf_name.split('/')[-1]} copied to "
f"'{os.path.join(self.folder_to_copy_unscraped_files, pdf_name.split('/')[-1])}'."
)
if self.delete_original_pdfs_after_copy:
s3_resource.Object(
pdf_name.split("/", 1)[0], # bucket name
pdf_name.split("/", 1)[1]
# path to pdf file excluding bucket name; this accounts for the peculiarity of how s3fs handles names
).delete()
logger.warning(f"File {pdf_name.split('/')[-1]} deleted from its original folder.")
return None
else:
num_pages = len(pdf_reader)
@@ -127,6 +142,9 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
logger.info(
f"File {pdf_name} copied to '{os.path.join(self.folder_to_copy_scraped_files, pdf_name)}'."
)
if self.delete_original_pdfs_after_copy:
os.remove(os.path.join(self.pdf_folder, pdf_name))
logger.warning(f"File {pdf_name} deleted from its original folder.")
elif self.folder_to_copy_scraped_files is not None and self.from_s3_bucket:
s3_resource = resource('s3')
s3_resource.Object(
@@ -138,6 +156,13 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
f"File {pdf_name.split('/')[-1]} copied to "
f"'{os.path.join(self.folder_to_copy_scraped_files, pdf_name.split('/')[-1])}'."
)
if self.delete_original_pdfs_after_copy:
s3_resource.Object(
pdf_name.split("/", 1)[0], # bucket name
pdf_name.split("/", 1)[1]
# path to pdf file excluding bucket name; this accounts for the peculiarity of how s3fs handles names
).delete()
logger.warning(f"File {pdf_name.split('/')[-1]} deleted from its original folder.")
return document_series, num_pages

def _clean_text(self, text: str) -> str:
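For context, a minimal usage sketch of the new option follows. The class name DocumentScraper, the method call and the folder paths are assumptions for illustration; only the __init__ changes in scraper.py are shown in this diff.

from pdf2embeddings.scraper import DocumentScraper  # class name assumed, not shown in this hunk

# Copy every PDF to a "scraped" or "unscraped" folder as it is processed, then
# delete the original, so the source folder empties out as files are handled.
scraper = DocumentScraper(
    pdf_folder="data/raw_pdfs",                            # hypothetical local folder
    json_filename="data/words_to_replace.json",            # hypothetical cleaning config
    folder_to_copy_scraped_files="data/scraped_pdfs",
    folder_to_copy_unscraped_files="data/unscraped_pdfs",
    delete_original_pdfs_after_copy=True,                  # the option added by this commit
)
df = scraper.document_corpus_to_pandas_df()  # scraping entry point; method name assumed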

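The S3 branches above split pdf_name on its first slash so that the bucket name and the object key can be passed to boto3 separately. A minimal sketch of that split, assuming an s3fs-style path where the bucket name is the first path component (as the inline comments note):

import boto3

pdf_name = "my-bucket/reports/2020/annual_report.pdf"  # hypothetical s3fs-style path: bucket first, then key

bucket, key = pdf_name.split("/", 1)  # -> ("my-bucket", "reports/2020/annual_report.pdf")
filename = pdf_name.split("/")[-1]    # -> "annual_report.pdf", as used in the log messages

# Equivalent to the delete call added in this commit:
boto3.resource("s3").Object(bucket, key).delete()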