This repository has been archived by the owner on Jan 25, 2023. It is now read-only.

Commit

add functionality to copy unsuccessfully scraped file to specified folder
andrea-sottana-MoJ committed Sep 17, 2020
1 parent f0cedda commit 6ead86a
Showing 1 changed file with 36 additions and 6 deletions.
42 changes: 36 additions & 6 deletions pdf2embeddings/scraper.py
@@ -1,14 +1,14 @@
import s3fs
import shutil
import slate3k
import logging
from tqdm import tqdm
from boto3 import Session
from boto3 import Session, resource
import os
import pandas as pd
import json
from typing import Tuple, Dict, Optional


logger = logging.getLogger(__name__)


@@ -21,18 +21,29 @@ class DocumentScraper:
If different pdf files have different numbers of pages, any empty rows at the bottom of a column are filled with NaN.
It also offers support for folders stored in the cloud (AWS S3 buckets only).
"""
def __init__(self, pdf_folder: str, json_filename: Optional[str] = None, from_s3_bucket: bool = False) -> None:

def __init__(
self, pdf_folder: str,
json_filename: Optional[str] = None,
from_s3_bucket: bool = False,
folder_for_unscraped_files: Optional[str] = None
) -> None:
"""
:param pdf_folder: path to the folder containing pdf files to be scraped. Can also be an S3 bucket (see below).
:param json_filename: full path of the json file created by the module json_creator.py. This json file
contains a dictionary of words to replace (e.g. Dr. --> Dr), used for text cleaning. Defaults to None, in
which case no ad-hoc text cleaning will be performed.
:param from_s3_bucket: a boolean specifying whether to scrape the PDFs from a folder located in an AWS S3
bucket. If set to True, the path can either start with "s3://" or omit this prefix. Default: False.
:param folder_for_unscraped_files: optional path to a folder into which any file that could not be scraped
    is copied, so that scraping failures can be tracked. slate3k occasionally fails to parse a file; this is
    rare, but it is useful to keep a record of what was not successfully scraped. This should be the full
    path to the folder, including the bucket name if using S3. Defaults to None, in which case failed
    files are not copied anywhere.
"""
self.pdf_folder = pdf_folder
self.open_json = self._read_config(json_filename)
self.from_s3_bucket = from_s3_bucket
self.folder_for_unscraped_files = folder_for_unscraped_files

if self.from_s3_bucket:
assert Session().get_credentials() is not None, "You do not have any valid credentials to access AWS S3."
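
For context, a minimal construction sketch of the new option (the paths, bucket name and JSON file below are hypothetical, not taken from the repository):

from pdf2embeddings.scraper import DocumentScraper

# Local folder: any PDF that slate3k cannot parse is copied to "data/failed_pdfs".
scraper = DocumentScraper(
    pdf_folder="data/pdfs",
    json_filename="data/words_to_replace.json",  # optional cleaning dictionary
    folder_for_unscraped_files="data/failed_pdfs"
)

# S3: folder_for_unscraped_files must include the bucket name.
s3_scraper = DocumentScraper(
    pdf_folder="my-bucket/pdfs",
    from_s3_bucket=True,
    folder_for_unscraped_files="my-bucket/failed_pdfs"
)
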
@@ -71,16 +82,34 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
else:
pdf = s3fs.S3FileSystem().open(pdf_name, 'rb') # no need to join with self.pdf_folder as s3fs includes that
try:
try:
pdf_reader = slate3k.PDF(pdf)
except Exception as err:
logger.error(f"The following file could not be parsed: {pdf}.\nThis error was generated: {err}.")
pdf.close()
if self.folder_for_unscraped_files is not None and not self.from_s3_bucket:
shutil.copy2(
os.path.join(self.pdf_folder, pdf_name), os.path.join(self.folder_for_unscraped_files, pdf_name)
)
logger.warning(
f"File {pdf_name} copied to '{os.path.join(self.folder_for_unscraped_files, pdf_name)}'."
)
elif self.folder_for_unscraped_files is not None and self.from_s3_bucket:
s3_resource = resource('s3')
s3_resource.Object(
self.folder_for_unscraped_files.split("/", 1)[0], # bucket name
os.path.join(self.folder_for_unscraped_files.split("/", 1)[1], pdf_name.split("/")[-1])
                # destination key (folder prefix + file name, without the bucket name); s3fs file names include the bucket
).copy_from(CopySource=pdf_name)
logger.warning(
f"File {pdf_name.split('/')[-1]} copied to "
f"'{os.path.join(self.folder_for_unscraped_files, pdf_name.split('/')[-1])}'."
)
return None
else:
num_pages = len(pdf_reader)
for i, page in enumerate(pdf_reader):
logger.debug(f'Reading page {i+1} of PDF file {pdf_name}')
logger.debug(f'Reading page {i + 1} of PDF file {pdf_name}')
page_text = self._clean_text(page)
page_series = pd.Series(page_text)
document_series = document_series.append(page_series, ignore_index=True)
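
The S3 branch above splits folder_for_unscraped_files into a bucket name and a key prefix, then copies the object server-side with boto3. A minimal sketch of that destination-key construction, with hypothetical names:

import os
from boto3 import resource

folder_for_unscraped_files = "my-bucket/failed_pdfs"  # bucket name + key prefix
pdf_name = "my-bucket/reports/annual_report.pdf"      # full key as listed by s3fs (includes the bucket)

bucket = folder_for_unscraped_files.split("/", 1)[0]             # "my-bucket"
prefix = folder_for_unscraped_files.split("/", 1)[1]             # "failed_pdfs"
destination_key = os.path.join(prefix, pdf_name.split("/")[-1])  # "failed_pdfs/annual_report.pdf"

# CopySource also accepts a "bucket/key" string, which is exactly what pdf_name already is.
resource("s3").Object(bucket, destination_key).copy_from(CopySource=pdf_name)
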
@@ -117,7 +146,8 @@ def document_corpus_to_pandas_df(self) -> pd.DataFrame:
pdf_extract = self._text_to_series_of_pages(file)
if pdf_extract is not None: # excluding case when PDF could not be parsed due to encoding errors (it happens occasionally)
series, num_pages = pdf_extract
logger.info(f"Reading PDF file {i + 1} out of {len(pdf_list)}: \"{file}\", number of pages: {num_pages}")
logger.info(
f"Reading PDF file {i + 1} out of {len(pdf_list)}: \"{file}\", number of pages: {num_pages}")
if isinstance(series, pd.Series):
series.rename(file.replace('.pdf', ''), inplace=True)
df = pd.concat([df, series], axis=1)
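
As a rough sketch of the output (file names hypothetical): the resulting DataFrame has one column per successfully parsed PDF, named after the file without its ".pdf" extension, and one row per page, padded with NaN where a document has fewer pages than the longest one.

df = scraper.document_corpus_to_pandas_df()

print(df.shape)    # e.g. (max_num_pages, number_of_parsed_pdfs)
print(df.columns)  # e.g. Index(['annual_report', 'meeting_minutes'], dtype='object')
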
