
Merge 6ead86a into f0cedda
andrea-sottana-MoJ authored Sep 17, 2020
2 parents f0cedda + 6ead86a commit e327d7b
Showing 1 changed file with 36 additions and 6 deletions.
42 changes: 36 additions & 6 deletions pdf2embeddings/scraper.py
@@ -1,14 +1,14 @@
import s3fs
import shutil
import slate3k
import logging
from tqdm import tqdm
from boto3 import Session
from boto3 import Session, resource
import os
import pandas as pd
import json
from typing import Tuple, Dict, Optional


logger = logging.getLogger(__name__)


@@ -21,18 +21,29 @@ class DocumentScraper:
If different PDF files have different numbers of pages, any empty rows at the bottom of a column are filled with NaN.
It also offers support for folders stored in the cloud (AWS S3 buckets only).
"""
def __init__(self, pdf_folder: str, json_filename: Optional[str] = None, from_s3_bucket: bool = False) -> None:

def __init__(
self, pdf_folder: str,
json_filename: Optional[str] = None,
from_s3_bucket: bool = False,
folder_for_unscraped_files: Optional[str] = None
) -> None:
"""
:param pdf_folder: path to the folder containing pdf files to be scraped. Can also be an S3 bucket (see below).
:param json_filename: full path of the json file created by the module json_creator.py. This json file
contains a dictionary of words to replace (e.g. Dr. --> Dr), used for text cleaning. Defaults to None, in
which case no ad-hoc text cleaning will be performed.
:param from_s3_bucket: a boolean specifying whether to scrape the PDFs from a folder located in an AWS S3
bucket. If set to True, the path can either start with "s3://" or omit this prefix. Default: False.
:param folder_for_unscraped_files: optional folder into which any file that could not be scraped is copied,
so that such files can be kept track of. Occasionally slate3k fails to scrape a file; this is very rare,
but we want to track what has not been successfully scraped. This should be the full path to the folder,
including the bucket name if using S3. Defaults to None, in which case failed files are not copied.
"""
self.pdf_folder = pdf_folder
self.open_json = self._read_config(json_filename)
self.from_s3_bucket = from_s3_bucket
self.folder_for_unscraped_files = folder_for_unscraped_files

if self.from_s3_bucket:
assert Session().get_credentials() is not None, "You do not have any valid credentials to access AWS S3."
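As a usage sketch of the updated constructor (the folder and bucket names below are hypothetical, not taken from the repository), the new folder_for_unscraped_files argument is passed alongside the existing ones:

from pdf2embeddings.scraper import DocumentScraper

# Local folders (hypothetical paths). Files that slate3k cannot scrape are copied
# to folder_for_unscraped_files so they can be inspected later.
scraper = DocumentScraper(
    pdf_folder='data/pdfs',
    json_filename='data/words_to_replace.json',
    folder_for_unscraped_files='data/unscraped_pdfs',
)

# S3 variant (hypothetical bucket). The destination must include the bucket name.
s3_scraper = DocumentScraper(
    pdf_folder='s3://my-bucket/pdfs',
    from_s3_bucket=True,
    folder_for_unscraped_files='my-bucket/unscraped_pdfs',
)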
@@ -71,16 +82,34 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
else:
pdf = s3fs.S3FileSystem().open(pdf_name, 'rb') # no need to join with self.pdf_folder as s3fs includes that
try:
try:
pdf_reader = slate3k.PDF(pdf)
except Exception as err:
logger.error(f"The following file could not be parsed: {pdf}.\nThis error was generated: {err}.")
pdf.close()
if self.folder_for_unscraped_files is not None and not self.from_s3_bucket:
shutil.copy2(
os.path.join(self.pdf_folder, pdf_name), os.path.join(self.folder_for_unscraped_files, pdf_name)
)
logger.warning(
f"File {pdf_name} copied to '{os.path.join(self.folder_for_unscraped_files, pdf_name)}'."
)
elif self.folder_for_unscraped_files is not None and self.from_s3_bucket:
s3_resource = resource('s3')
s3_resource.Object(
self.folder_for_unscraped_files.split("/", 1)[0], # bucket name
os.path.join(self.folder_for_unscraped_files.split("/", 1)[1], pdf_name.split("/")[-1])
# path to pdf file excluding bucket name; this allows for peculiarity of how s3fs handles names
).copy_from(CopySource=pdf_name)
logger.warning(
f"File {pdf_name.split('/')[-1]} copied to "
f"'{os.path.join(self.folder_for_unscraped_files, pdf_name.split('/')[-1])}'."
)
return None
else:
num_pages = len(pdf_reader)
for i, page in enumerate(pdf_reader):
logger.debug(f'Reading page {i+1} of PDF file {pdf_name}')
logger.debug(f'Reading page {i + 1} of PDF file {pdf_name}')
page_text = self._clean_text(page)
page_series = pd.Series(page_text)
document_series = document_series.append(page_series, ignore_index=True)
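For clarity, the S3 branch above derives the destination bucket and key from folder_for_unscraped_files; a minimal sketch, with hypothetical names, of how that split resolves:

import os

folder_for_unscraped_files = 'my-bucket/unscraped_pdfs'   # full path including bucket name
pdf_name = 'my-bucket/pdfs/report.pdf'                    # s3fs-style path of the failing file

bucket = folder_for_unscraped_files.split('/', 1)[0]      # 'my-bucket'
key = os.path.join(
    folder_for_unscraped_files.split('/', 1)[1],          # 'unscraped_pdfs'
    pdf_name.split('/')[-1],                              # 'report.pdf'
)                                                         # -> 'unscraped_pdfs/report.pdf'
# boto3 then copies the original object:
# resource('s3').Object(bucket, key).copy_from(CopySource=pdf_name)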
@@ -117,7 +146,8 @@ def document_corpus_to_pandas_df(self) -> pd.DataFrame:
pdf_extract = self._text_to_series_of_pages(file)
if pdf_extract is not None: # excluding case when PDF could not be parsed due to encoding errors (it happens occasionally)
series, num_pages = pdf_extract
logger.info(f"Reading PDF file {i + 1} out of {len(pdf_list)}: \"{file}\", number of pages: {num_pages}")
logger.info(
f"Reading PDF file {i + 1} out of {len(pdf_list)}: \"{file}\", number of pages: {num_pages}")
if isinstance(series, pd.Series):
series.rename(file.replace('.pdf', ''), inplace=True)
df = pd.concat([df, series], axis=1)
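Continuing the earlier sketch (column names below are hypothetical), the resulting DataFrame has one column per PDF, named after the file without its '.pdf' extension, and one row per page, as described in the class docstring:

df = scraper.document_corpus_to_pandas_df()

print(df.columns.tolist())   # e.g. ['annual_report_2019', 'annual_report_2020']
print(df.shape)              # (max number of pages across the PDFs, number of PDFs)
# Shorter documents have NaN in the trailing rows of their column.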
