Skip to content
This repository has been archived by the owner on Jan 25, 2023. It is now read-only.

Commit

Permalink
Add try/except to handle PDFs that occasionally fail to parse
Browse files (browse the repository at this point in the history)
  • Loading branch information
andrea-sottana-moj committed Aug 13, 2020
1 parent 4148e18 commit f4a47f1
Showing 1 changed file with 23 additions and 16 deletions.
39 changes: 23 additions & 16 deletions pdf2embeddings/scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -71,16 +71,21 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
else:
pdf = s3fs.S3FileSystem().open(pdf_name, 'rb') # no need to join with self.pdf_folder as s3fs includes that
pdf_reader = slate3k.PDF(pdf)
num_pages = len(pdf_reader)
for i, page in enumerate(pdf_reader):
logger.debug(f'Reading page {i+1} of PDF file {pdf_name}')
page_text = self._clean_text(page)
page_series = pd.Series(page_text)
document_series = document_series.append(page_series, ignore_index=True)
pdf.close()

return document_series, num_pages
try:
pdf_reader = slate3k.PDF(pdf)
except Exception as err:
logger.error(f"The following file could not be parsed: {pdf}.\nThis error was generated: {err}.")
pdf.close()
return None
else:
num_pages = len(pdf_reader)
for i, page in enumerate(pdf_reader):
logger.debug(f'Reading page {i+1} of PDF file {pdf_name}')
page_text = self._clean_text(page)
page_series = pd.Series(page_text)
document_series = document_series.append(page_series, ignore_index=True)
pdf.close()
return document_series, num_pages

def _clean_text(self, text: str) -> str:
"""
Expand Down Expand Up @@ -109,9 +114,11 @@ def document_corpus_to_pandas_df(self) -> pd.DataFrame:
logger.info('Starting scraping PDFs...')
for i, file in enumerate(tqdm(sorted(pdf_list))):
# sorted is so pdfs are extracted in alphabetic order, and to make testing more robust.
series, num_pages = self._text_to_series_of_pages(file)
logger.info(f"Reading PDF file {i + 1} out of {len(pdf_list)}: \"{file}\", number of pages: {num_pages}")
if isinstance(series, pd.Series):
series.rename(file.replace('.pdf', ''), inplace=True)
df = pd.concat([df, series], axis=1)
return df
pdf_extract = self._text_to_series_of_pages(file)
if pdf_extract is not None: # excluding case when PDF could not be parsed due to encoding errors (it happens occasionally)
series, num_pages = pdf_extract
logger.info(f"Reading PDF file {i + 1} out of {len(pdf_list)}: \"{file}\", number of pages: {num_pages}")
if isinstance(series, pd.Series):
series.rename(file.replace('.pdf', ''), inplace=True)
df = pd.concat([df, series], axis=1)
return df

0 comments on commit f4a47f1

Please sign in to comment.