Skip to content
This repository has been archived by the owner on Jan 25, 2023. It is now read-only.

Commit

Permalink
Update scraper.py
Browse files: browse the repository at this point in the history
  • Loading branch information
andrea-sottana-MoJ authored Aug 7, 2020
1 parent 0ce3057 commit ccfdae2
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions pdf2embeddings/scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -52,9 +52,9 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
"""
assert pdf_name.endswith('.pdf'), 'Input file is not in .pdf format. The file cannot be processed.'
document_series = pd.Series()
if self.from_s3_bucket:
pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
if not self.from_s3_bucket:
pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
if self.from_s3_bucket:
pdf = s3fs.S3FileSystem().open(os.path.join(self.pdf_folder, pdf_name), 'rb')
pdf_reader = slate3k.PDF(pdf)
num_pages = len(pdf_reader)
Expand Down Expand Up @@ -110,4 +110,4 @@ def document_corpus_to_pandas_df(self) -> pd.DataFrame:
if isinstance(series, pd.Series):
series.rename(file.replace('.pdf', ''), inplace=True)
df = pd.concat([df, series], axis=1)
return df
return df

0 comments on commit ccfdae2

Please sign in to comment.