From e5ead7738c4a2cc0d4b379f41a318e2de1b79dc1 Mon Sep 17 00:00:00 2001
From: andrea-sottana-MoJ <55093726+andrea-sottana-MoJ@users.noreply.github.com>
Date: Fri, 7 Aug 2020 14:16:03 +0100
Subject: [PATCH 1/3] Update scraper.py

When the PDF is not found on the local filesystem, retry the same path
through s3fs so that S3 bucket paths can be read as well.

The opened handle (local or S3) is consumed inside a `with` block, so it
is closed even if slate3k or the page clean-up raises. The final
FileNotFoundError is chained to the S3 error with `from err`.

---
 pdf2embeddings/scraper.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

Note: this patch was edited by hand after generation; the post-image blob
id on the "index" line below has not been recomputed.

diff --git a/pdf2embeddings/scraper.py b/pdf2embeddings/scraper.py
index 24a642f..ab5f43b 100755
--- a/pdf2embeddings/scraper.py
+++ b/pdf2embeddings/scraper.py
@@ -1,3 +1,4 @@
+import s3fs
 import slate3k
 import logging
 from tqdm import tqdm
@@ -50,7 +51,17 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
         """
         assert pdf_name.endswith('.pdf'), 'Input file is not in .pdf format. The file cannot be processed.'
         document_series = pd.Series()
-        with open(os.path.join(self.pdf_folder, pdf_name), 'rb') as pdf:
+        try:
+            pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
+        except FileNotFoundError:  # check if the path corresponds to an S3 bucket
+            try:
+                pdf = s3fs.S3FileSystem().open(os.path.join(self.pdf_folder, pdf_name), 'rb')
+            except FileNotFoundError as err:
+                raise FileNotFoundError(
+                    f"{err}. We also tried to look for an S3 bucket path but could not find any. Other types of cloud "
+                    f"storage are not natively supported by pdf2embeddings."
+                ) from err
+        with pdf:  # close the handle (local or S3) even if parsing below raises
             pdf_reader = slate3k.PDF(pdf)
             num_pages = len(pdf_reader)
             for i, page in enumerate(pdf_reader):

From c3b608ec43ad17789bdc046a759ef358da64d448 Mon Sep 17 00:00:00 2001
From: andrea-sottana-MoJ <55093726+andrea-sottana-MoJ@users.noreply.github.com>
Date: Fri, 7 Aug 2020 14:18:50 +0100
Subject: [PATCH 2/3] Update requirements.txt

Add PyYAML, setuptools, s3fs and sklearn to the requirements file.

---
 requirements.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 7f25c1d..67c49dd 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,8 @@ coverage
 flake8
 python-dotenv>=0.5.1
 Sphinx
+
+PyYAML~=5.1.2
+setuptools~=47.1.1
+s3fs~=0.4.2
+sklearn~=0.0

From fc085c7fb1126ca1ddaf75bd9d12abde274c6d23 Mon Sep 17 00:00:00 2001
From: andrea-sottana-MoJ <55093726+andrea-sottana-MoJ@users.noreply.github.com>
Date: Fri, 7 Aug 2020 14:19:51 +0100
Subject: [PATCH 3/3] Update setup.py

Add PyYAML, setuptools, s3fs and sklearn to the package's dependency list.

---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 20a4031..3c31b48 100755
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,11 @@ def read(*paths):
         "sentence-transformers==0.2.5.1",
         "slate3k==0.5.3",
         "typing==3.7.4.1",
-        "tqdm==4.45.0"
+        "tqdm==4.45.0",
+        "PyYAML~=5.1.2",
+        "setuptools~=47.1.1",
+        "s3fs~=0.4.2",
+        "sklearn~=0.0"
     ],
     author='moj-analytical-services',
     classifiers=[