Skip to content
This repository has been archived by the owner on Jan 25, 2023. It is now read-only.

Commit

Permalink
Merge pull request #3 from moj-analytical-services/s3_compatibility
Browse files Browse the repository at this point in the history
S3 compatibility
  • Loading branch information
andrea-sottana-MoJ committed Aug 7, 2020
2 parents b39bbc9 + fc085c7 commit c63173b
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 9 deletions.
29 changes: 21 additions & 8 deletions pdf2embeddings/scraper.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import s3fs
import slate3k
import logging
from tqdm import tqdm
@@ -50,14 +51,25 @@ def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
"""
assert pdf_name.endswith('.pdf'), 'Input file is not in .pdf format. The file cannot be processed.'
document_series = pd.Series()
with open(os.path.join(self.pdf_folder, pdf_name), 'rb') as pdf:
pdf_reader = slate3k.PDF(pdf)
num_pages = len(pdf_reader)
for i, page in enumerate(pdf_reader):
logger.debug(f'Reading page {i+1} of PDF file {pdf_name}')
page_text = self._clean_text(page)
page_series = pd.Series(page_text)
document_series = document_series.append(page_series, ignore_index=True)
try:
pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
except FileNotFoundError: # check if the path corresponds to an S3 bucket
try:
pdf = s3fs.S3FileSystem().open(os.path.join(self.pdf_folder, pdf_name), 'rb')
except FileNotFoundError as err:
raise FileNotFoundError(
f"{err}. We also tried to look for an S3 bucket path but could not find any. Other types of cloud "
f"storage are not natively supported by pdf2embeddings."
)
pdf_reader = slate3k.PDF(pdf)
num_pages = len(pdf_reader)
for i, page in enumerate(pdf_reader):
logger.debug(f'Reading page {i+1} of PDF file {pdf_name}')
page_text = self._clean_text(page)
page_series = pd.Series(page_text)
document_series = document_series.append(page_series, ignore_index=True)
pdf.close()

return document_series, num_pages

def _clean_text(self, text: str) -> str:
@@ -92,3 +104,4 @@ def document_corpus_to_pandas_df(self) -> pd.DataFrame:
series.rename(file.replace('.pdf', ''), inplace=True)
df = pd.concat([df, series], axis=1)
return df

5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -21,3 +21,8 @@ coverage
flake8
python-dotenv>=0.5.1
Sphinx

PyYAML~=5.1.2
setuptools~=47.1.1
s3fs~=0.4.2
sklearn~=0.0
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -37,7 +37,11 @@ def read(*paths):
"sentence-transformers==0.2.5.1",
"slate3k==0.5.3",
"typing==3.7.4.1",
"tqdm==4.45.0"
"tqdm==4.45.0",
"PyYAML~=5.1.2",
"setuptools~=47.1.1",
"s3fs~=0.4.2",
"sklearn~=0.0"
],
author='moj-analytical-services',
classifiers=[
Expand Down

0 comments on commit c63173b

Please sign in to comment.