In [1]:
# Third-party packages used by this notebook. Uncomment the lines below to
# install them into the kernel's environment if they are missing.
# %pip install boto3
# %pip install python-dotenv
# %pip install PyPDF2

In [2]:
import boto3
import PyPDF2
from io import BytesIO

Load Credentials

In [3]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Pick the directory that should contain credentials.env: CREDENTIAL_DOTENV_DIR
# if set, otherwise the PWD environment variable, otherwise '/opt/app-root/src'.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Only load the file when it is present; override=True lets values from the
# file replace variables that are already set in the process environment.
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path, override=True)

Create an S3 resource

In [4]:
# Build the boto3 S3 resource from settings in the environment. Indexing
# os.environ directly raises KeyError straight away if a setting is missing.
s3_resource = boto3.resource(
    "s3",
    endpoint_url=os.environ['S3_ENDPOINT'],
    aws_access_key_id=os.environ['S3_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_SECRET_KEY'],
)

List the keys of all S&P Global ESG report PDFs in the bucket

In [5]:
# Collect the object keys of every PDF stored under the S&P Global ESG prefix.
my_bucket = s3_resource.Bucket(os.environ['S3_BUCKET'])
esg_prefix = 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/'

# Match on the '.pdf' extension (dot included, case-insensitive) so that keys
# merely ending in the letters "pdf" are skipped and '.PDF' files are kept.
files = [
    obj.key
    for obj in my_bucket.objects.filter(Prefix=esg_prefix)
    if obj.key.lower().endswith('.pdf')
]

# Preview only the first ten keys; the full listing can be long.
files[:10]

['redhat-osc-physicalrisk-upload/S&PGlobal_ESG/174656086_Glenmark Pharmaceu_2014-03-31.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/174656666_Glenmark Pharmaceu_2013-03-31.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/215786614_C.P. Pokphand Co_2017-12-31.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/223279479_Glenmark Pharmaceu_2018-03-31.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/231699798_Volkswagen AG_2017-12-31.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/271839402_T&D Hldgs Inc_2018-12-24.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/272156379_Mitsui E&S Hldgs C_2018-12-27.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/272526067_Equity Resdl_2017-12-31.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/272561465_Johnson & Johnson_2019-01-02.pdf',
 'redhat-osc-physicalrisk-upload/S&PGlobal_ESG/272561483_Citigroup Inc_2019-01-02.pdf']

Process a PDF file and perform text extraction

In [6]:
# Download the raw bytes of the first PDF in the listing.
pdf_file = s3_resource.Object(os.environ['S3_BUCKET'], files[0])
pdf_response = pdf_file.get()
# 'Body' is a stream; read() pulls the whole object into memory as bytes.
pdf_file_body = pdf_response['Body'].read()

In [7]:
# Parse the downloaded bytes as a PDF and extract the text of its first page.
pdf_stream = BytesIO(pdf_file_body)

if hasattr(PyPDF2, "PdfReader"):
    # Current PyPDF2 API. The legacy camelCase names used below were
    # deprecated and later removed (they raise DeprecationError in 3.x).
    pdfReader = PyPDF2.PdfReader(pdf_stream)
    pageObj = pdfReader.pages[0]
    page1 = pageObj.extract_text()
else:
    # Fallback for old PyPDF2 releases that predate the renamed API.
    pdfReader = PyPDF2.PdfFileReader(pdf_stream)
    pageObj = pdfReader.getPage(0)
    page1 = pageObj.extractText()

page1

'ENRICHING LIVES TO CR\nEAT\nE A H\nEALTHI\nER AND HAPPI\nER WORLD\nCSR\n REPORT 2013-14\n'