# datasets.py
Prepares the datasets by 
- Reading pdfs in `datasets/raw`
- Parsing them into utf-8 using PyPDF2
- Writing them to text files in `datasets/out`

In [6]:
import os
import pytesseract
from pdf2image import convert_from_path

# raw pdfs
PDF_DIRECTORY = "datasets/raw"
# output directories
TEXT_DIRECTORY = "datasets/out"

def setup():
    # Create directories if they don"t exist
    os.makedirs(PDF_DIRECTORY, exist_ok=True)
    os.makedirs(TEXT_DIRECTORY, exist_ok=True)

    print("Directory setup complete.")

    for file in os.listdir(PDF_DIRECTORY):
        text = extract_text(file)
        text_file_name = file.replace('.pdf', '.txt')
        text_file_path = os.path.join(TEXT_DIRECTORY, text_file_name)

        with open(text_file_path, 'w') as text_file:
            text_file.write(text)
    
    print("Done")

def extract_text(file_name: str) -> str:
    print(f"Extracting text from {file_name}...")

    ocr_text = ""
    images = convert_from_path(os.path.join(PDF_DIRECTORY, file_name))
    for image in images:
        ocr_text += pytesseract.image_to_string(image, lang='eng')

    return ocr_text

setup()

Directory setup complete.
Extracting text from 1.pdf...
Extracting text from 2.pdf...
Done


In [3]:
from openai import OpenAI
import os

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def summarize(text: str) -> str:
    summary = ""
    chunks = [text[i:i + 4096] for i in range(0, len(text), 4096)]
    for chunk in chunks:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{
                "role": "user",
                "content": "Summarize the following, and return nothing if the text is indecipherable: " + chunk
            }],
            max_tokens=100,
        )

        result = completion.choices[0].message.content

        summary += result
    
    return summary

for file in os.listdir("datasets/out"):
    with open(os.path.join("datasets/out", file), "rt") as fin:
        text = fin.read()
        # Two iterations because the first summary is still too long
        print(file + ": " + summarize(summarize(text)))

2.txt: new sustainability goals and initiatives.
3. Expanding partnerships with other organizations to address social and environmental issues.
4. Increasing support for diverse suppliers and small businesses.
5. Expanding financial services and resources for low- and moderate-income individuals and communities.
6. Increasing employee volunteerism and community engagement.
7. Launching new products and services to meet evolving customer needs.
8. Investing in technology and digital capabilities.
9. Continuing efforts to address climate change and promote environmental sustainability.
10. StrengthenBank of America emphasizes their commitment to responsible growth and delivering strong returns to stakeholders. They mention their focus on sustainable finance, racial and gender equality, economic opportunity, environmental sustainability, and diversity. The company has issued a Sustainability Bond and invested in minority depository institutions and funds run by women and diverse entrepren