# datasets.py
Prepares the datasets by:
- Reading PDFs in `datasets/raw`
- Converting them to UTF-8 text with OCR (pdf2image + pytesseract, not PyPDF2)
- Writing the text to files in `datasets/out`

In [6]:
import os
import pytesseract
from pdf2image import convert_from_path

# Directory that holds the raw input PDFs (created by setup() if missing)
PDF_DIRECTORY = "datasets/raw"
# Directory that receives one extracted .txt file per input PDF
TEXT_DIRECTORY = "datasets/out"

def setup():
    """Create the dataset directories and OCR every PDF into a text file.

    Ensures ``PDF_DIRECTORY`` and ``TEXT_DIRECTORY`` exist, then runs
    ``extract_text`` on each ``*.pdf`` entry of ``PDF_DIRECTORY`` and writes
    the result to ``TEXT_DIRECTORY/<stem>.txt`` as UTF-8.

    Entries whose extension is not ``.pdf`` (case-insensitive) are skipped,
    so stray files or sub-directories do not reach the OCR step.
    """
    # Create directories if they don't exist
    os.makedirs(PDF_DIRECTORY, exist_ok=True)
    os.makedirs(TEXT_DIRECTORY, exist_ok=True)

    print("Directory setup complete.")

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file in sorted(os.listdir(PDF_DIRECTORY)):
        stem, extension = os.path.splitext(file)
        if extension.lower() != ".pdf":
            continue

        text = extract_text(file)
        # Swap only the real extension; str.replace would also rewrite a
        # ".pdf" that appears in the middle of the name and miss ".PDF".
        text_file_path = os.path.join(TEXT_DIRECTORY, stem + ".txt")

        # Explicit UTF-8 so OCR output with non-ASCII characters is written
        # the same way regardless of the platform's default encoding.
        with open(text_file_path, 'w', encoding="utf-8") as text_file:
            text_file.write(text)

    print("Done")

def extract_text(file_name: str) -> str:
    """Return the OCR text of one PDF located in ``PDF_DIRECTORY``.

    The PDF is handed to ``convert_from_path`` and each item it yields is
    passed to ``pytesseract.image_to_string`` with ``lang='eng'``. The
    per-item strings are concatenated in order with no separator.

    Args:
        file_name: Name of the PDF file, relative to ``PDF_DIRECTORY``.

    Returns:
        The concatenated OCR output (empty string when nothing is yielded).
    """
    print(f"Extracting text from {file_name}...")

    pdf_path = os.path.join(PDF_DIRECTORY, file_name)
    pages = convert_from_path(pdf_path)
    return "".join(
        pytesseract.image_to_string(page, lang='eng') for page in pages
    )

# Run the directory setup and PDF-to-text extraction when this cell executes.
setup()

Directory setup complete.
Extracting text from 1.pdf...
Extracting text from 2.pdf...
Done


In [12]:
from openai import OpenAI
import os

# Shared API client used by summarize(). The key is read from the
# OPENAI_API_KEY environment variable; a missing variable raises KeyError here.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def summarize(file, chunk_size=4096):
    """Summarize one extracted text file chunk by chunk via the chat API.

    The file ``datasets/out/<file>`` is read completely, split into
    consecutive slices of ``chunk_size`` characters, and each slice is sent
    as a separate chat-completion request. Every partial summary is printed
    as it arrives and the partial summaries are concatenated (no separator)
    into the return value.

    Args:
        file: Name of the text file inside ``datasets/out``.
        chunk_size: Maximum number of characters per request. This counts
            characters, not model tokens. Defaults to 4096.

    Returns:
        The concatenation of all per-chunk summaries; an empty string when
        the file is empty.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Read everything first so the file handle is closed before the slow
    # network calls start. Undecodable bytes are replaced rather than fatal
    # because the text is only used as summarization input.
    with open(
        os.path.join("datasets/out", file),
        "rt",
        encoding="utf-8",
        errors="replace",
    ) as fin:
        text = fin.read()

    partial_summaries = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{
                "role": "user",
                "content": "Summarize the following, and return nothing if the text is indecipherable: " + chunk
            }],
            # Caps the length of each partial summary, so long summaries
            # may be cut off mid-sentence.
            max_tokens=100,
        )

        # message.content is optional in the API response; treat a missing
        # value as an empty summary instead of failing on concatenation.
        result = completion.choices[0].message.content or ""
        print(result)

        partial_summaries.append(result)

    return "".join(partial_summaries)

# Trial run: summarize only the first entry returned by os.listdir (whose
# order is arbitrary); the unconditional break stops after one file. The
# returned summary is discarded, so only summarize()'s own prints are shown.
for file in os.listdir("datasets/out"):
    summarize(file)
    break

The text is a snippet from the annual report of Bank of America for the year 2022. It includes information about the company's lines of business, company performance, community impact, human capital management, financial highlights, and a letter from the CEO. The report highlights the company's adoption of Responsible Growth as a guiding principle and its positive impact on the company's performance in various economic environments. It also provides financial metrics and key performance indicators.
The text discusses the financial performance of Bank of America from 2015 to 2022. It mentions that the company experienced organic growth and managed expenses effectively, leading to strong financial results. Despite challenges such as the tightening cycle initiated by the Federal Reserve and the pandemic, the company performed well, doubling its stock price and generating $185 billion in after-tax earnings. The text also highlights the company's positioning for the future, with a focus on 

KeyboardInterrupt: 