# Text Extraction

In [1]:
import os
import io
import base64
import json
import concurrent.futures
import re

from openai import OpenAI, AzureOpenAI
from pdf2image import convert_from_path
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer

In [2]:
client = OpenAI(api_key="YOUR_API_KEY")
model_name = "gpt-4o" 

In [3]:
completion = client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"}
    ]
)

print(completion.choices[0].message.content)

The capital of France is Paris.


In [4]:
def get_img_uri(img):
    png_buffer = io.BytesIO()
    img.save(png_buffer, format="PNG")
    png_buffer.seek(0)

    base64_png = base64.b64encode(png_buffer.read()).decode('utf-8')

    data_uri = f"data:image/png;base64,{base64_png}"
    return data_uri

In [5]:
system_prompt = '''
You will be provided with an image of a PDF page or a slide. Your goal is to deliver a detailed and engaging presentation about the content you see, using clear and accessible language suitable for a 101-level audience.

If there is an identifiable title, start by stating the title to provide context for your audience.

Describe visual elements in detail:

- **Diagrams**: Explain each component and how they interact. For example, "The process begins with X, which then leads to Y and results in Z."
  
- **Tables**: Break down the information logically. For instance, "Product A costs X dollars, while Product B is priced at Y dollars."

Focus on the content itself rather than the format:

- **DO NOT** include terms referring to the content format.
  
- **DO NOT** mention the content type. Instead, directly discuss the information presented.

Keep your explanation comprehensive yet concise:

- Be exhaustive in describing the content, as your audience cannot see the image.
  
- Exclude irrelevant details such as page numbers or the position of elements on the image.

Use clear and accessible language:

- Explain technical terms or concepts in simple language appropriate for a 101-level audience.

Engage with the content:

- Interpret and analyze the information where appropriate, offering insights to help the audience understand its significance.

------

If there is an identifiable title, present the output in the following format:

{TITLE}

{Content description}

If there is no clear title, simply provide the content description.
'''

In [6]:
def analyze_image(data_uri):
    # time.sleep(1)
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"{data_uri}"}}
                ]
            },
        ],
        max_tokens=1000,
        temperature=0,
        top_p=0.1
    )
    return response.choices[0].message.content




def analyze_doc_image(img):
    img_uri = get_img_uri(img)
    return analyze_image(img_uri)

In [7]:
def extract_company_and_year(filename):
    abv_name_map = {'goog': 'google', 'msft': 'microsoft', 'nvda': 'nvidia'}
    abv, dt = filename.replace('.pdf', '').split('-')
    year = dt.strip()[:4]
    name = abv_name_map[abv]
    return name, year

In [8]:
def extract_text_per_page(pdf_path):
    # Returns list of text, one per page, using pdfminer layout
    pages = []
    for page_layout in extract_pages(pdf_path):
        page_text = ""
        for elem in page_layout:
            if isinstance(elem, LTTextContainer):
                page_text += elem.get_text()
        pages.append(page_text)
    return pages

In [9]:
files_path = "data"
files = [f for f in os.listdir(files_path) if f.endswith(".pdf")]#[:1]

In [10]:
files

['msft-20230630.pdf',
 'goog-20241231.pdf',
 'nvda-20220130.pdf',
 'msft-20220630.pdf',
 'nvda-20240128.pdf',
 'goog-20221231.pdf',
 'msft-20240630.pdf',
 'goog-20231231.pdf',
 'nvda-20230129.pdf']

In [None]:
# for f in files:
#     path = f"{files_path}/{f}"
#     doc = {"filename": f}
#     imgs = convert_from_path(path)
#     pages_description = []

#     print(f"Analyzing {f} ({len(imgs)} pages)")

#     with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
#         futures = [executor.submit(analyze_doc_image, img) for img in imgs]
#         for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
#             pass
#         for future in futures:
#             pages_description.append(future.result())

#     doc['pages_description'] = pages_description
#     docs.append(doc)

hybrid_docs = []

for fname in files:
    path = os.path.join(files_path, fname)
    company, year = extract_company_and_year(fname)
    imgs = convert_from_path(path)
    page_texts = extract_text_per_page(path)
    assert len(imgs) == len(page_texts), "Mismatch page count!"

    print(f"Analyzing {fname} ({len(imgs)} pages)...")
    page_entries = []

    def process_page(args):
        idx, img, text = args
        img_uri = get_img_uri(img)
        img_desc = analyze_image(img_uri)
        return {
            "page_number": idx+1,
            "company": company,
            "year": year,
            "pdf_name": fname,
            "raw_text": text,
            "img_desc": img_desc
        }

    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        futures = [
            executor.submit(process_page, (idx, img, txt))
            for idx, (img, txt) in enumerate(zip(imgs, page_texts))
        ]
        for f in concurrent.futures.as_completed(futures):
            pass
        page_entries = [f.result() for f in futures]

    hybrid_docs.extend(page_entries)

In [12]:
with open("data_processed/parsed_pdf_docs.json", 'w') as f:
    json.dump(hybrid_docs, f)