In [1]:
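# Install the required libraries if they are not already installed.
# Uncomment and run the lines below in this notebook if needed; `%pip` installs
# into the environment of the running kernel.
# Note: pytesseract also needs the Tesseract OCR program, and pdf2image needs
# Poppler; both are installed separately from these Python packages.

# %pip install openai           # For interacting with OpenAI's API
# %pip install pytesseract      # For OCR (Optical Character Recognition)
# %pip install pdf2image        # For converting PDF pages into images
# %pip install pillow           # For image processing (PIL library)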

# Import necessary libraries for PDF processing, OCR, and OpenAI API interaction
import os
import openai
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# Reaching this line means none of the imports above raised, so every library is available.
print("✅ All necessary libraries are imported successfully.")


In [2]:
# Set up the OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook itself (where it could leak through version
# control or a shared copy of the file).
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Fail here with a clear message rather than later inside the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that starts "
        "Jupyter (or set os.environ['OPENAI_API_KEY']) before running this cell."
    )


In [7]:
# Inputs: the report to process and the vocabulary of allowed terms
pdf_path = "vodafone_annual_report_reduced.pdf"
vocab_path = "vocabulary of allowed terms.rtf"

# Outputs: the folder that receives the page images and the final text file
image_dir = "extracted_images"
output_txt_path = "formatted_financial_tables.txt"

# Make sure the image folder is present; an existing folder is left as it is
os.makedirs(name=image_dir, exist_ok=True)
print(f"✅ Folder '{image_dir}' created (if not already existing).")


✅ Folder 'extracted_images' created (if not already existing).


In [4]:
# Render the PDF; the result is iterated below as one image per page
pdf_images = convert_from_path(pdf_path)

# Write every page to the image folder as a JPEG and remember where it went
image_paths = []
for page_number, page_image in enumerate(pdf_images, start=1):
    target_path = os.path.join(image_dir, f"page_{page_number}.jpg")
    page_image.save(target_path, "JPEG")
    image_paths.append(target_path)

print(f"✅ Successfully saved {len(image_paths)} images in '{image_dir}/'.")


✅ Successfully saved 20 images in 'extracted_images/'.


In [5]:
# Run OCR on every page image and collect the text as one labelled section per page.
page_sections = []
for i, img_path in enumerate(image_paths):
    # Open the image in a context manager so its file handle is released as soon
    # as OCR for this page is done, instead of being left to garbage collection.
    with Image.open(img_path) as page_image:
        extracted_text = pytesseract.image_to_string(page_image, lang="eng")
    page_sections.append(f"\n\nPage {i+1}:\n{extracted_text}")

# Join once at the end rather than growing a string inside the loop.
ocr_extracted_text = "".join(page_sections)

print("✅ OCR text extraction completed successfully.")


✅ OCR text extraction completed successfully.


In [8]:
# Function to extract plain text from RTF file
def extract_plain_text_from_rtf(file_path):
    """Return the visible text of an RTF file with the RTF markup removed.

    The file content is scanned token by token while tracking group nesting:
    formatting control words are dropped, groups that hold metadata (font
    table, colour table, stylesheet, ignorable destinations, ...) are skipped
    entirely, escaped characters are decoded, and paragraph marks become
    newline characters. This is a best-effort stripper for simple documents
    such as a term list, not a complete RTF parser.

    Args:
        file_path: Path of the RTF file; it is opened as UTF-8 text.

    Returns:
        str: The plain text, with one newline per paragraph or line break.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Destinations whose group content is metadata rather than visible text.
    hidden_destinations = {
        "fonttbl", "colortbl", "stylesheet", "info", "pict", "header",
        "footer", "footnote", "listtable", "listoverridetable", "generator",
        "fldinst",
    }
    # Control words that stand for a character in the output.
    control_word_text = {
        "par": "\n", "line": "\n", "sect": "\n\n", "page": "\n\n",
        "tab": "\t", "emdash": "\u2014", "endash": "\u2013",
        "bullet": "\u2022", "lquote": "\u2018", "rquote": "\u2019",
        "ldblquote": "\u201c", "rdblquote": "\u201d",
    }
    token_pattern = re.compile(
        r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?"  # control word, optional numeric argument
        r"|\\'([0-9a-f]{2})"                 # \'hh hex-escaped byte
        r"|\\([^a-z])"                       # control symbol (backslash + non-letter)
        r"|([{}])"                           # group open / close
        r"|[\r\n]+"                          # raw line breaks carry no meaning in RTF
        r"|(.)",                             # ordinary character
        re.IGNORECASE,
    )

    pieces = []
    group_stack = []   # (unicode_skip, hidden) saved at every "{"
    hidden = False     # True while inside a group that holds no visible text
    unicode_skip = 1   # fallback characters that follow each \uN (changed by \ucN)
    pending_skip = 0   # fallback characters still to be discarded

    for token in token_pattern.finditer(content):
        word, argument, hex_byte, symbol, brace, char = token.groups()

        if brace:
            pending_skip = 0
            if brace == "{":
                group_stack.append((unicode_skip, hidden))
            elif group_stack:
                # Leaving a group restores the state of the enclosing group.
                unicode_skip, hidden = group_stack.pop()
        elif symbol:
            pending_skip = 0
            if symbol == "*":
                # "\*" marks the rest of the group as an ignorable destination.
                hidden = True
            elif hidden:
                pass
            elif symbol in "\r\n":
                # A backslash directly before a line break is a paragraph mark.
                pieces.append("\n")
            elif symbol == "~":
                pieces.append("\u00a0")
            elif symbol in "{}\\":
                pieces.append(symbol)
        elif word:
            pending_skip = 0
            if word in hidden_destinations:
                hidden = True
            elif hidden:
                pass
            elif word in control_word_text:
                pieces.append(control_word_text[word])
            elif word == "uc" and argument is not None:
                unicode_skip = int(argument)
            elif word == "u" and argument is not None:
                code_point = int(argument)
                if code_point < 0:
                    # \uN uses signed 16-bit values; map negatives back up.
                    code_point += 0x10000
                is_valid = code_point <= 0x10FFFF and not 0xD800 <= code_point <= 0xDFFF
                pieces.append(chr(code_point) if is_valid else "\ufffd")
                # The ANSI fallback that follows \uN must not be emitted as well.
                pending_skip = unicode_skip
        elif hex_byte:
            if pending_skip > 0:
                pending_skip -= 1
            elif not hidden:
                # NOTE(review): decodes as Windows-1252; the document's \ansicpg is not consulted.
                pieces.append(
                    bytes([int(hex_byte, 16)]).decode("cp1252", errors="replace")
                )
        elif char:
            if pending_skip > 0:
                pending_skip -= 1
            elif not hidden:
                pieces.append(char)

    return "".join(pieces)

# Load and clean vocabulary list
def load_vocab(file_path):
    """Load the vocabulary terms from an RTF file, one term per line.

    Args:
        file_path: Path of the RTF vocabulary file.

    Returns:
        list[str]: The lines of the extracted text with surrounding whitespace
        removed; lines that are empty after stripping are left out.
    """
    plain_text = extract_plain_text_from_rtf(file_path)
    stripped_lines = (line.strip() for line in plain_text.split("\n"))
    # Blank lines are not terms; keeping them would put empty entries in the prompt.
    return [term for term in stripped_lines if term]

# Read the allowed terms from the vocabulary file
vocabulary_list = load_vocab(file_path=vocab_path)
print("✅ Vocabulary list loaded successfully.")


✅ Vocabulary list loaded successfully.


In [10]:
# Function to extract structured financial tables while keeping original formatting
def extract_financial_tables(text):
    """Ask the chat model to pull the financial tables out of `text`.

    A system instruction and a user message containing `text` are sent to the
    "gpt-4-turbo" chat-completions endpoint with a 4000-token reply limit.

    Args:
        text: The text to search for financial tables.

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {
        "role": "system",
        "content": "Extract all financial tables (Balance Sheet, Income Statement, Cash Flow) from the provided text while maintaining the original spacing and structure.",
    }
    user_message = {
        "role": "user",
        "content": f"Extract all financial tables from the following text and return them in their original format:\n\n{text}",
    }

    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        messages=[system_message, user_message],
        max_tokens=4000,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Send the OCR text to the model and keep the tables it returns
extracted_tables_text = extract_financial_tables(text=ocr_extracted_text)
print("✅ Successfully extracted financial tables.")


✅ Successfully extracted financial tables.


In [11]:
# Function to relabel financial terms while keeping table structure intact
def relabel_financial_terms(text, vocab):
    """Ask the chat model to rename line items using an allowed vocabulary.

    Args:
        text: The financial table text whose line items should be relabeled.
        vocab: The allowed terms, either as an iterable of terms or as a
            ready-made string. An iterable is rendered one term per line, with
            surrounding whitespace removed and empty entries left out; a
            string is inserted into the prompt unchanged.

    Returns:
        The message content of the first choice in the response.
    """
    # Interpolating a list straight into an f-string sends its Python repr
    # (brackets, quotes, escape sequences, empty strings) to the model, so the
    # terms are rendered as plain lines instead.
    if isinstance(vocab, str):
        vocab_block = vocab
    else:
        cleaned_terms = (str(term).strip() for term in vocab)
        vocab_block = "\n".join(term for term in cleaned_terms if term)

    response = openai.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a financial analyst. Replace financial line items "
                    "with the closest matching terms from a given vocabulary list, while keeping the rest of the text unchanged. "
                    "Ensure the output remains structured as a well-formatted Markdown table, preserving alignment."
                ),
            },
            {
                "role": "user",
                "content": (
                    f"Here is a financial table extracted from a report:\n\n{text}\n\n"
                    f"Replace financial line items with the closest matching terms from this vocabulary:\n{vocab_block}\n\n"
                    "Return **only** the modified table in Markdown format. Do not include any extra explanations."
                ),
            },
        ],
        max_tokens=4000
    )

    return response.choices[0].message.content

# Relabel the extracted tables using the loaded vocabulary
final_formatted_output = relabel_financial_terms(
    text=extracted_tables_text,
    vocab=vocabulary_list,
)
print("✅ Financial line items relabeled successfully.")


✅ Financial line items relabeled successfully.


In [12]:
# Write the relabeled tables to the output file as UTF-8 (an existing file is overwritten)
with open(output_txt_path, mode="w", encoding="utf-8") as output_file:
    output_file.write(final_formatted_output)

print(f"✅ Extraction process complete. Check the file '{output_txt_path}' for results.")


✅ Extraction process complete. Check the file 'formatted_financial_tables.txt' for results.
