In [32]:
%%capture --no-stderr
%pip install "pyautogen>=0.2.3" pandas matplotlib PyPDF2
%pip install marly
%pip install groq

In [33]:
import marly
from marly import Marly
import logging
import base64
import zlib
import os
import json
from dotenv import load_dotenv
import time
# Load variables from a local .env file (if one is present) into the process
# environment; GROQ_API_KEY is later read from os.environ.
load_dotenv()

# Marly client talking to a server on localhost, port 8100.
client = Marly(base_url="http://localhost:8100")

# Path of the PDF that gets processed.
PDF_FILE_PATH = "./lacers_reduced.pdf"

# Schema for Marly: each key is a table column name and each value is a brief
# description of that column. It is serialized to a JSON string.
SCHEMA = json.dumps(
    {
        "Firm": "The name of the firm",
        "Number of Funds": "The number of funds managed by the firm",
        "Commitment": "The commitment amount in millions of dollars",
        "Percent of Total Comm": "The percentage of total commitment",
        "Exposure (FMV + Unfunded)": "The exposure including fair market value and unfunded commitments in millions of dollars",
        "Percent of Total Exposure": "The percentage of total exposure",
        "TVPI": "Total Value to Paid-In multiple",
        "Net IRR": "Net Internal Rate of Return as a percentage",
    }
)

def read_and_encode_pdf(file_path):
    """Read a file and return its contents compressed and base64-encoded.

    Args:
        file_path: Path of the file to read (opened in binary mode).

    Returns:
        str: The file bytes after ``zlib.compress`` and ``base64.b64encode``,
        decoded as UTF-8 text.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
        compressed_bytes = zlib.compress(raw_bytes)
        encoded_bytes = base64.b64encode(compressed_bytes)
        pdf_content = encoded_bytes.decode('utf-8')
    logging.debug(f"{file_path} read and encoded")
    return pdf_content

def to_dict(results):
    """Convert an iterable of result objects into a plain dictionary.

    Args:
        results: Iterable whose items each provide a ``to_dict()`` method.

    Returns:
        dict: ``{"results": [...]}`` where the list holds ``item.to_dict()``
        for every item, in iteration order.
    """
    serialized = []
    for item in results:
        serialized.append(item.to_dict())
    return {"results": serialized}

def process_pdf_tool(pdf_file_path: str, schema: str, max_attempts: int = 100, poll_interval: float = 1.0) -> list[dict]:
    """Submit a PDF to a Marly pipeline and poll until it finishes.

    The PDF is read and encoded with ``read_and_encode_pdf`` and sent together
    with ``schema`` as a single workload. The pipeline is then polled by task id
    until it reports ``COMPLETED`` or ``FAILED``, or until the attempts run out.

    Args:
        pdf_file_path: Path of the PDF file to process.
        schema: Schema string passed to the pipeline as the only entry of
            ``schemas``.
        max_attempts: Maximum number of status polls before giving up.
        poll_interval: Seconds to sleep before each status poll.

    Returns:
        On ``COMPLETED``, the value of ``to_dict(results.results)``, i.e. a dict
        with a single ``"results"`` key. ``None`` if the pipeline could not be
        created, reported ``FAILED``, or did not finish within ``max_attempts``
        polls. NOTE(review): the ``list[dict]`` annotation does not match these
        values; it is kept unchanged so the signature stays the same.
    """
    try:
        pipeline_response_model = client.pipelines.create(
            api_key=os.environ.get("GROQ_API_KEY"),
            provider_model_name="llama-3.1-70b-versatile",
            provider_type="groq",
            workloads=[
                {
                    "pdf_stream": read_and_encode_pdf(pdf_file_path),
                    "schemas": [schema]
                }
            ]
        )
    except marly.APIConnectionError as e:
        print("The server could not be reached")
        print(e.__cause__)
        # Without a pipeline there is no task id to poll, so stop here instead
        # of failing later with an AttributeError on None.
        return None
    except marly.RateLimitError:
        print("A 429 status code was received; we should back off a bit.")
        return None
    except marly.APIStatusError as e:
        print("Another non-200-range status code was received")
        print(e.status_code)
        print(e.response)
        return None

    for attempt in range(1, max_attempts + 1):
        logging.debug(f"Waiting for pipeline to complete. Attempt {attempt} of {max_attempts}")
        time.sleep(poll_interval)

        results = client.pipelines.retrieve(pipeline_response_model.task_id)
        logging.debug(f"Poll attempt {attempt}: Status - {results.status}")

        if results.status == 'COMPLETED':
            logging.debug(f"Pipeline completed with results: {results.results}")
            return to_dict(results.results)
        if results.status == 'FAILED':
            # The response is read through attributes everywhere else in this
            # loop, so fetch the error message the same way rather than with a
            # dict-style .get() call.
            error_message = getattr(results, 'error_message', None) or 'Unknown error'
            logging.error(f"Error: {error_message}")
            return None

    logging.warning("Timeout: Pipeline execution took too long.")
    return None


    


In [34]:
import autogen

# Model endpoint configurations are read from the OAI_CONFIG_LIST.json file.
config_list = autogen.config_list_from_json("OAI_CONFIG_LIST.json")

llm_config = dict(config_list=config_list, timeout=120)

# Assistant agent; its system message tells it to use only the provided
# functions for PDF tasks and to reply TERMINATE once the task is done.
chatbot = autogen.AssistantAgent(
    name="chatbot",
    llm_config=llm_config,
    system_message="For pdf processing tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.",
)

# UserProxyAgent named "user_proxy": never asks for human input, auto-replies
# at most 10 times in a row, and treats a message whose content ends with
# "TERMINATE" (ignoring trailing whitespace) as the end of the chat.
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    is_termination_msg=lambda msg: (content := msg.get("content", "")) and content.rstrip().endswith("TERMINATE"),
)


In [35]:
from typing_extensions import Annotated

@user_proxy.register_for_execution()
@chatbot.register_for_llm(description="PDF processor.")
def process_pdf(
    pdf_file_path: Annotated[str, "PDF to be processed"],
    schema: Annotated[str, "Schema for the data to be extracted"],
) -> list[dict]:
    """Tool entry point: forward both arguments to ``process_pdf_tool``.

    Registered on ``chatbot`` for the LLM and on ``user_proxy`` for execution.
    Returns whatever ``process_pdf_tool`` returns.
    """
    extraction = process_pdf_tool(pdf_file_path, schema)
    return extraction

In [38]:
from autogen.cache import Cache
# Start the conversation from user_proxy to chatbot, passing an autogen disk
# cache; the chat result is kept in `res`.
with Cache.disk() as cache:
    res = user_proxy.initiate_chat(
        chatbot,
        message=f"What data can you extract from the pdf under path {PDF_FILE_PATH} with the given schema {SCHEMA}",
        summary_method="reflection_with_llm",
        cache=cache,
    )

[33muser_proxy[0m (to chatbot):

What data can you extract from the pdf under path ./lacers_reduced.pdf with the given schema {"Firm": "The name of the firm", "Number of Funds": "The number of funds managed by the firm", "Commitment": "The commitment amount in millions of dollars", "Percent of Total Comm": "The percentage of total commitment", "Exposure (FMV + Unfunded)": "The exposure including fair market value and unfunded commitments in millions of dollars", "Percent of Total Exposure": "The percentage of total exposure", "TVPI": "Total Value to Paid-In multiple", "Net IRR": "Net Internal Rate of Return as a percentage"}

--------------------------------------------------------------------------------
[33mchatbot[0m (to user_proxy):

[32m***** Suggested tool call (call_sf6j): process_pdf *****[0m
Arguments: 
{"pdf_file_path": "./lacers_reduced.pdf", "schema": "{\"Firm\": \"The name of the firm\", \"Number of Funds\": \"The number of funds managed by the firm\", \"Commitment\"