# OpenAI Batch API

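Be careful when running this notebook: several cells upload files and create batch jobs through the OpenAI API, which costs money.

Do not "Run All"! Run the cells one at a time.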

https://platform.openai.com/docs/guides/batch/getting-started


In [2]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from pprint import pprint

In [7]:
# --- Files -------------------------------------------------------------------
# RAW_INPUT is not read by any cell visible in this notebook.
# NOTE(review): unlike the two paths below it has no "../" prefix — confirm this is intended.
RAW_INPUT = "input/stj/processos_3911.csv"
# JSONL request file that `upload_file` sends to the API.
BATCH_FILE = "../output/stj/processos_3911_pre_batch.jsonl"
# JSONL result file written by `retrieve_batch` and read by the cost/error checks.
OUTPUT = "../output/stj/processos_3911_post_batch.jsonl"

# --- Batch job settings ------------------------------------------------------
AI_MODEL = "gpt-4o"
# Attached to the batch in `create_batch`.
METADATA = dict(description="STJ_3911")

# --- Account limits ----------------------------------------------------------
# Presumably the organization's batch queue limit — TODO confirm the unit.
BATCH_SIZE_LIMIT = 80 * 10**6  # tier 4 org

In [4]:
# Read the whole prompt template into memory as UTF-8 text.
with open("prompt.txt", encoding="utf-8") as prompt_file:
    PROMPT = prompt_file.read()

# Bare expression: the notebook displays the loaded template for a visual check.
PROMPT

'Você foi encarregado de analisar um documento jurídico chamado "certidão de julgamento" e responder a perguntas específicas sobre ele. Suas respostas devem ser fornecidas em formato JSON.\n\nAqui está o texto da certidão de julgamento:\n\n<certidao>\n{variable}\n</certidao>\n\nLeia e analise cuidadosamente o texto acima para responder às seguintes perguntas:\n\n1. Qual foi o resultado do julgamento? Responda com "ACEITO", "PARCIAL" ou "NEGADO".\n2. A certidão menciona que o julgamento foi realizado em sessão virtual? Responda com "SIM" ou "NAO".\n3. A decisão foi unânime (unanimidade)? Responda com "SIM" ou "NAO".\n4. Houve efeitos modificativos? Responda com "SIM", "NAO". Se não houver menção de efeitos modificativos, responda null.\n5. Como foi julgado o conhecimento (conhecer)? Responda com "SIM", "PARCIAL", "NAO". Se não houver menção de conhecimento, responda null.\n6. Quais ministros votaram a favor?\n7. Quais ministros foram vencidos?\n8. Quais ministros fizeram um "voto-vista"

In [9]:
# Load environment variables (python-dotenv reads them from a local .env file,
# if one is present) so that no credential is hard-coded in the notebook.
load_dotenv()
# API client shared by the cells below. Organization, project and key all come
# from the environment; os.getenv yields None for any variable that is not set.
client = OpenAI(
    organization=os.getenv("OPENAI_ORGANIZATION"),
    project=os.getenv("OPENAI_PROJECT"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [10]:
def upload_file():
    """Upload the batch request file (``BATCH_FILE``) to the API.

    The file is sent with ``purpose="batch"``. The attributes of the returned
    file object are pretty-printed so the new file id is visible in the cell
    output.

    Returns:
        The file object returned by ``client.files.create``.
    """
    # Use a context manager so the handle is closed as soon as the upload call
    # returns (or raises), instead of staying open for the kernel's lifetime.
    with open(BATCH_FILE, "rb") as batch_file:
        batch_input_file = client.files.create(
            file=batch_file,
            purpose="batch",
        )
    pprint(vars(batch_input_file))
    return batch_input_file


# Performs the upload (an API call, made each time this cell runs) and keeps
# the returned file id, which the batch-creation cell passes to `create_batch`.
upload_response = upload_file()
FILE_ID = upload_response.id

{'bytes': 12266654,
 'created_at': 1720659260,
 'filename': 'processos_3911_pre_batch.jsonl',
 'id': 'file-c3HDcZ64LoXwvoAdUVxYk268',
 'object': 'file',
 'purpose': 'batch',
 'status': 'processed',
 'status_details': None}


In [12]:
# Hard-coded copy of the id printed by the upload cell; presumably kept so that
# FILE_ID can be restored after a kernel restart without uploading again.
FILE_ID = "file-c3HDcZ64LoXwvoAdUVxYk268"

In [13]:
def create_batch(file_id: str, metadata: dict):
    """Create a batch job for an already-uploaded request file.

    Args:
        file_id: Id of the uploaded input file.
        metadata: Metadata dictionary attached to the batch.

    Returns:
        The batch object returned by ``client.batches.create``; its attributes
        are also pretty-printed.
    """
    request = {
        "input_file_id": file_id,
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
        "metadata": metadata,
    }
    batch = client.batches.create(**request)
    pprint(vars(batch))
    return batch


# Creates a new batch job each time this cell runs and keeps its id, which the
# status and retrieval cells below rely on.
batch_response = create_batch(FILE_ID, METADATA)
BATCH_ID = batch_response.id

{'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': None,
 'completion_window': '24h',
 'created_at': 1720659457,
 'endpoint': '/v1/chat/completions',
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1720745857,
 'failed_at': None,
 'finalizing_at': None,
 'id': 'batch_Ph5MRFohup1eKe0uxWM7vPK7',
 'in_progress_at': None,
 'input_file_id': 'file-c3HDcZ64LoXwvoAdUVxYk268',
 'metadata': {'description': 'STJ_3911'},
 'object': 'batch',
 'output_file_id': None,
 'request_counts': BatchRequestCounts(completed=0, failed=0, total=0),
 'status': 'validating'}


Batch(id='batch_Ph5MRFohup1eKe0uxWM7vPK7', completion_window='24h', created_at=1720659457, endpoint='/v1/chat/completions', input_file_id='file-c3HDcZ64LoXwvoAdUVxYk268', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1720745857, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'STJ_3911'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [21]:
# Fetch the batches visible to this client and dump the page object's attributes.
# Bound to `batches` rather than `list` so the built-in `list` type is not
# shadowed for every cell executed afterwards.
batches = client.batches.list()
pprint(vars(batches))

{'_client': <openai.OpenAI object at 0x7f2a4c252800>,
 '_model': <class 'openai.types.batch.Batch'>,
 '_options': FinalRequestOptions(method='get', url='/batches', params={}, headers=NOT_GIVEN, max_retries=NOT_GIVEN, timeout=NOT_GIVEN, files=None, idempotency_key=None, post_parser=<function SyncAPIClient._request_api_list.<locals>._parser at 0x7f2a37124af0>, json_data=None, extra_json=None),
 'data': [Batch(id='batch_Ph5MRFohup1eKe0uxWM7vPK7', completion_window='24h', created_at=1720659457, endpoint='/v1/chat/completions', input_file_id='file-c3HDcZ64LoXwvoAdUVxYk268', object='batch', status='finalizing', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1720745857, failed_at=None, finalizing_at=1720659773, in_progress_at=1720659460, metadata={'description': 'STJ_3911'}, output_file_id=None, request_counts=BatchRequestCounts(completed=3911, failed=0, total=3911))]}


In [27]:
def check_batch(batch_id: str):
    """Fetch the current batch object for ``batch_id`` from the API and return it."""
    return client.batches.retrieve(batch_id)


# Poll the batch once and show every attribute (status, timestamps, request
# counts, output file id) of the returned object.
result = check_batch(BATCH_ID)
pprint(vars(result))

{'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1720659929,
 'completion_window': '24h',
 'created_at': 1720659457,
 'endpoint': '/v1/chat/completions',
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1720745857,
 'failed_at': None,
 'finalizing_at': 1720659773,
 'id': 'batch_Ph5MRFohup1eKe0uxWM7vPK7',
 'in_progress_at': 1720659460,
 'input_file_id': 'file-c3HDcZ64LoXwvoAdUVxYk268',
 'metadata': {'description': 'STJ_3911'},
 'object': 'batch',
 'output_file_id': 'file-gyS32c40e3WfbMxbMDMebuLS',
 'request_counts': BatchRequestCounts(completed=3911, failed=0, total=3911),
 'status': 'completed'}


In [31]:
def retrieve_batch():
    """Download the results of batch ``BATCH_ID`` and save them to ``OUTPUT``.

    When the batch has no output file id, the batch object is printed and
    nothing is written. Otherwise the output file's text is printed and written
    to ``OUTPUT`` as UTF-8, replacing any existing file.
    """
    batch = check_batch(BATCH_ID)
    file_id = batch.output_file_id

    # No output file to fetch: show the batch object for inspection and stop.
    if not file_id:
        print(batch)
        return

    body = client.files.content(file_id)

    print(body.text)
    with open(OUTPUT, "w", encoding="utf-8") as out:
        out.write(body.text)


# Prints the complete result text and overwrites OUTPUT when an output file exists.
retrieve_batch()

{"id": "batch_req_FUKR7Ci3qNl8NrasY75acvuO", "custom_id": "0000", "response": {"status_code": 200, "request_id": "2c8f6204a1e09587e0cc4064b510a096", "body": {"id": "chatcmpl-9jcPxsYKZhv9zP2EyboIiMaDQBRsh", "object": "chat.completion", "created": 1720659465, "model": "gpt-4o-2024-05-13", "choices": [{"index": 0, "message": {"role": "assistant", "content": "```json\n{\n    \"resultado\": \"NEGADO\",\n    \"online\": \"SIM\",\n    \"unanimidade\": \"SIM\",\n    \"modificativos\": null,\n    \"conhecer\": null,\n    \"aFavor\": \"Benedito, Gurgel, Helena, Relator, S\u00e9rgio\",\n    \"vencidos\": null,\n    \"votoVista\": null,\n    \"lavrara\": null\n}\n```"}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 670, "completion_tokens": 92, "total_tokens": 762}, "system_fingerprint": "fp_298125635f"}}, "error": null}
{"id": "batch_req_Psx3NZHVm586Kf8T1mkFZFua", "custom_id": "0001", "response": {"status_code": 200, "request_id": "a81ed39478f22b8653aabd5fa57e6246", "bod

In [37]:
import json


def check_costs(
    output_file: str,
    input_price: float = 5,
    output_price: float = 15,
    batch_discount: float = 0.5,
):
    """Print the cost of a batch run, computed from its JSONL output file.

    Each non-blank line is parsed as JSON and the token counts found under
    ``response -> body -> usage`` are summed. Blank lines and records without
    usage data (for example a record whose ``response`` is null) are skipped
    instead of raising.

    Args:
        output_file: Path to the batch output JSONL file.
        input_price: Price per 1,000,000 prompt tokens. The default of 5 is the
            value previously hard-coded here — verify against current pricing.
        output_price: Price per 1,000,000 completion tokens. The default of 15
            is the value previously hard-coded here — verify against current
            pricing.
        batch_discount: Multiplier applied to both costs. The default of 0.5
            reproduces the previous division by 2 (presumably the Batch API
            discount — confirm).

    Returns:
        None. The result is printed as
        ``Actual cost: <input> + <output> = $<total> USD``.
    """
    # Sum integer token counts and convert to currency once at the end, which
    # avoids accumulating floating-point rounding error line by line.
    prompt_tokens = 0
    completion_tokens = 0

    with open(output_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines

            record = json.loads(line)
            response = record.get("response")
            body = response.get("body") if response else None
            usage = body.get("usage") if body else None
            if not usage:
                continue  # nothing to bill for this record

            prompt_tokens += usage["prompt_tokens"]
            completion_tokens += usage["completion_tokens"]

    input_cost = prompt_tokens * input_price / 1_000_000 * batch_discount
    output_cost = completion_tokens * output_price / 1_000_000 * batch_discount

    print(
        f"Actual cost: {input_cost:.2f} + {output_cost:.2f} = ${input_cost + output_cost:.2f} USD"
    )


# Reads the saved results file, so `retrieve_batch` must have written OUTPUT first.
check_costs(OUTPUT)

Actual cost: 6.58 + 2.97 = $9.55 USD


In [39]:
def check_errors(output_file: str):
    """Report the records in a batch output JSONL file that carry an error.

    For every record whose ``error`` field is present and not null, the
    record's ``id`` and the error value are printed. A final ``Errors: <n>``
    line gives the total. Blank lines are skipped.

    Args:
        output_file: Path to the batch output JSONL file.

    Returns:
        None. The report is printed.
    """
    errors = 0

    with open(output_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines

            record = json.loads(line)
            error = record.get("error")
            # Identity comparison is the idiomatic (and safe) way to test for None.
            if error is not None:
                errors += 1
                print(record.get("id"))
                print(error)

    print(f"Errors: {errors}")


# Scans the saved results file for per-request errors.
check_errors(OUTPUT)

Errors: 0
