# Batch Requests with OpenAI API

This notebook demonstrates how to efficiently process multiple prompts at once using OpenAI's Batch API.  
  
**Note**: Batch responses may take up to 24 hours to be processed by OpenAI.
  
Make sure to run the following cell before any other cell.

In [None]:
import os
import csv
import json
from openai import OpenAI
from datetime import datetime

def convert_timestamp(timestamp):
    """Turn a Unix timestamp into a local-time ``datetime``; ``None`` is passed through as ``None``."""
    if timestamp is None:
        return None
    return datetime.fromtimestamp(timestamp)

# Read the API key from the OPENAI_API_KEY environment variable (None when unset)
# and build the client object that the remaining cells call.
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

### **Step 1: Prepare JSONL File**
To use OpenAI's Batch API, prompts must be provided in a JSONL (JSON Lines) format.  
Assuming you have a CSV file that includes two columns:  
- prompts 
- IDs corresponding to each prompt (needed to retrieve its response later)  

CSV file content should be structured as follows:
```
IDs,Prompts
request-1,"Once upon a time,"
request-2,"Translate Good morning to French."
request-3,"Write a poem about circles."
```
then use the following code to prepare a `JSONL` file.

In [None]:
# Path to the CSV file containing the prompts (columns: IDs, Prompts)
prompts_csv_file = './prompts.csv'
# Derive the JSONL path by swapping only the final extension. A plain
# str.replace('.csv', ...) would also rewrite '.csv' elsewhere in the path and,
# for a path without '.csv', would return the input path itself, so the
# conversion would overwrite the source file.
prompts_jsonl_file = os.path.splitext(prompts_csv_file)[0] + '.jsonl'

# Model configuration (each value is copied into every request body)
model = "gpt-4-turbo"
max_tokens = 10
temperature = 1
system_message = "You are a helpful assistant."

def csv_to_jsonl(csv_file, jsonl_file, model, max_tokens=4096, temperature=1, system="You are a helpful assistant."):
    """Convert a CSV file of prompts to a JSON Lines (JSONL) file for OpenAI batch requests.

    Args:
        csv_file: Path to a CSV file with a header row containing ``IDs`` and ``Prompts``.
        jsonl_file: Path of the JSONL file to write (overwritten if it exists).
        model: Model name placed in every request body.
        max_tokens: ``max_tokens`` value placed in every request body.
        temperature: ``temperature`` value placed in every request body.
        system: System message placed before each prompt.

    Raises:
        KeyError: If a row has no ``IDs`` or ``Prompts`` column.
    """
    # 'utf-8-sig' strips a leading byte-order mark when present and otherwise
    # behaves like 'utf-8'. With plain 'utf-8' a BOM becomes part of the first
    # header name, so row['IDs'] fails on such files.
    with open(csv_file, 'r', newline='', encoding='utf-8-sig') as csvfile, \
            open(jsonl_file, 'w', encoding='utf-8') as jsonlfile:
        for row in csv.DictReader(csvfile):
            data = construct_json_line(row['IDs'], model, row['Prompts'], system, max_tokens, temperature)
            jsonlfile.write(json.dumps(data) + '\n')

def construct_json_line(custom_id, model, user_message, system_message, max_tokens, temperature):
    """Build the dict for one batch line: a POST to the chat-completions endpoint.

    The conversation holds the system message followed by the user message.
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    request_body = {
        "model": model,
        "messages": conversation,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

# Write the JSONL request file from the CSV prompts
csv_to_jsonl(
    prompts_csv_file,
    prompts_jsonl_file,
    model,
    max_tokens=max_tokens,
    temperature=temperature,
    system=system_message,
)

#### <u>**Optional**</u>:
If the prompts in the JSONL file exceed the maximum number of tokens per day (for instance, 900,000 TPD for tier 1), they can be split into multiple JSONL files and then batched sequentially.  
  
If this error is encountered for example:  
```
BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached ...
```
in this case, the following code can be used to split the JSONL file.  
  
Specify `lines_per_file` based on the number of tokens in your prompts.

In [None]:
# Settings for splitting: source JSONL file, destination directory, chunk size
prompts_jsonl_file = "./prompts.jsonl"
splitted_prompts_dir = './Batch_Prompts'        # Directory where JSONL files will be stored
lines_per_file = 1000                           # Adjust as needed

def split_file(input_file_path, output_dir, max_lines_per_file):
    """Split a text file into numbered files of at most ``max_lines_per_file`` lines.

    Output files are named ``prompts-batch_1.jsonl``, ``prompts-batch_2.jsonl``, ...
    and written to ``output_dir``, which is created if it does not exist.

    Args:
        input_file_path: Path of the file to split.
        output_dir: Directory that receives the chunk files.
        max_lines_per_file: Maximum number of lines per chunk; must be >= 1.

    Raises:
        ValueError: If ``max_lines_per_file`` is smaller than 1.
    """
    # Reject non-positive sizes up front: zero used to raise ZeroDivisionError
    # and a negative value silently produced no files.
    if max_lines_per_file < 1:
        raise ValueError("max_lines_per_file must be a positive integer.")

    os.makedirs(output_dir, exist_ok=True)        # Ensure the output directory exists, create if not
    # Explicit UTF-8 so the result does not depend on the platform's default encoding
    with open(input_file_path, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    # Step through the lines one chunk at a time; chunk files are numbered from 1
    chunk_starts = range(0, len(lines), max_lines_per_file)
    for file_number, start_index in enumerate(chunk_starts, start=1):
        output_path = os.path.join(output_dir, f"prompts-batch_{file_number}.jsonl")
        with open(output_path, "w", encoding="utf-8") as outfile:
            outfile.writelines(lines[start_index:start_index + max_lines_per_file])

# Split the JSONL request file into chunks of `lines_per_file` lines
split_file(
    input_file_path=prompts_jsonl_file,
    output_dir=splitted_prompts_dir,
    max_lines_per_file=lines_per_file,
)

### **Step 2: Upload JSONL File(s) to OpenAI Account**

There are two options to upload a file:

1. **Via OpenAI Account**: Follow these steps:
    - In OpenAI account dashboard:
    - Navigate to [Storage](https://platform.openai.com/storage) section on the left sidebar.
    - Click on **+ Upload** button at the top right.
    - Select the file(s) and click on **Upload**.
2. **Using Python Script**: Execute the following code:

In [None]:
# Every file in this directory whose name ends with ".jsonl" is uploaded by the loop below.
batch_prompts_dir = "."       # Change it to the directory containing the JSONL file(s)

def upload_file(file_path):
    """Upload one JSONL file to OpenAI with purpose "batch".

    Returns the response object from ``client.files.create``. If opening or
    uploading the file raises, the error is printed and ``None`` is returned.
    """
    try:
        with open(file_path, 'rb') as jsonl_handle:
            return client.files.create(file=jsonl_handle, purpose="batch")
    except Exception as err:
        print(err)
        return None

# Get the list of JSONL files in the directory.
# os.listdir() returns names in arbitrary order, so sort them to make the
# upload sequence (and therefore the final `input_file_id`) reproducible.
jsonl_files = sorted(file for file in os.listdir(batch_prompts_dir) if file.endswith(".jsonl"))

# File name -> uploaded file ID, so that no ID is lost when several files are uploaded
uploaded_file_ids = {}

# Upload each JSONL file
for jsonl_file in jsonl_files:
    jsonl_batch_file_path = os.path.join(batch_prompts_dir, jsonl_file)
    file_upload_response = upload_file(jsonl_batch_file_path)
    if not file_upload_response:
        continue        # Upload failed; upload_file has already printed the error
    # `input_file_id` always holds the ID of the most recent successful upload
    input_file_id = file_upload_response.id
    uploaded_file_ids[jsonl_file] = input_file_id
    print(f"File ID for {jsonl_file}: {input_file_id}")

#### Display List of Files in OpenAI Account
This includes uploaded files, as well as output files generated by OpenAI

In [None]:
# Print one summary line per file returned by the files listing.
try:
    list_files = client.files.list(
        # purpose="batch"       # Specify purpose of file (batch, fine-tuning, assistant, etc).
    )
    for file_data in list_files.data:
        print(f"File: {file_data.id} | Purpose: {file_data.purpose:<12} | Created at: {convert_timestamp(file_data.created_at)} | File Name: {file_data.filename}")
except Exception as e:
    print(e)

#### Delete a File
You can delete a file by passing its file ID

In [None]:
# ID of the file to delete (replace the placeholder with a real file ID)
delete_file_id = "file-xxxxxxxxxxxxxxxxxxxxxxxx"

# Request the deletion and print either the API response or the error
try:
    response = client.files.delete(delete_file_id)
except Exception as e:
    print(e)
else:
    print(response)

### **Step 3: Create a Batch**

(If you have multiple batch files to process, execute them sequentially to avoid reaching the Max Tokens per Day limit.)  
  
Batch requests can be created:

1. **Via OpenAI Account**: Follow these steps:
    - Access OpenAI account dashboard.
    - Navigate to [Batch](https://platform.openai.com/batch) section on the left sidebar.
    - Click on **+ Create** button at the top right.
    - Upload the file and click on **Create**.
2. **Using Python Script**: Execute the following code:

In [None]:
# input_file_id = "file-xxxxxxxxxxxxxxxxxxxxxxxx"

def create_batch(input_file_id, client):
    """Submit a chat-completions batch for an uploaded file.

    Returns the object from ``client.batches.create``. If the call raises,
    the error is printed and ``None`` is returned.
    """
    request_options = {
        "input_file_id": input_file_id,
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
    }
    try:
        return client.batches.create(**request_options)
    except Exception as err:
        print(err)
        return None

# Submit the batch for the file referenced by `input_file_id`
create_batch_response = create_batch(input_file_id, client)

# Report the input file name, the batch status and the batch ID
if create_batch_response:
    try:
        file_name = client.files.retrieve(input_file_id).filename
    except Exception as e:
        print(f"Error retrieving file name: {e}")
    else:
        print("Batch file name:", file_name)

    print("Batch status:", create_batch_response.status)

    # Keep the ID in `batch_id` so it is available after this cell has run
    if create_batch_response.id:
        batch_id = create_batch_response.id
        print("Batch ID:", batch_id)

### **Step 4: Monitor and Retrieve Batch Information**

Two options to monitor a batch request:

1. **Via OpenAI Account**: Follow these steps:
    - Access OpenAI account dashboard.
    - Navigate to [Batch](https://platform.openai.com/batch) section on the left sidebar.
    - Click on the desired batch ID (e.g., `batch_xxxxxxxxxxxxxxxxxxxxxxxx`).
    - Check the status.
2. **Using Python Script**: Execute the following code:

In [None]:
# batch_id = "batch_xxxxxxxxxxxxxxxxxxxxxxxx"

def check_batch(batch_id):
    """Fetch the batch object for ``batch_id``; on any error, print it and return ``None``."""
    batch_info = None
    try:
        batch_info = client.batches.retrieve(batch_id)
    except Exception as err:
        print(err)
    return batch_info

def print_label_and_timestamp(label, timestamp):
    """Print ``<label>: <converted timestamp>``; print nothing when ``timestamp`` is ``None``."""
    if timestamp is None:
        return
    print(label + ":", convert_timestamp(timestamp))

# Look up the batch referenced by `batch_id`
check_batch_response = check_batch(batch_id)

# Report status and request counts, then either the errors or the event timestamps
if check_batch_response:
    print("Batch status:", check_batch_response.status)
    request_counts = check_batch_response.request_counts
    print(f"Completed: {request_counts.completed:<6}| Failed: {request_counts.failed:<6}| Total: {request_counts.total:<6}\n")

    if not check_batch_response.errors:
        # Timestamps that are None are skipped by print_label_and_timestamp
        event_names = ['created_at', 'expires_at', 'completed_at', 'expired_at', 'failed_at']
        for event_name in event_names:
            print_label_and_timestamp(
                f"Batch {event_name.replace('_', ' ')}",
                getattr(check_batch_response, event_name),
            )
    else:
        print("Batch error:", check_batch_response.errors)

#### Display List of Batch Requests

In [None]:
# Print one summary line per batch, including the name of its input file when it can be retrieved.
try:
    list_batches = client.batches.list(
        limit=20    # Number of batches to display
    )
    for batch_data in list_batches.data:
        try:
            input_file_name = client.files.retrieve(batch_data.input_file_id).filename
        except Exception:
            # The file lookup failed; show a placeholder instead of the name
            input_file_name = 'File is not found'
        print(f"Batch: {batch_data.id} | Status: {batch_data.status:<11} | Created at: {convert_timestamp(batch_data.created_at)} | Input File: {input_file_name}")
except Exception as e:
    print(e)

#### **<u>Cancel</u> a Batch**
To cancel a batch:
1. **Via OpenAI Account**: Follow these steps:
    - Access OpenAI account dashboard.
    - Navigate to [Batch](https://platform.openai.com/batch) section on the left sidebar.
    - Click on the desired batch ID (e.g., `batch_xxxxxxxxxxxxxxxxxxxxxxxx`).
    - Click on `Cancel`.
2. **Using Python Script**: Execute the following code:

In [None]:
# ID of the batch to cancel (replace the placeholder with a real batch ID).
# It is kept in its own variable so that running this cell does not overwrite
# the `batch_id` that the monitoring and retrieval cells read.
cancel_batch_id = "batch_xxxxxxxxxxxxxxxxxxxxxxxx"

def cancel_batch(batch_id):
    """Cancel the batch with the given ID.

    Returns the response from ``client.batches.cancel``. If the call raises,
    the error is printed and ``None`` is returned.
    """
    try:
        return client.batches.cancel(batch_id)
    except Exception as e:
        print(e)
        return None


cancel_batch_response = cancel_batch(cancel_batch_id)

# Report the resulting status, then either the errors or the cancellation time
if cancel_batch_response:
    print("Batch status:", cancel_batch_response.status)
    if cancel_batch_response.errors:
        print("Batch error:", cancel_batch_response.errors)
    elif cancel_batch_response.cancelled_at is not None:
        print("Batch canceled at:", convert_timestamp(cancel_batch_response.cancelled_at))

### **Step 5: Retrieve Output File Content**

Again there are two options to retrieve batch responses as a JSONL file:

1. **Via OpenAI Account**: Follow these steps:
    - Access OpenAI account dashboard.
    - Navigate to [Storage](https://platform.openai.com/storage) section on the left sidebar.
    - Click on the JSONL file name with the desired batch ID (e.g., `batch_xxxxxxxxxxxxxxxxxxxxxxxx_output.jsonl`).
    - Download the JSONL file
2. **Using Python Script**: Execute the following code:

In [None]:
import re

# Where the downloaded batch output is saved
responses_file_name = "responses.jsonl"    # Desired name of the responses file
response_files_dir = './Responses'         # Directory where the JSONL file(s) will be stored
# batch_id = "batch_xxxxxxxxxxxxxxxxxxxxxxxx"

def check_batch(batch_id):
    """Fetch the batch object for ``batch_id``; on any error, print it and return ``None``."""
    batch_info = None
    try:
        batch_info = client.batches.retrieve(batch_id)
    except Exception as err:
        print(err)
    return batch_info

def extract_batch_num(file_name):
    """Return the last run of digits in ``file_name`` as an int, or ``None`` if there is none."""
    digit_runs = re.findall(r'\d+', file_name)
    if not digit_runs:
        return None
    try:
        return int(digit_runs[-1])
    except ValueError:
        return None

check_batch_response = check_batch(batch_id)        # Retrieve batch information

if check_batch_response:
    print("Batch status:", check_batch_response.status)

    # The input file name is only used to derive a batch number for the output
    # file name, so a failed lookup is reported and otherwise ignored.
    try:
        input_file_name = client.files.retrieve(check_batch_response.input_file_id).filename
        print("Related to input file name:", input_file_name, "\n")
    except Exception:
        input_file_name = ""
        print("Input file is not found in files list in OpenAI\n")

    if check_batch_response.status == "completed" and check_batch_response.output_file_id:
        output_file_id = check_batch_response.output_file_id
        print("Output file ID:", output_file_id)

        os.makedirs(response_files_dir, exist_ok=True)   # Ensure the output directory exists, create if not

        batch_output_content = client.files.content(output_file_id)     # Retrieve content of the output file

        batch_num = extract_batch_num(input_file_name)      # Extract batch number from the file name if possible

        # Define output file name. splitext() removes only the final extension,
        # so a name that contains additional dots keeps its full stem.
        if batch_num is not None:
            responses_file_name = f"{os.path.splitext(responses_file_name)[0]}-batch_{batch_num}.jsonl"

        output_file_path = os.path.join(response_files_dir, responses_file_name)

        batch_output_content.write_to_file(output_file_path)        # Write output file content to disk
        print("Content of batch response retrieved successfully:", responses_file_name)
    else:
        # Say why nothing was downloaded instead of finishing silently
        print("No output file to download: the batch is not completed or has no output file.")

#### <u>**Optional**</u>:  
Combine all JSONL files into a single file, if the prompts were split and sent in multiple batches earlier.

In [None]:
import re

# Inputs for combining the per-batch response files
response_files_dir = './Responses'          # Directory containing the JSONL files
# First file in the sequence; expected sequence format: 'responses-batch_{number}.jsonl'
first_file = "responses-batch_1.jsonl"
responses_jsonl_file = './Responses/responses.jsonl' # Desired output JSONL file path

def extract_prefix_before_last_num(text):
    """Return the part of ``text`` that precedes its last run of digits.

    Example: ``'responses-batch_1.jsonl'`` -> ``'responses-batch_'``.

    Raises:
        ValueError: If ``text`` contains no digits.
    """
    digit_runs = list(re.finditer(r'\d+', text))     # Every run of digits, with positions
    if not digit_runs:
        raise ValueError("No number found in the input text.")
    # Cut at the position where the last run starts. Converting the run to int
    # and searching for it again would drop leading zeros, so 'batch_01' would
    # yield the prefix 'batch_0' instead of 'batch_'.
    return text[:digit_runs[-1].start()]

def combine_jsonl_files(response_files_dir, output_file, first_file):
    """Concatenate the batch response files of one sequence into a single JSONL file.

    Files are selected from ``response_files_dir`` when their name starts with
    the prefix of ``first_file`` (everything before its last number) and ends
    with ``.jsonl``. They are merged in ascending order of the number that
    follows the prefix.

    Args:
        response_files_dir: Directory containing the per-batch JSONL files.
        output_file: Path of the combined JSONL file (overwritten if it exists).
        first_file: Name of one file in the sequence, e.g. ``responses-batch_1.jsonl``.

    Raises:
        ValueError: If ``first_file`` is empty or contains no number.
        FileNotFoundError: If the directory does not exist or is empty.
    """
    if not first_file:
        raise ValueError("The 'first_file' argument must be provided to determine the prefix.")
    file_prefix = extract_prefix_before_last_num(first_file)
    if not os.path.isdir(response_files_dir):
        raise FileNotFoundError(f"Input directory '{response_files_dir}' does not exist.")
    # List the directory once, before the output file is created
    dir_entries = os.listdir(response_files_dir)
    if not dir_entries:
        raise FileNotFoundError(f"Input directory '{response_files_dir}' is empty.")

    output_abs_path = os.path.abspath(output_file)
    number_after_prefix = re.compile(re.escape(file_prefix) + r"(\d+)")

    def batch_order(file_name):
        """Sort key: numbered files by number, then any other matching names alphabetically."""
        match = number_after_prefix.match(file_name)
        if match:
            return (0, int(match.group(1)), file_name)
        return (1, 0, file_name)

    # os.listdir() order is arbitrary, so sort explicitly; numeric ordering
    # places batch 2 before batch 10. The output file is excluded so that it
    # is never read as one of its own inputs.
    batch_files = sorted(
        (
            name for name in dir_entries
            if name.startswith(file_prefix)
            and name.endswith(".jsonl")
            and os.path.abspath(os.path.join(response_files_dir, name)) != output_abs_path
        ),
        key=batch_order,
    )

    with open(output_file, "w", encoding="utf-8") as out_jsonl:
        for file_name in batch_files:
            file_path = os.path.join(response_files_dir, file_name)
            with open(file_path, "r", encoding="utf-8") as in_jsonl:
                for line in in_jsonl:
                    # A last line without a newline would otherwise be fused
                    # with the first line of the next file
                    out_jsonl.write(line if line.endswith("\n") else line + "\n")
    print("Responses are now combined in one JSONL file.")

# Merge the per-batch response files into `responses_jsonl_file`
combine_jsonl_files(
    response_files_dir=response_files_dir,
    output_file=responses_jsonl_file,
    first_file=first_file,
)

### **Extract Batch Responses from JSONL File to a CSV File**

In [None]:
prompts_jsonl_file = './prompts.jsonl'
responses_jsonl_file = "./Responses/responses.jsonl"
responses_csv_file = "./Responses/extracted_responses.csv"

# Dictionary to store custom IDs and corresponding message content
custom_id_to_message = {}

# Read data from the output JSONL file and populate the dictionary
with open(responses_jsonl_file, "r", encoding="utf-8") as jsonlfile:
    for line in jsonlfile:
        line = line.strip()
        if not line:
            continue        # Skip blank lines; json.loads("") would raise
        data = json.loads(line)
        custom_id = data.get("custom_id", "")
        # "response", "body", "choices" and "message" may each be missing or
        # null in a record; treat any of those as "no response" instead of raising.
        response_body = (data.get("response") or {}).get("body") or {}
        choices = response_body.get("choices") or []

        if choices:
            message = choices[0].get("message") or {}
            custom_id_to_message[custom_id] = message.get("content") or ""
        else:
            print(f"No response found for custom ID: {custom_id}")

# Write data to the CSV file in the order of custom IDs from the input JSONL file
with open(responses_csv_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Custom ID", "Response"])

    with open(prompts_jsonl_file, "r", encoding="utf-8") as input_jsonlfile:
        for line in input_jsonlfile:
            line = line.strip()
            if not line:
                continue    # Skip blank lines in the prompts file as well
            data = json.loads(line)
            custom_id = data.get("custom_id", "")
            # Prompts without a stored response get an empty cell
            message_content = custom_id_to_message.get(custom_id, "")
            writer.writerow([custom_id, message_content])
            if not message_content:
                print(f"No message content found for custom ID: {custom_id}")

print("Responses to prompts have been saved to a CSV file.")