# Batch Requests with OpenAI API

This notebook demonstrates how to efficiently process multiple prompts at once using OpenAI's Batch API.  
  
**Note**: Batch responses may take up to 24 hours to be processed by OpenAI.
  
  
### **Step 1: Prepare JSONL File**
To use OpenAI's Batch API, you need to provide prompts in a JSONL (JSON Lines) format.  
Assuming you have a CSV file that includes two columns:  
- prompts 
- IDs corresponding to each prompt (needed to retrieve its response later)  

CSV file content should be structured as follows:
```
IDs,Prompts
request-1,"Once upon a time,"
request-2,"Translate Good morning to French."
request-3,"Write a poem about circles."
```
Then use the following code to create a `JSONL` file (the Batch API accepts only JSONL files).

In [1]:
import csv
import json
import os

# Path to your CSV file
csv_file_name = 'prompts.csv'
csv_file_directory = '.'

# Constructing the file paths
csv_file = os.path.join(csv_file_directory, csv_file_name)
# Swap only the extension: str.replace('csv', 'jsonl') would also rewrite any
# other "csv" substring in the name (e.g. 'csv_prompts.csv'), and would leave
# a name without a '.csv' extension unchanged, making input and output collide.
input_jsonl_file = os.path.join(csv_file_directory, os.path.splitext(csv_file_name)[0] + '.jsonl')

# Model configuration (passed to csv_to_jsonl for every request)
model = "gpt-4-turbo"
max_tokens = 20
temperature = 1
system_message = "You are a helpful assistant."

def csv_to_jsonl(csv_file, jsonl_file, model, max_tokens=4096, temperature=1, system="You are a helpful assistant."):
    """Write one Batch API request line to ``jsonl_file`` per row of ``csv_file``.

    The CSV is read with ``csv.DictReader``. Each row's ``IDs`` value becomes
    the request's custom ID and its ``Prompts`` value becomes the user
    message; ``model``, ``system``, ``max_tokens`` and ``temperature`` are
    forwarded unchanged to ``construct_json_line``.
    """
    with open(csv_file, 'r', newline='', encoding='utf-8') as source, \
            open(jsonl_file, 'w', encoding='utf-8') as target:
        for record in csv.DictReader(source):
            request = construct_json_line(
                record['IDs'], model, record['Prompts'], system, max_tokens, temperature
            )
            # One JSON document per line, as the JSONL format requires
            target.write(json.dumps(request) + '\n')

def construct_json_line(custom_id, model, user_message, system_message, max_tokens, temperature):
    """Build the request dictionary for a single chat-completion batch entry.

    The result targets ``POST /v1/chat/completions`` and carries a two-message
    conversation (system message first, then the user message).
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    request_body = {
        "model": model,
        "messages": conversation,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

# Run the conversion with the paths and model settings configured in this cell
csv_to_jsonl(
    csv_file,
    input_jsonl_file,
    model,
    max_tokens=max_tokens,
    temperature=temperature,
    system=system_message,
)


#### <u>**Optional**</u>:
If the prompts in your JSONL file exceed the maximum number of tokens per day (900,000 TPD for tier 1), you can split the JSONL file into multiple files and batch them sequentially.  
  
If you receive this error for example:  
```
BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached ...
```
In this case, you can use the following code to split the JSONL file.  
  
Specify `lines_per_file` based on the number of tokens in your prompts.

In [3]:
import os

# Maximum number of request lines written to each split file
lines_per_file = 1    # Adjust as needed
# JSONL file to split
input_jsonl_file = './prompts.jsonl'
# Directory that receives the split files (passed to split_file below)
splitted_files_directory = "./Batch_Prompts"

def split_file(input_file_path, output_directory, lines_per_file):
    """Split a text file into numbered chunks of at most ``lines_per_file`` lines.

    Chunks are written to ``output_directory`` (created if it does not exist)
    as ``prompts-batch_1.jsonl``, ``prompts-batch_2.jsonl``, ... in input
    order. An empty input file produces no output files.

    Args:
        input_file_path: Path of the file to split.
        output_directory: Directory that receives the chunk files.
        lines_per_file: Maximum number of lines per chunk; must be >= 1.

    Raises:
        ValueError: If ``lines_per_file`` is smaller than 1.
    """
    # Zero would divide by zero and a negative value would silently write
    # nothing, so reject both up front with a clear message.
    if lines_per_file < 1:
        raise ValueError("lines_per_file must be at least 1")

    os.makedirs(output_directory, exist_ok=True)

    # Read and write explicitly as UTF-8 (the encoding csv_to_jsonl writes
    # with) instead of relying on the platform default encoding.
    with open(input_file_path, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    # Step through the lines one chunk at a time; batch numbers start at 1.
    for batch_number, start in enumerate(range(0, len(lines), lines_per_file), start=1):
        output_path = os.path.join(output_directory, f"prompts-batch_{batch_number}.jsonl")
        with open(output_path, "w", encoding="utf-8") as outfile:
            outfile.writelines(lines[start:start + lines_per_file])

# Split the JSONL file using the settings configured at the top of this cell
split_file(
    input_file_path=input_jsonl_file,
    output_directory=splitted_files_directory,
    lines_per_file=lines_per_file,
)

### **Step 2: Upload JSONL File(s)**

There are two options to upload a file:

1. **Via OpenAI Account**: Follow these steps:
    - In OpenAI account dashboard:
    - Navigate to [Storage](https://platform.openai.com/storage) section on the left sidebar.
    - Click on **+ Upload** button at the top right.
    - Select the file(s) and click on **Upload**.
2. **Using Python Script**: Execute the following code:

In [3]:
import os
from openai import OpenAI

# Read the API key from the environment so it never appears in the notebook
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

# Directory scanned for JSONL files to upload:
#   "."               -> if you have one JSONL file
#   "./Batch_Prompts" -> if you have multiple JSONL files
batch_files_directory = "."

def upload_file(file_path):
    """Upload ``file_path`` to OpenAI with purpose ``"batch"``.

    Returns the object produced by ``client.files.create`` (callers read the
    file ID from its ``id`` attribute). If anything fails, including opening
    the file, the exception is printed and ``None`` is returned.
    """
    try:
        with open(file_path, 'rb') as handle:
            return client.files.create(file=handle, purpose="batch")
    except Exception as error:
        print(error)
    return None

# Collect the names of the JSONL files found in the chosen directory
jsonl_files = [name for name in os.listdir(batch_files_directory) if name.endswith(".jsonl")]

# Upload the files one by one and report the ID assigned to each.
# NOTE: `input_file_id` is overwritten on every iteration, so after the loop
# it holds the ID of the last file that was uploaded successfully.
for jsonl_file in jsonl_files:
    jsonl_batch_file_path = os.path.join(batch_files_directory, jsonl_file)
    response = upload_file(jsonl_batch_file_path)
    if not response:
        continue    # upload_file already printed the error
    try:
        input_file_id = response.id
        print(f"File ID for {jsonl_file}: {input_file_id}")
    except Exception as e:
        print(e)

File ID for prompts.jsonl: file-YqKy1G5WWVIwXngvhqH4Bc7v


#### Display List of Files in OpenAI Account
This includes uploaded files, as well as output files generated by OpenAI

In [5]:
# List of available files 
from datetime import datetime

def convert_timestamp(timestamp):
    """Turn a Unix timestamp into a local-time ``datetime``; ``None`` passes through."""
    if timestamp is None:
        return None
    return datetime.fromtimestamp(timestamp)

# Fetch the files stored in the OpenAI account and print one line per file.
try:
    list_files = client.files.list(
        # purpose="batch"       # specify purpose of file (batch, fine-tuning, assistant, etc).
    )
except Exception as e:
    print(e)
else:
    # Print only when the listing succeeded. Running the loop after a failed
    # call would raise NameError on `list_files` (or silently print a stale
    # listing left over from an earlier run of this cell).
    for file_data in list_files.data:
        print(f"File: {file_data.id:<15} | Purpose: {file_data.purpose:<12} | Created at: {convert_timestamp(file_data.created_at)} | File Name: {file_data.filename:<15}")

File: file-YqKy1G5WWVIwXngvhqH4Bc7v | Purpose: batch        | Created at: 2024-04-28 09:28:24 | File Name: prompts.jsonl  


#### Delete a File
You can delete a file by passing its file ID

In [None]:
# Delete a file from the OpenAI account.
# Replace the placeholder below with the ID of the file to remove.
delete_file_id = "file-xxxxxxxxxxxxxxxxxxxxxxxx"

try:
    response = client.files.delete(delete_file_id)
    print(response)     # show the object returned by the delete call
except Exception as e:
    # Report the failure instead of stopping the notebook
    print(e) 

### **Step 3: Create a Batch**

(If you have multiple batch files to process, execute them sequentially to avoid reaching the Max Tokens per Day limit.)  
  
There are two options to create a batch request:

1. **Via OpenAI Account**: Follow these steps:
    - Access OpenAI account dashboard.
    - Navigate to [Batch](https://platform.openai.com/batch) section on the left sidebar.
    - Click on **+ Create** button at the top right.
    - Upload the file and click on **Create**.
2. **Using Python Script**: Execute the following code:

In [19]:
# input_file_id = "file-xxxxxxxxxxxxxxxxxxxxxxxx"

def create_batch(input_file_id, client):
    """Submit ``input_file_id`` as a chat-completions batch with a 24h window.

    Returns whatever ``client.batches.create`` returns. If the call raises,
    the exception is printed and ``None`` is returned instead.
    """
    request = {
        "input_file_id": input_file_id,
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
    }
    try:
        return client.batches.create(**request)
    except Exception as error:
        print(error)
    return None

# Execute the batch creation for `input_file_id` (set by the upload cell, or
# manually via the commented-out assignment at the top of this cell)
batch_response = create_batch(input_file_id, client)

# Display batch information (skipped when create_batch returned None)
if batch_response:
    try:
        # Look up the uploaded file's name so the output is easier to read
        file_name = client.files.retrieve(input_file_id).filename
        print("Batch file name:", file_name)
    except Exception as e:
        # A failed name lookup is reported but does not stop the status output
        print(f"Error retrieving file name: {e}")
    print("Batch status:", batch_response.status)
    if batch_response.id:
        # Keep the ID in `batch_id`; the monitoring cells read it
        batch_id = batch_response.id
        print("Batch ID:", batch_id)

Batch file name: prompts.jsonl
Batch status: validating
Batch ID: batch_CmmnPNarIuLiqqe031MwFMXR


### **Step 4: Monitor and Retrieve Batch Information**

Again you have two options to monitor a batch request:

1. **Via OpenAI Account**: Follow these steps:
    - Access OpenAI account dashboard.
    - Navigate to [Batch](https://platform.openai.com/batch) section on the left sidebar.
    - Click on the desired batch ID (e.g., `batch_xxxxxxxxxxxxxxxxxxxxxxxx`).
    - Check the status.
2. **Using Python Script**: Execute the following code:

In [20]:
from datetime import datetime

# batch_id = "batch_xxxxxxxxxxxxxxxxxxxxxxxx"

def check_batch(batch_id):
    """Fetch the batch ``batch_id``; print the error and return ``None`` on failure."""
    try:
        batch = client.batches.retrieve(batch_id)
    except Exception as error:
        print("An error occurred while retrieving batch:", error)
        batch = None
    return batch

def convert_timestamp(timestamp):
    """Return the local-time ``datetime`` for a Unix timestamp, or ``None`` if it is ``None``."""
    return None if timestamp is None else datetime.fromtimestamp(timestamp)

def print_label_and_timestamp(label, timestamp):
    """Print ``<label>: <datetime>`` for a Unix timestamp; print nothing when it is ``None``."""
    if timestamp is None:
        return
    print(label + ":", convert_timestamp(timestamp))

# Fetch the latest state of the batch
batch_response = check_batch(batch_id)

if batch_response:
    print("Batch status:", batch_response.status)
    request_counts = batch_response.request_counts
    print(f"Completed: {request_counts.completed:<6}| Failed: {request_counts.failed:<6}| Total: {request_counts.total:<6}\n")

    if not batch_response.errors:
        # No errors reported: show every lifecycle timestamp that is set
        event_names = ['created_at', 'expires_at', 'completed_at', 'expired_at', 'failed_at']
        for event_name in event_names:
            event_time = getattr(batch_response, event_name)
            print_label_and_timestamp(f"Batch {event_name.replace('_', ' ')}", event_time)
    else:
        print("Batch error:", batch_response.errors)

Batch status: in_progress
Completed: 0     | Failed: 0     | Total: 3     

Batch created at: 2024-04-28 09:46:45
Batch expires at: 2024-04-29 09:46:45


#### **<u>Cancel</u> a Batch**
Either:
1. **Via OpenAI Account**: Follow these steps:
    - Access OpenAI account dashboard.
    - Navigate to [Batch](https://platform.openai.com/batch) section on the left sidebar.
    - Click on the desired batch ID (e.g., `batch_xxxxxxxxxxxxxxxxxxxxxxxx`).
    - Click on `Cancel`.
  
or  
2. **Using Python Script**: Execute the following code:

In [None]:
# ID of the batch to cancel (replace the placeholder with a real batch ID)
batch_id = "batch_xxxxxxxxxxxxxxxxxxxxxxxx"

# Look the batch up first; check_batch returns None if retrieval fails
batch_response = check_batch(batch_id)

if batch_response:
    print("Batch status:", batch_response.status)
    if batch_response.errors:
        # A batch that already reports errors is shown but not cancelled
        print("Batch error:", batch_response.errors)
    else:
        cancel_response = client.batches.cancel(batch_id)
        # Print the cancellation time only when the response carries one
        if cancel_response.cancelled_at is not None:
            print("Batch canceled at:", convert_timestamp(cancel_response.cancelled_at))

### **Step 5: Retrieve Output File Content**

Again, there are two options to retrieve the output of a batch request:

1. **Via OpenAI Account**: Follow these steps:
    - Access OpenAI account dashboard.
    - Navigate to [Storage](https://platform.openai.com/storage) section on the left sidebar.
    - Click on the JSONL file name with the desired batch ID (e.g., `batch_xxxxxxxxxxxxxxxxxxxxxxxx_output.jsonl`).
    - Download the JSONL file
2. **Using Python Script**: Execute the following code:

In [None]:
import os

# Uncomment and set this to retrieve a specific batch; otherwise the
# `batch_id` already defined in the session is used.
# batch_id = "batch_xxxxxxxxxxxxxxxxxxxxxxxx"
# Directory and base file name for the downloaded output file
output_file_directory = "./Responses"
output_file_name = 'responses.jsonl'

def check_batch(batch_id):
    """Retrieve batch ``batch_id`` through the module-level ``client``.

    Any exception is reported on stdout and turned into a ``None`` result.
    """
    try:
        batch = client.batches.retrieve(batch_id)
    except Exception as error:
        print("An error occurred while retrieving batch:", error)
    else:
        return batch
    return None

def extract_batch_num(file_name):
    """Return the batch number encoded in ``file_name``, or ``None`` if there is none.

    Names ending in ``batch_<n>.jsonl`` (the pattern ``split_file`` writes,
    e.g. ``prompts-batch_3.jsonl``) are matched first, so extra underscores
    earlier in the name (``my_prompts-batch_3.jsonl``) no longer break the
    parsing. Any other name falls back to the original rule: the text between
    the first underscore and the following dot, if it is an integer.
    """
    import re   # local import keeps this cell self-contained

    suffix_match = re.search(r"batch_([0-9]+)\.jsonl$", file_name)
    if suffix_match:
        return int(suffix_match.group(1))

    # Fallback for names that do not follow the split_file convention
    try:
        return int(file_name.split('_')[1].split('.')[0])
    except (IndexError, ValueError):
        return None

response = check_batch(batch_id)        # Retrieve batch information

if response:
    print("Batch status:", response.status)
    # Look up the name of the input file this batch was created from
    input_file_name = client.files.retrieve(response.input_file_id).filename
    print("Related to input file name:", input_file_name, "\n")
        
    # Download only when the batch is completed and has an output file
    if response.status == "completed" and response.output_file_id:
        output_file_id = response.output_file_id
        print("Output file ID:", output_file_id)

        os.makedirs(output_file_directory, exist_ok=True)   # Ensure the output directory exists, create if not

        batch_output_content = client.files.content(output_file_id)     # Retrieve content of the output file

        batch_num = extract_batch_num(input_file_name)      # Extract batch number from the file name if possible

        # Define output file name: when the input file name carries a batch
        # number, append it as "<base>-batch_<n>.jsonl"
        if batch_num is not None:
            output_file_name = f"{output_file_name.split('.')[0]}-batch_{batch_num}.jsonl"

        output_file_path = os.path.join(output_file_directory, output_file_name)
        
        batch_output_content.write_to_file(output_file_path)        # Write output file content to disk
        print("Output file retreived successfully:", output_file_name)

#### <u>**Optional**</u>: Combine all JSONL files into a single file, if the prompts were split and sent in multiple batches earlier

In [None]:
import json
import os

# Directory containing the JSONL files
jsonl_directory = "./Responses"
output_file_name = 'responses.jsonl'


def load_batch_responses(directory, combined_name):
    """Return the parsed records of every per-batch response file in ``directory``.

    Per-batch files are named ``<stem>-batch_<n>.jsonl``, where ``<stem>`` is
    ``combined_name`` without its extension (the naming used when the output
    files are downloaded in this notebook). Files are read in ascending batch
    number so the combined order does not depend on ``os.listdir`` ordering.
    Blank lines are skipped.

    Args:
        directory: Directory that holds the per-batch response files.
        combined_name: Name of the combined file, e.g. ``responses.jsonl``.

    Returns:
        A list with one parsed JSON object per non-blank line.
    """
    # Match on the stem: the previous prefix "responses.jsonl-batch_" kept the
    # extension and therefore never matched "responses-batch_1.jsonl".
    prefix = f"{os.path.splitext(combined_name)[0]}-batch_"
    suffix = ".jsonl"

    def batch_order(name):
        # Numeric batch numbers sort first and by value (2 before 10);
        # anything else is appended afterwards in name order.
        number = name[len(prefix):-len(suffix)]
        if number.isascii() and number.isdigit():
            return (0, int(number), name)
        return (1, 0, name)

    batch_files = sorted(
        (name for name in os.listdir(directory)
         if name.startswith(prefix) and name.endswith(suffix)),
        key=batch_order,
    )

    records = []
    for name in batch_files:
        with open(os.path.join(directory, name), "r", encoding="utf-8") as jsonl_file:
            records.extend(json.loads(line) for line in jsonl_file if line.strip())
    return records


# Step 1: Combine responses from all JSONL files into one list
if os.path.isdir(jsonl_directory):
    combined_responses = load_batch_responses(jsonl_directory, output_file_name)
else:
    print(f"Directory not found: {jsonl_directory}")
    combined_responses = []

if combined_responses:
    print("Repsonses are now combined in one list.")
    print("Response of the first request:")
    print(combined_responses[0])
else:
    # Avoid an IndexError on combined_responses[0] when nothing was found
    print(f"No batch response files found in {jsonl_directory}.")

In [None]:
# Step 2: Write the combined responses to a single JSONL file, one JSON
# document per line
output_jsonl_file = "./Responses/responses.jsonl"
with open(output_jsonl_file, "w", encoding="utf-8") as outfile:
    for response in combined_responses:
        # print() appends the newline that terminates each JSONL record
        print(json.dumps(response), file=outfile)

print("Repsonses are now combined in one JSONL file.")

#### Extract Batch Responses from JSONL File to a CSV File

In [None]:
import json
import csv

# Input request file (defines the row order), batch output file, and CSV target
input_jsonl_file = './prompts.jsonl'
output_jsonl_file = "./Responses/responses.jsonl"
output_csv_file = "./Responses/extracted_responses.csv"

# Dictionary to store custom IDs and corresponding message content
custom_id_to_message = {}

# Read data from the output JSONL file and populate the dictionary
with open(output_jsonl_file, "r", encoding="utf-8") as jsonlfile:
    for line in jsonlfile:
        if not line.strip():
            continue    # tolerate blank lines (e.g. a trailing newline)
        data = json.loads(line)
        custom_id = data.get("custom_id", "")
        # "response" / "body" may be present but null (the Batch API docs show
        # `"response": null` for expired requests), so fall back with `or {}`
        # rather than relying on the .get() default, which only covers a
        # missing key.
        response_body = (data.get("response") or {}).get("body") or {}
        # Error bodies carry no "choices"; indexing [0] blindly would raise
        choices = response_body.get("choices") or []

        if choices:
            message = choices[0].get("message") or {}
            custom_id_to_message[custom_id] = message.get("content") or ""
        else:
            print(f"No response found for custom ID: {custom_id}")

# Write data to the CSV file in the order of custom IDs from the input JSONL file
with open(output_csv_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Custom ID", "Response"])

    with open(input_jsonl_file, "r", encoding="utf-8") as input_jsonlfile:
        for line in input_jsonlfile:
            if not line.strip():
                continue    # tolerate blank lines
            data = json.loads(line)
            custom_id = data.get("custom_id", "")
            message_content = custom_id_to_message.get(custom_id, "")
            writer.writerow([custom_id, message_content])
            if not message_content:
                print(f"No message content found for custom ID: {custom_id}")

print("Responses to prompts have been saved to a CSV file.")