## Import Utilities

In [None]:
import re
import os
import csv
import json
import pandas as pd
from datetime import datetime
from openai import OpenAI
import sys

sys.path.append('..')  # Add the parent directory of LLM_Evaluations to the Python path
from Utils.llm_evaluation_utils import load_responses_df, \
                        check_and_store_response,   \
                        build_question_prompt,      \
                        QUESTION_SETS

# Read the API key from the environment so it is never hard-coded in the notebook.
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

# Request settings; they are passed to csv_to_jsonl when the JSONL request file is built.
model_name = 'gpt-4o'
max_tokens = 200
temperature = 1
# Key into QUESTION_SETS selecting which question set is used.
# NOTE(review): 'ZS' presumably stands for zero-shot — confirm in Utils.llm_evaluation_utils.
question_type = 'ZS'
QUESTIONS = QUESTION_SETS[question_type]['QUESTIONS']

## Defined Functions

In [None]:
def convert_timestamp(timestamp):
    '''Return the local-time datetime for a Unix timestamp, or None when given None.'''
    if timestamp is None:
        return None
    return datetime.fromtimestamp(timestamp)

def retrieve_file_name(file_id, client: OpenAI):
    '''Look up the filename stored for `file_id`; print the error and return '' if the lookup fails.'''
    try:
        file_info = client.files.retrieve(file_id)
        return file_info.filename
    except Exception as e:
        print(f'Error retrieving file ID {file_id}:', e)
        return ''

def upload_file(file_path, client: OpenAI):
    '''
    Upload a JSONL file to OpenAI with purpose 'batch'.

    Args:
        file_path: Path of the local JSONL file to upload.
        client: OpenAI client used for the upload.

    Returns:
        The object returned by `client.files.create` (callers read its `.id`
        to get the uploaded file's ID), or None if the upload failed.
    '''
    try:
        with open(file_path, 'rb') as file:
            return client.files.create(file=file, purpose='batch')
    except Exception as e:
        # Best-effort by design: report which file failed and let the caller decide.
        print(f'Error uploading file {file_path}:', e)
        return None

def create_batch(input_file_id, client: OpenAI):
    '''Submit a chat-completions batch for an uploaded input file; return the batch object, or None on error.'''
    request = {
        'input_file_id': input_file_id,
        'endpoint': '/v1/chat/completions',
        'completion_window': '24h',
    }
    try:
        return client.batches.create(**request)
    except Exception as e:
        print(e)
        return None
    
def check_batch(batch_id, client: OpenAI):
    '''Fetch the batch object for `batch_id`; print the error and return None if the request fails.'''
    try:
        batch_info = client.batches.retrieve(batch_id)
    except Exception as e:
        print(e)
        return None
    return batch_info

def cancel_batch(batch_id, client: OpenAI):
    '''Request cancellation of `batch_id`; return the cancellation response, or None if the request fails.'''
    try:
        cancellation = client.batches.cancel(batch_id)
    except Exception as e:
        print(e)
        cancellation = None
    return cancellation
    
def extract_batch_num(filename):
    '''
    Extract the batch number from a filename.

    The batch number is the run of digits immediately before the file
    extension, e.g. `prompts-batch_3.jsonl` -> 3. Digits that appear earlier
    in the name (such as the `4` in `gpt-4.5-...`) are ignored.

    Args:
        filename: File name or path to inspect.

    Returns:
        The batch number as an int, or None when no digits directly precede
        the extension.
    '''
    # Anchor at the end of the string so only the final `<digits>.<ext>` counts;
    # an unanchored pattern would pick up the first `<digits>.` anywhere in the name.
    match = re.search(r'(\d+)\.[^./\\]*$', filename)
    return int(match.group(1)) if match else None

def prepare_response_file(input_file_name, response_files_dir, responses_file_name='responses.jsonl'):
    '''
    Build the path and name of the responses file for an input batch file.

    When `input_file_name` carries a batch number (see `extract_batch_num`),
    the name becomes `<stem>-batch_<num>.jsonl`; otherwise
    `responses_file_name` is used unchanged.

    Args:
        input_file_name: Name of the prompts file the batch was created from.
        response_files_dir: Directory the responses file will live in.
        responses_file_name: Base name for the responses file.

    Returns:
        A `(responses_file_path, responses_file_name)` tuple.
    '''
    batch_num = extract_batch_num(input_file_name)
    if batch_num is not None:
        # splitext removes only the final extension, so names such as
        # 'responses.v2.jsonl' keep their full stem instead of being cut at the first dot.
        stem = os.path.splitext(responses_file_name)[0]
        responses_file_name = f'{stem}-batch_{batch_num}.jsonl'
    responses_file_path = os.path.join(response_files_dir, responses_file_name)
    return responses_file_path, responses_file_name

## **Load Data**

In [None]:
# Alternative dataset configuration, kept for reference (currently disabled):
# transcripts_dir = '../../Getting_Transcripts'
# transcripts_file_name = 'merged_filtered_videos_transcripts.csv'
# responses_dir = '../LLMs_Responses'
# topics_to_include = ['Spina Bifida', 'Flat Feet', 'Cluster Headache', 'Trigger Finger', 'Pudendal Nerve']

# Active dataset configuration.
transcripts_dir = '../../../ISA_Paper/Data'
transcripts_file_name = 'diabetes_videos_transcripts.csv'
responses_dir = '../../../ISA_Paper/Data/Results'
topics_to_include = ['Insulin Self-Administration']

# These values form the results file name, which is reused when the final CSV is saved.
prompt_type = 'GS_prompting'
topics = 'diabetes'
results_file_name = f'{model_name}-{topics}-{prompt_type}'

# NOTE(review): load_responses_df comes from Utils.llm_evaluation_utils; presumably it returns one
# row per transcript including the per-question answer columns — confirm against the helper.
responses_df = load_responses_df(transcripts_dir, transcripts_file_name, responses_dir, results_file_name, question_type)

print('responses_df shape:', responses_df.shape)
responses_df.head(2)

In [None]:
# Only needed when the loaded data has no 'Topic' column yet.
if 'Topic' not in responses_df.columns:
    experts_file = '../../../Videos_and_DISCERN_data/filtered_experts_scores.csv'
    experts_df = pd.read_csv(experts_file)

    # Left merge: every response row is kept and gets the 'Topic' of its 'Video ID'.
    # NOTE(review): if experts_df holds several rows per 'Video ID', response rows are duplicated — confirm uniqueness.
    responses_df = responses_df.merge(experts_df[['Video ID', 'Topic']], on='Video ID', how='left')
    # Move 'Topic' to the third column position.
    responses_df.insert(2, 'Topic', responses_df.pop('Topic'))
    # Keep only the configured topics, then renumber the rows from 0.
    responses_df = responses_df[responses_df['Topic'].isin(topics_to_include)]
    responses_df = responses_df.reset_index(drop=True)

print('responses_df shape:', responses_df.shape)
responses_df.head(2)

## **Build Prompts**

In [None]:
prompts_csv_file = './prompts.csv'     # Prompts file to create

# One [custom_id, prompt] pair is collected for every (video, question) that has no answer yet.
data = []

for _, row in responses_df.iterrows():
    video_id = row['Video ID']
    transcript = row['Transcript']

    for question_num in range(1, len(QUESTIONS) + 1):
        # pd.isna treats both None and NaN as missing. Values that were read back
        # from a CSV are NaN, which an `is None` check would silently skip.
        if pd.isna(row[f'Q{question_num}']):
            # The custom ID encodes the video and question so responses can be matched back later.
            custom_id = f'{video_id}&{question_num}'
            prompt = build_question_prompt(transcript, question_num, question_type)
            data.append([custom_id, prompt])

df = pd.DataFrame(data, columns=['ID', 'Prompt'])
df.to_csv(prompts_csv_file, index=False, quoting=csv.QUOTE_MINIMAL, encoding='utf-8')

## **Batch Prompt Requests**

### Step 1: Prepare JSONL File
To use OpenAI's Batch API, prompts must be provided in JSONL (JSON Lines) format.  
The cell below converts the prompts CSV file into a `JSONL` file.


In [None]:
# prompts_csv_file = './prompts.csv'
prompts_jsonl_file = 'Batch_Files/prompts.jsonl'

def csv_to_jsonl(csv_file, jsonl_file, model, max_tokens=4096, temperature=1, system='You are a helpful assistant.'):
    '''
    Convert a prompts CSV file to JSON Lines (JSONL) for OpenAI batch requests.

    Args:
        csv_file: Path of a UTF-8 CSV file with 'ID' and 'Prompt' columns.
        jsonl_file: Path of the JSONL file to write (overwritten if it exists).
        model: Model name written into every request.
        max_tokens: `max_tokens` value written into every request.
        temperature: `temperature` value written into every request.
        system: System message forwarded to `construct_json_line`.
    '''
    # Create the output directory only when the path has one:
    # os.makedirs('') raises for a bare file name such as 'prompts.jsonl'.
    output_dir = os.path.dirname(jsonl_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(csv_file, 'r', newline='', encoding='utf-8') as csvfile, \
            open(jsonl_file, 'w', encoding='utf-8') as jsonlfile:
        for row in csv.DictReader(csvfile):
            data = construct_json_line(row['ID'], model, row['Prompt'], system, max_tokens, temperature)
            jsonlfile.write(json.dumps(data) + '\n')

def construct_json_line(custom_id, model, user_message, system_message, max_tokens, temperature):
    '''
    Build the request dict for one line of a batch file (a chat completion request).

    `system_message` is accepted but not sent: the system entry below is commented out,
    so the conversation contains only the user message.
    '''
    messages = [
        # {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': user_message},
    ]
    body = {
        'model': model,
        'messages': messages,
        'max_tokens': max_tokens,
        'temperature': temperature,
    }
    return {
        'custom_id': custom_id,
        'method': 'POST',
        'url': '/v1/chat/completions',
        'body': body,
    }

# Build the JSONL request file from the prompts CSV using the notebook-level model settings.
csv_to_jsonl(prompts_csv_file, prompts_jsonl_file, model_name, max_tokens, temperature)

Because the prompts in the JSONL file exceed the daily token limit (900,000 TPD for tier 1), the file is split into multiple smaller JSONL files, which are then batched sequentially.

In [None]:
lines_per_file = 70    # Adjust as needed
prompts_jsonl_file = 'Batch_Files/prompts.jsonl'
splitted_prompts_dir = './Batch_Files/Batch_Prompts'

def split_file(input_file_path, output_dir, max_lines_per_file):
    '''
    Split a text file into numbered files of at most `max_lines_per_file` lines each.

    Output files are named `prompts-batch_<i>.jsonl` (numbered from 1) and are written
    to `output_dir`, which is created if needed. Files already present in `output_dir`
    are overwritten when their name is reused and are otherwise left in place.

    Args:
        input_file_path: Path of the UTF-8 file to split.
        output_dir: Directory that receives the split files.
        max_lines_per_file: Maximum number of lines per output file (>= 1).

    Raises:
        ValueError: If `max_lines_per_file` is less than 1.
    '''
    # A non-positive size would otherwise divide by zero or silently write nothing.
    if max_lines_per_file < 1:
        raise ValueError('max_lines_per_file must be a positive integer.')

    os.makedirs(output_dir, exist_ok=True)        # Ensure the output directory exists, create if not
    # Explicit UTF-8 keeps reading and writing independent of the platform default encoding.
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        lines = infile.readlines()

    # Step through the lines one chunk at a time; the chunk position gives the batch number.
    chunk_starts = range(0, len(lines), max_lines_per_file)
    for batch_num, start_index in enumerate(chunk_starts, start=1):
        output_path = os.path.join(output_dir, f'prompts-batch_{batch_num}.jsonl')
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.writelines(lines[start_index:start_index + max_lines_per_file])

# Split the full prompts file into per-batch files of at most `lines_per_file` lines.
split_file(prompts_jsonl_file, splitted_prompts_dir, lines_per_file)

### Step 2: Upload JSONL File(s) to OpenAI Account

In [None]:
batch_prompts_dir = './Batch_Files/Batch_Prompts'           # The directory containing the JSONL file(s)

# Get the list of JSONL files in the directory
# (os.listdir gives no ordering guarantee, so uploads are not necessarily in batch-number order)
jsonl_files = [file for file in os.listdir(batch_prompts_dir) if file.endswith('.jsonl')]

# Upload each JSONL file
for prompts_batch_file in jsonl_files:
    jsonl_batch_file_path = os.path.join(batch_prompts_dir, prompts_batch_file)
    file_upload_response = upload_file(jsonl_batch_file_path, client)
    # upload_file returns None on failure (after printing the error), so no ID is printed for that file
    if file_upload_response:
        input_file_id = file_upload_response.id
        print(f'File ID for {prompts_batch_file}: {input_file_id}')

In [None]:
# def extract_and_sort_file_ids(text):
#     pattern = re.compile(r'File ID for (prompts-batch_\d+\.jsonl): (\S+)')
#     matches = pattern.findall(text)
#     sorted_matches = sorted(matches, key=lambda x: int(re.search(r'\d+', x[0]).group()))
#     return dict(sorted_matches)

# files_list = '''
# File ID for prompts-batch_9.jsonl: file-80P02rE5rTRSF0aMAvfz2khh
# '''
# batch_files_dict = extract_and_sort_file_ids(files_list)
# batch_files_dict

#### Display List of Files in OpenAI Account
This includes uploaded files, as well as output files generated by OpenAI

In [None]:
# Print one line per file returned by the files listing.
try:
    list_files = client.files.list(
        # purpose='batch'       # Specify purpose of file (batch, fine-tuning, assistant, etc).
    )
    # Iterate the returned entries directly instead of indexing by position.
    for file_data in list_files.data:
        print(f'File: {file_data.id} | Purpose: {file_data.purpose:<12} | ', end='')
        print(f'Created at: {convert_timestamp(file_data.created_at)} | File Name: {file_data.filename}')

except Exception as e:
    print(e)

#### Delete a File

In [None]:
# ID of the file to delete from the OpenAI account (edit before running this cell)
delete_file_id = 'file-BMkcaccsiJqcxpwQjXCmECcr'

# Print the deletion response, or the error if the request fails
try:
    response = client.files.delete(delete_file_id)
    print(response)
except Exception as e:
    print(e) 

### Step 3: Create a Batch

In [None]:
# ID of an uploaded prompts file (the upload cell prints these IDs)
input_file_id = 'file-GIai7j5wL4FHARjUNfii7Uml'

# Execute the batch creation
create_batch_response = create_batch(input_file_id, client)

# Display batch information (create_batch returns None on failure, so nothing is shown then)
if create_batch_response:
    print('Batch status:', create_batch_response.status)

    # retrieve_file_name returns '' on error, in which case the name line is skipped
    file_name = retrieve_file_name(input_file_id, client)
    if file_name:
        print('Batch file name:', file_name)

    batch_id = create_batch_response.id
    print('Batch ID:', batch_id)

### Step 4: Monitor and Retrieve Batch Information

In [None]:
batch_id = 'batch_rVzH8duBdVA9tpMQjstW2d19'

def print_label_and_timestamp(label, timestamp):
    '''Print `<label>: <datetime>` for a Unix timestamp; print nothing when the timestamp is None.'''
    if timestamp is None:
        return
    print(label + ':', convert_timestamp(timestamp))

# Retrieve batch information (check_batch returns None on failure, so nothing is shown then)
check_batch_response = check_batch(batch_id, client)

# Display batch information
if check_batch_response:
    print('Batch status:', check_batch_response.status)
    request_counts = check_batch_response.request_counts
    print(f'Completed: {request_counts.completed:<6}| Failed: {request_counts.failed:<6}| Total: {request_counts.total:<6}\n')

    if check_batch_response.errors:
        print('Batch error:', check_batch_response.errors)
    else:
        # Each name is an attribute of the batch object; only timestamps that are set get printed.
        event_names = ['created_at', 'expires_at', 'completed_at', 'expired_at', 'failed_at', 'cancelled_at']
        for event_name in event_names:
            print_label_and_timestamp(f"Batch {event_name.replace('_', ' ')}", getattr(check_batch_response, event_name))

#### Display List of Batch Requests

In [None]:
# Print one line per batch returned by the batches listing.
try:
    list_batches = client.batches.list(
        limit=10    # Number of batches to display
    )
    # Iterate the returned entries directly instead of indexing by position.
    for batch_data in list_batches.data:
        # One extra lookup per batch to show the name of its input file ('' if the lookup fails).
        input_file_name = retrieve_file_name(batch_data.input_file_id, client)
        print(f'Batch: {batch_data.id} | Status: {batch_data.status:<11} ', end='')
        print(f'| Created at: {convert_timestamp(batch_data.created_at)} | Input File: {input_file_name}')

except Exception as e:
    print(e)

#### <u>Cancel</u> a Batch

In [None]:
# ID of the batch to cancel (edit before running this cell)
batch_id = 'batch_wHUxDnkfhvQf3JXlHAakGD20'

cancel_batch_response = cancel_batch(batch_id, client)

# cancel_batch returns None on failure (after printing the error), so nothing more is shown then
if cancel_batch_response:
    print('Batch status:', cancel_batch_response.status)
    if cancel_batch_response.errors:
        print('Batch error:', cancel_batch_response.errors)
    else:
        # The cancellation time is printed only once it has been set on the response
        if cancel_batch_response.cancelled_at is not None:
            print('Batch canceled at:', convert_timestamp(cancel_batch_response.cancelled_at))

### Step 5: Retrieve Response File Content

In [None]:
response_files_dir = './Batch_Files/Batch_Responses'         # Directory where the JSONL file(s) will be stored
responses_file_name = 'responses.jsonl'                      # Desired name of the responses file
batch_id = 'batch_rVzH8duBdVA9tpMQjstW2d19'

# Retrieve batch information
check_batch_response = check_batch(batch_id, client)

if check_batch_response:
    print('Batch status:', check_batch_response.status)

    input_file_name = retrieve_file_name(check_batch_response.input_file_id, client)
    if input_file_name:
        print('Related to input file:', input_file_name, '\n')

    # Only a completed batch that actually produced an output file can be downloaded.
    if check_batch_response.status == 'completed' and check_batch_response.output_file_id:
        # Make sure the target directory exists before anything is written into it.
        os.makedirs(response_files_dir, exist_ok=True)

        # The saved file is named after the batch number of the input file, when it has one.
        responses_file_path, responses_file_name = prepare_response_file(
            input_file_name, response_files_dir, responses_file_name
        )

        output_file_id = check_batch_response.output_file_id
        print('Output file ID:', output_file_id)

        # Retrieve content of the output file
        batch_output_content = client.files.content(output_file_id)

        # Write output file content to disk
        batch_output_content.write_to_file(responses_file_path)
        print('Content of batch response retrieved successfully:', responses_file_name)

## Alternatively, Steps 2-5: Batching sequentially automatically

In [None]:
import time

start_from_batch = 1        # Batch number to start from
batch_prompts_dir = './Batch_Files/Batch_Prompts'       # The directory containing the JSONL file(s)
response_files_dir = './Batch_Files/Batch_Responses'    # Directory where the output JSONL file(s) will be stored

def create_output_directory(response_files_dir):
    '''Make sure `response_files_dir` exists, creating it (and any missing parents) when absent.'''
    os.makedirs(name=response_files_dir, exist_ok=True)

def filter_files_by_batch(jsonl_files, start_from_batch):
    '''Return the files whose batch number is at least `start_from_batch`, in ascending batch order.'''
    numbered = []
    for name in jsonl_files:
        batch_num = extract_batch_num(name)
        # Files without a batch number, or below the starting batch, are left out.
        if batch_num is not None and batch_num >= start_from_batch:
            numbered.append((batch_num, name))
    # Sorting on the number alone keeps the input order for equal batch numbers.
    numbered.sort(key=lambda pair: pair[0])
    return [name for _, name in numbered]

def process_batch_file(batch_prompts_dir, prompts_batch_file, client: OpenAI):
    '''Upload one prompts file from `batch_prompts_dir`; return the uploaded file's ID, or None if the upload failed.'''
    upload = upload_file(os.path.join(batch_prompts_dir, prompts_batch_file), client)
    if not upload:
        return None
    uploaded_id = upload.id
    print(f'File ID for {prompts_batch_file}: {uploaded_id}')
    return uploaded_id

def create_batch_and_wait(input_file_id, prompts_batch_file, client: OpenAI, poll_interval=2 * 60):
    '''
    Create a batch for an uploaded file and poll until it reaches a final state.

    Args:
        input_file_id: ID of the uploaded prompts file.
        prompts_batch_file: Name of the prompts file (used in progress messages only).
        client: OpenAI client.
        poll_interval: Seconds to wait between status checks (default: 2 minutes).

    Returns:
        The output file ID once the batch has completed, or None when the batch
        could not be created, its status could not be retrieved, or it ended
        without a usable output file.
    '''
    create_batch_response = create_batch(input_file_id, client)
    if not create_batch_response:
        print(f'Batch for {prompts_batch_file} not created')
        return None

    batch_id = create_batch_response.id
    print(f'Batch for {prompts_batch_file} created with ID: {batch_id}')

    # Final statuses that never yield an output file. Without 'expired' and
    # 'cancelled' the loop below would poll such a batch forever.
    dead_statuses = {'failed', 'expired', 'cancelled'}

    while True:
        check_batch_response = check_batch(batch_id, client)
        if not check_batch_response:
            # The status could not be retrieved; treated as a failure, as before.
            print(f'Batch {batch_id} failed\n')
            return None

        status = check_batch_response.status
        if status in dead_statuses:
            print(f'Batch {batch_id} {status}\n')
            return None

        if status == 'completed':
            if check_batch_response.output_file_id:
                print(f'######## Batch {prompts_batch_file} has completed. ########')
                return check_batch_response.output_file_id
            # NOTE(review): a completed batch without an output file presumably means
            # every request errored — confirm against the Batch API reference.
            print(f'Batch {batch_id} completed without an output file\n')
            return None

        time.sleep(poll_interval)

def retrieve_and_save_response(output_file_id, response_files_dir, prompts_batch_file, client: OpenAI):
    '''Download the batch output file and save it in `response_files_dir` under the name built by `prepare_response_file`.'''
    target_path, target_name = prepare_response_file(prompts_batch_file, response_files_dir)
    client.files.content(output_file_id).write_to_file(target_path)
    print('Content of batch response retrieved successfully:', target_name, '\n')

def process_batches(batch_prompts_dir, response_files_dir, client: OpenAI, start_from_batch):
    '''
    Upload, run and download every prompts batch file, one after another.

    Files are taken from `batch_prompts_dir` in ascending batch order, starting at
    `start_from_batch`. A file whose upload fails is skipped; processing stops at
    the first batch that does not produce an output file.
    '''
    create_output_directory(response_files_dir)

    candidates = [name for name in os.listdir(batch_prompts_dir) if name.endswith('.jsonl')]

    for prompts_batch_file in filter_files_by_batch(candidates, start_from_batch):
        input_file_id = process_batch_file(batch_prompts_dir, prompts_batch_file, client)
        if not input_file_id:
            continue    # Upload failed: move on to the next file

        output_file_id = create_batch_and_wait(input_file_id, prompts_batch_file, client)
        if not output_file_id:
            break       # Batch produced no output: stop processing

        retrieve_and_save_response(output_file_id, response_files_dir, prompts_batch_file, client)

# Run the whole upload -> batch -> download sequence; this blocks while each batch is polled.
process_batches(batch_prompts_dir, response_files_dir, client, start_from_batch)

#### Combine all JSONL files into a single file (if the prompts were split and sent in multiple batches earlier)

In [None]:
batch_responses_dir = './Batch_Files/Batch_Responses'   # Directory containing the JSONL files
first_response_file = 'responses-batch_1.jsonl'         # First file in the sequence (e.g., 'responses-batch_1.jsonl')
                                                        # Expected sequence format: 'responses-batch_{number}.jsonl'
responses_jsonl_file = './Batch_Files/responses.jsonl'  # Desired output JSONL file path

def extract_prefix_before_last_num(text):
    '''
    Return the part of `text` that precedes its last run of digits.

    Example: 'responses-batch_12.jsonl' -> 'responses-batch_'.

    Raises:
        ValueError: If `text` contains no digits.
    '''
    digit_runs = list(re.finditer(r'\d+', text))
    if not digit_runs:
        raise ValueError('No number found in the input text.')
    # Slice at the position of the last run. Converting the run to int and splitting
    # on its string form would drop leading zeros ('_01' would give prefix '..._0').
    return text[:digit_runs[-1].start()]

def combine_jsonl_files(input_dir, output_file, first_file):
    '''
    Combine the numbered JSONL files in `input_dir` into one JSONL file.

    Files are selected by the prefix that precedes the last number in `first_file`
    (e.g. 'responses-batch_') and are written in ascending batch-number order.

    Args:
        input_dir: Directory containing the per-batch JSONL files.
        output_file: Path of the combined JSONL file to write (overwritten if it exists).
        first_file: Name of one file in the sequence, used to derive the prefix.

    Raises:
        ValueError: If `first_file` is empty or contains no number.
        FileNotFoundError: If `input_dir` is missing, empty, or holds no matching file.
    '''
    if not first_file:
        raise ValueError("The 'first_file' argument must be provided to determine the prefix.")
    file_prefix = extract_prefix_before_last_num(first_file)
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f"Input directory '{input_dir}' does not exist.")
    if not os.listdir(input_dir):
        raise FileNotFoundError(f"Input directory '{input_dir}' is empty.")

    def batch_order(file_name):
        # Sort numerically on the digits after the prefix, so batch_10 follows batch_9.
        number = re.match(r'\d+', file_name[len(file_prefix):])
        return (0, int(number.group()), file_name) if number else (1, 0, file_name)

    # os.listdir has no defined order; list once, before the output file is opened, and sort.
    matching_files = sorted(
        (name for name in os.listdir(input_dir)
         if name.startswith(file_prefix) and name.endswith('.jsonl')),
        key=batch_order,
    )
    if not matching_files:
        # Do not report success (or truncate the output file) when nothing matched.
        raise FileNotFoundError(f"No '{file_prefix}*.jsonl' files found in '{input_dir}'.")

    with open(output_file, 'w', encoding='utf-8') as out_jsonl:
        for file_name in matching_files:
            file_path = os.path.join(input_dir, file_name)
            with open(file_path, 'r', encoding='utf-8') as in_jsonl:
                line = ''
                for line in in_jsonl:
                    out_jsonl.write(line)
                # A file without a trailing newline would otherwise fuse its last
                # record with the first record of the next file.
                if line and not line.endswith('\n'):
                    out_jsonl.write('\n')
    print('Responses are now combined in one JSONL file.')

# Merge the per-batch response files into the single file read by the extraction step.
combine_jsonl_files(batch_responses_dir, responses_jsonl_file, first_response_file)

## **Extracting Results From Responses**

### Extract assistant messages to DataFrame

In [None]:
responses_jsonl_file = './Batch_Files/responses.jsonl'

def get_video_id_and_question_num(custom_id):
    '''
    Split a batch `custom_id` of the form '<video_id>&<question_num>'.

    Returns:
        A `(video_id, question_num)` tuple, with `question_num` as an int.

    Raises:
        ValueError: If `custom_id` has no '&' or the question part is not an integer.
    '''
    # Split on the LAST '&' so a video ID that itself contains '&' stays intact.
    video_id, separator, question_part = custom_id.rpartition('&')
    if not separator:
        raise ValueError(f"custom_id {custom_id!r} is not of the form '<video_id>&<question_num>'.")
    return video_id, int(question_part)

# Read each line of the combined responses file and store the assistant's answer in responses_df
with open(responses_jsonl_file, 'r', encoding='utf-8') as jsonlfile:
    for line in jsonlfile:
        line = line.strip()
        if not line:
            continue    # Tolerate blank lines instead of failing in json.loads

        data = json.loads(line)
        custom_id = data.get('custom_id', '')
        # `or {}` also covers an explicit null, which .get()'s default would not replace.
        response = data.get('response') or {}

        if response.get('status_code') != 200:
            print(f"Error for custom ID: {custom_id}, status_code: {response.get('status_code')}")
            continue

        response_body = response.get('body') or {}
        if not response_body:
            print(f'No response body found for custom ID: {custom_id}')
            continue

        # Guard against an empty 'choices' list rather than raising IndexError mid-file.
        choices = response_body.get('choices') or []
        if not choices:
            print(f'No choices found for custom ID: {custom_id}')
            continue

        assistant_response = (choices[0].get('message') or {}).get('content') or ''
        video_id, question_num = get_video_id_and_question_num(custom_id)
        check_and_store_response(assistant_response, responses_df, video_id, question_num, rating_scale=5)

responses_df.head(3)

### Explore Results

In [None]:
# Per column: number of cells that are missing (NaN/None) or equal to the empty string
columns_with_none = (responses_df.isna() | (responses_df == '')).sum()
columns_with_none

In [None]:
# Rows that contain at least one missing (NaN/None) value; empty strings are not counted here
rows_with_none = responses_df[responses_df.isna().any(axis=1)]
rows_with_none

In [None]:
# Index labels of the rows whose 'Problem' entry is non-empty
# NOTE(review): len(x) assumes every 'Problem' value is sized (list/set/str); a NaN would raise TypeError — confirm.
indices_with_problems = responses_df[responses_df['Problem'].apply(lambda x: len(x) > 0)].index.tolist()
print(indices_with_problems)

In [None]:
from html import escape
from IPython.display import display, HTML

if indices_with_problems:
    index_with_problem = 54     # Row to inspect; should be one of indices_with_problems
    if index_with_problem not in indices_with_problems:
        # The chosen row has no recorded problem (or does not exist): show the first problematic row instead.
        print(f'Index {index_with_problem} has no problem; showing index {indices_with_problems[0]} instead.')
        index_with_problem = indices_with_problems[0]

    responses_with_problem_list = list(responses_df.loc[index_with_problem, 'Problem'])
    print("List of questions with problem:", responses_with_problem_list)

    # Show the full response text of the first problematic question for that row.
    response_with_problem = responses_with_problem_list[0]
    text = responses_df.loc[index_with_problem, f'Response_{response_with_problem}']
    # Escape the model output so any '<', '>' or '&' in it is shown literally, not rendered as HTML.
    display(HTML("<div style='white-space: pre-wrap;'>{}</div>".format(escape(str(text)))))

In [None]:
# display the full responses for a specific transcript
index_to_display = 25    # Row label in responses_df to inspect
# NOTE(review): range(1, 16) hard-codes questions 1-15, while the prompts are built for
# range(1, len(QUESTIONS) + 1) — confirm that QUESTIONS has exactly 15 entries.
for question_num in range(1, 16):
    print(f'Q{question_num}:', responses_df.at[index_to_display,f'Response_{question_num}'])

### Store Results in a CSV File

In [None]:
# The output name is built from results_file_name and written into responses_dir
csv_output_file = os.path.join(responses_dir, f'{results_file_name}-response.csv')

# NOTE(review): responses_dir is not created here; to_csv presumably fails if it does not exist — confirm.
responses_df.to_csv(csv_output_file, index=False, encoding='utf-8')