# Evaluation Data Upload Guide

This notebook demonstrates how to upload and prepare data for evaluation.

## 1. Upload Data File

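Load the Excel (`.xlsx`) file containing the model generations to evaluate. The Russian and Kazakh responses are stored on separate sheets of the same workbook.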

In [1]:
import os
import pandas as pd

# Both response sheets live in the same workbook, so parse the file once and
# pull each sheet out of the {sheet_name: DataFrame} mapping that
# `read_excel` returns when `sheet_name` is a list.
response_sheets = pd.read_excel(
    "./ru_kaz_data/ru_kz_15_model_responses.xlsx",
    sheet_name=["Ru-response", "Kz-response"],
)
df_ru = response_sheets["Ru-response"]  # Russian responses
df_kz = response_sheets["Kz-response"]  # Kazakh responses

## 2. Construct Evaluation Messages

Build structured messages from the dataset to be passed into the evaluation model.

In [None]:
from notebooks.evaluate_binary_safety import construct_kz_message, construct_ru_message

In [3]:
# Examples of using these two functions to construct the messages sent to OpenAI.
# Every call rebinds the same names, so only the result of the last call survives.
example_response_keys = (
    "llama3_3p1_8b_extend25p_3-1-3_CL_RT_en_IFT3",
    "LLama_3.1_KazLLM_1.0_70B",
)

# Kazakh sheet: the result is unpacked into (messages, risk_ids).
for response_key in example_response_keys:
    messages, risk_ids = construct_kz_message(
        df_kz,
        risk_type_key="risk_area",
        question_key="question",
        response_key=response_key,
    )

# Russian sheet: the result is unpacked into (messages, risk_ids, gold_labels).
for response_key in example_response_keys:
    messages, risk_ids, gold_labels = construct_ru_message(
        df_ru,
        risk_type_key="risk_area",
        question_key="question",
        response_key=response_key,
    )

## 3. OpenAI Batch Input

#### 1. Prepare Your Batch File

In [None]:
from notebooks.evaluate_binary_safety import generate_batch_request

In [9]:
# Build the evaluation messages from the Russian sheet; `response_key` selects
# the column whose responses are evaluated.
messages, risk_ids, gold_labels = construct_ru_message(
    df_ru,
    risk_type_key="risk_area",
    question_key="question",
    response_key="gpt4o_response",
)

# Generate the batch request for those messages (presumably written under
# `savedir`, which the upload cell later lists -- confirm against the helper).
# NOTE(review): the Kazakh cell passes the same savedir / dataset_name / model.
# If the output file name is derived only from these, the two runs would
# collide -- confirm, and consider a language-specific dataset_name.
data = generate_batch_request(
    messages,
    savedir="./ru_kaz_data/eval_results/new",
    dataset_name="gpt4o_response",
    model="gpt-4o",
)

4383


In [10]:
# Build the evaluation messages from the Kazakh sheet; `response_key` selects
# the column whose responses are evaluated.
messages, risk_ids = construct_kz_message(
    df_kz,
    risk_type_key="risk_area",
    question_key="question",
    response_key="gpt4o_response",
)

# Generate the batch request for those messages (presumably written under
# `savedir`, which the upload cell later lists -- confirm against the helper).
# NOTE(review): the Russian cell passes the same savedir / dataset_name / model.
# If the output file name is derived only from these, this run would overwrite
# the Russian batch file -- confirm, and consider a language-specific dataset_name.
data = generate_batch_request(
    messages,
    savedir="./ru_kaz_data/eval_results/new",
    dataset_name="gpt4o_response",
    model="gpt-4o",
)

3786


In [None]:
from openai import OpenAI
# Read the OpenAI API key from the first line of a local text file so the
# secret is never written into the notebook itself.
key_path = "./openaikey.txt"
with open(key_path, 'r') as f:
    api_key = f.readline().strip()  # drop the trailing newline / stray spaces

# Fail fast with a clear message instead of building a client with an empty
# key and only finding out on the first API call.
if not api_key:
    raise ValueError(f"No API key found on the first line of {key_path}")

client = OpenAI(api_key = api_key)

In [None]:
# Directory that holds the batch request files to upload.
batch_dir = "./ru_kaz_data/eval_results/new"

# Upload every request file in the directory and start one batch job per file.
# - sorted(): deterministic upload order across runs.
# - isfile(): os.listdir also returns sub-directories (e.g. checkpoint folders).
# - "_output.jsonl": the retrieval step of this notebook saves results into this
#   same directory with that suffix; they must not be re-submitted as inputs.
files = sorted(
    name
    for name in os.listdir(batch_dir)
    if os.path.isfile(os.path.join(batch_dir, name))
    and not name.endswith("_output.jsonl")
)

for file in files:
    print(file)
    # upload model responses
    input_file = os.path.join(batch_dir, file)

    # Use a context manager so the file handle is closed once the upload is done.
    with open(input_file, "rb") as input_handle:
        batch_input_file = client.files.create(
            file=input_handle,
            purpose="batch"
        )

    # Create the batch job that runs the uploaded requests against the
    # chat-completions endpoint.
    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
          "description": "nightly eval job"
        }
    )

    # Fetch the batch that was just created and show its current state.
    batch_info = client.batches.retrieve(batch.id)
    print(batch_info)

#### 2. Retrieve the Results

In [None]:
# Fetch up to three batches and display them as a plain dictionary.
recent_batches = client.batches.list(limit=3)
recent_batches.to_dict()

In [None]:
# ID of the file to download -- replace with your file ID.
result_file_id = 'YOUR_FILE_ID_HERE'
# Local destination for the downloaded results.
result_path = "./ru_kaz_data/eval_results/new/YOUR_MODEL_RU_gpt-4o_output.jsonl"

# Download the file content and save it to disk.
content = client.files.content(result_file_id)
content.write_to_file(result_path)