In [3]:
import json
import random

def load_file(file_path):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # Pin the encoding: without it open() uses the platform's locale encoding,
    # which can mis-decode non-ASCII JSON text on systems whose default is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def get_random_perturbation(perturbations):
    """Pick a random category, then a random entry inside that category.

    Args:
        perturbations: Mapping of category name to a mapping of entries.

    Returns:
        A ``(category, (key, value))`` tuple, where ``(key, value)`` is one
        item of ``perturbations[category]``.
    """
    category_names = list(perturbations.keys())
    category = random.choice(category_names)
    entries = list(perturbations[category].items())
    return category, random.choice(entries)

# Usage: load the perturbations file and print one random draw from it.
file_path = 'perturbations.json'
perturbations = load_file(file_path)
sample = get_random_perturbation(perturbations)
random_category = sample[0]
random_perturbation = sample[1]  # (name, description) pair
print(f"Category: {random_category}, Perturbation: {random_perturbation[0]}, Description: {random_perturbation[1]}")


Category: Market and Economic Factors, Perturbation: Competitive Reaction, Description: Considering the market entry of a new competitor


In [None]:
# LLM dynamic evals

import replicate
import pandas as pd
import json
import os
from dotenv import load_dotenv
load_dotenv()
# Working folder for the input/output spreadsheets.
folder_path = 'files'
# exist_ok=True makes this idempotent without a separate existence check,
# so there is no gap between "check" and "create" for another process to race.
os.makedirs(folder_path, exist_ok=True)

# Run settings: the instruction text placed in every prompt and the name used
# as a prefix for the results file.
config = load_file('config.json')
INSTRUCTION, F_NAME = config['instructions'], config["name"]

# Read the question sheet and immediately write an untouched backup copy of it.
df = pd.read_excel('files/questions_dynamic.xlsx')
df.to_excel('files/questions_original_dynamic.xlsx', index=False)

# Normalise the question text: drop leading/trailing whitespace (newlines included).
df['Question'] = df['Question'].str.strip()

# Flag every row whose (stripped) question text occurs more than once.
duplicates = df.duplicated(subset=['Question'], keep=False)
if not duplicates.any():
    print("No duplicates found.")
else:
    print("Duplicates found. Removing duplicates.")

    # Keep only the first occurrence of each question, then overwrite
    # 'files/questions_dynamic.xlsx' with the de-duplicated sheet.
    df = df.drop_duplicates(subset=['Question'], keep='first')
    df.to_excel('files/questions_dynamic.xlsx', index=False)

# JSON resources used when building the perturbed and final-analysis questions.
perturbations = load_file('perturbations.json')
knowledgebase = load_file('knowledgebase.json')

# Empty results table: one row per (model, question) pair, one column per stage.
results_df = pd.DataFrame(
    columns=[
        'Model',
        'Question',
        'Response',
        'Perturbed Question',
        'Perturbed Response',
        'Final Analysis Question',
        'Final Analysis Response',
    ]
)

# Short model key -> model identifier handed to replicate.run() via ask_llm().
# The keys are also what the evaluation loop compares against to choose a prompt
# template ("yi-34b", "qwen-14b", "openhermes2") and what is written to the
# 'Model' column of the results.
# Every entry pins a version hash after the colon except "llama2-70b".
models = {
    "qwen-14b": "nomagick/qwen-14b-chat:f9e1ed25e2073f72ff9a3f46545d909b1078e674da543e791dec79218072ae70",
    "falcon-40b": "joehoover/falcon-40b-instruct:7d58d6bddc53c23fa451c403b2b5373b1e0fa094e4e0d1b98c3d02931aa07173",
    "yi-34b": "01-ai/yi-34b-chat:914692bbe8a8e2b91a4e44203e70d170c9c5ccc1359b283c84b0ec8d47819a46",
    "mistral-7b": "mistralai/mistral-7b-instruct-v0.2:f5701ad84de5715051cb99d550539719f8a7fbcf65e0e62a3d1eb3f94720764e",
    "llama2-70b": "meta/llama-2-70b-chat",
    "openhermes2": "antoinelyset/openhermes-2.5-mistral-7b:d7ccd25700fb11c1787c25b580ac8d715d2b677202fe54b77f9b4a1eb7d73e2b",
    "mixtral-instruct": "mistralai/mixtral-8x7b-instruct-v0.1:2b56576fcfbe32fa0526897d8385dd3fb3d36ba6fd0dbe033c72886b81ade93e",
    "deepseek_33bq": "kcaverly/deepseek-coder-33b-instruct-gguf:ea964345066a8868e43aca432f314822660b72e29cab6b4b904b779014fe58fd",
    }

# Prompt templates, filled in with str.format(INSTRUCTION=..., question=...).
# prompt_for_qwen: system/user/assistant turns delimited by <|im_start|> / <|im_end|>
# markers; the evaluation loop uses it for both the "qwen-14b" and "yi-34b" keys.
prompt_for_qwen="""<|im_start|>system\n {INSTRUCTION}. Please try your best to answer the following question. <|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"""
# prompt_for_hermes: a JSON-style list of role/content messages. The doubled braces
# are str.format escapes for literal { and }. Note that {question} is NOT wrapped in
# quotes here, so the caller has to pass an already JSON-encoded string
# (e.g. json.dumps(text)) for the result to be valid JSON.
prompt_for_hermes = """[
{{
  "role": "system",
  "content": "{INSTRUCTION}. Please try your best to answer the following question." 
}},
{{
  "role": "user",
  "content": {question}
}}
]"""

def ask_llm(model_value, prompt):
    """Run ``prompt`` through ``replicate.run`` and return the reply as one string.

    Args:
        model_value: Model identifier passed as the first argument of ``replicate.run``.
        prompt: Prompt text sent as the ``"prompt"`` input.

    Returns:
        The output chunks joined into a single string with surrounding
        whitespace stripped.
    """
    generation_settings = {
        "debug": False,
        "top_p": 0.9,
        "prompt": prompt,
        "temperature": 0.7,
        "max_new_tokens": 500,
        "min_new_tokens": -1,
    }
    chunks = replicate.run(model_value, input=generation_settings)

    # Joining rule: a chunk that is exactly one character long is appended as-is;
    # every other chunk is prefixed with a single space.
    # NOTE(review): if the model already emits leading spaces this doubles them - confirm intent.
    pieces = []
    for chunk in chunks:
        text = str(chunk)
        pieces.append(text if len(text) == 1 else f" {text}")
    return "".join(pieces).strip()

def _build_prompt(model_key, text):
    """Format plain ``text`` as the prompt for the model identified by ``model_key``.

    * "yi-34b" and "qwen-14b" use the ``prompt_for_qwen`` template with the plain text.
    * "openhermes2" uses ``prompt_for_hermes``. That template is a JSON message list,
      so the question is inserted as a JSON string literal and the instruction is
      JSON-escaped, keeping the document valid even when the text contains quotes
      or newlines.
    * Every other model gets the instruction followed by the plain text.

    Args:
        model_key: Key from ``models``.
        text: Plain-text question for this stage.

    Returns:
        The formatted prompt string.
    """
    if model_key in ("yi-34b", "qwen-14b"):
        return prompt_for_qwen.format(INSTRUCTION=INSTRUCTION, question=text)
    if model_key == "openhermes2":
        return prompt_for_hermes.format(
            # [1:-1] drops the quotes json.dumps adds; the template supplies its own.
            INSTRUCTION=json.dumps(str(INSTRUCTION), ensure_ascii=False)[1:-1],
            question=json.dumps(text),
        )
    return f"{INSTRUCTION}. Please try your best to answer the following question. {text}"


def _ask_or_error(model_value, prompt):
    """Return ``ask_llm``'s reply, or an ``"Error: ..."`` string if the call raises."""
    try:
        return ask_llm(model_value, prompt)
    except Exception as e:  # best effort: one failed call must not abort the whole run
        return f"Error: {e}"


# Three-stage evaluation of every question against every model:
#   1. ask the original question,
#   2. ask what should change under a randomly drawn perturbation,
#   3. ask for a final analysis that also takes the knowledgebase into account.
# Every stage uses the model-specific prompt and the same error handling.
result_columns = list(results_df.columns)
results_path = f'files/{F_NAME}_results_grouped_by_model_dynamic.xlsx'
records = []  # one dict per (model, question); the DataFrame is rebuilt from it on demand

for model_key, model_value in models.items():
    for index, row in df.iterrows():
        qn = row['Question']

        # Stage 1: original question
        prompt = _build_prompt(model_key, qn)
        print(prompt)
        response = _ask_or_error(model_value, prompt)

        # Stage 2: perturb the question and get the response
        category, (perturbation, description) = get_random_perturbation(perturbations)
        perturbed_qn = f"{qn}\nResponse: {response}\nChange in circumstances: {perturbation} - {description}\n What should change in the response?"
        perturbed_response = _ask_or_error(model_value, _build_prompt(model_key, perturbed_qn))

        # Stage 3: evaluate sufficiency or suggest an alternate course
        final_analysis_qn = f"Original Question: {qn}\nOriginal Response: {response} \nPerturbation ({category}): {perturbation} - {description}\n {perturbed_response}\nKnowledgebase: {knowledgebase}\nNow consider the knowlegebase, what else ought we to do?"
        final_analysis_response = _ask_or_error(model_value, _build_prompt(model_key, final_analysis_qn))

        # Record each stage
        records.append({
            'Model': model_key,
            'Question': qn,
            'Response': response,
            'Perturbed Question': perturbed_qn,
            'Perturbed Response': perturbed_response,
            'Final Analysis Question': final_analysis_qn,
            'Final Analysis Response': final_analysis_response,
        })

        # Periodic checkpoint so a crash or interrupt does not lose finished rows.
        if index % 10 == 0:
            results_df = pd.DataFrame(records, columns=result_columns)
            results_df.to_excel(results_path, index=False, sheet_name='Sheet1')

results_df = pd.DataFrame(records, columns=result_columns)
results_df.to_excel(results_path, index=False, sheet_name='Sheet1')

In [None]:
# GPT-4 dynamic evaluation

import pandas as pd
import json
import openai
import requests
from openai import OpenAI
import time
from dotenv import load_dotenv
load_dotenv()
import os
# Working folder for the input/output spreadsheets.
folder_path = 'files'
# exist_ok=True makes this idempotent without a separate existence check,
# so there is no gap between "check" and "create" for another process to race.
os.makedirs(folder_path, exist_ok=True)

# Run settings: assistant instructions and the name used to label outputs.
config = load_file('config.json')
INSTRUCTION = config['instructions']
F_NAME = config["name"]

# JSON resources used when building the perturbed and final-analysis questions.
perturbations = load_file('perturbations.json')
knowledgebase = load_file('knowledgebase.json')

# Model name plus input/output locations. Despite the *_CSV_PATH names, both
# paths point at .xlsx workbooks.
GPT_MODEL = "gpt-4-1106-preview"
INPUT_CSV_PATH = 'files/questions_dynamic.xlsx'
OUTPUT_CSV_PATH = f'files/{F_NAME}_results_gpt4_dynamic.xlsx'

# API client constructed with no explicit arguments.
client = OpenAI()
def show_json(obj):
    """Print ``obj`` as a Python structure decoded from its JSON dump.

    Args:
        obj: Any object exposing ``model_dump_json()`` that returns a JSON string.
    """
    serialized = obj.model_dump_json()
    decoded = json.loads(serialized)
    print(decoded)

# Create the assistant that answers every question in this cell, then print
# the object that the API returned.
assistant_settings = dict(
    name=f"{F_NAME} AI Dynamic Evaluator",
    instructions=INSTRUCTION,
    model=GPT_MODEL,
)
assistant = client.beta.assistants.create(**assistant_settings)
show_json(assistant)

# Utility functions
def read_csv(file_path):
    """Load a spreadsheet into a DataFrame.

    Despite the name, the file is parsed with ``pd.read_excel``, not as CSV.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The DataFrame produced by ``pd.read_excel``.
    """
    frame = pd.read_excel(file_path)
    return frame

def process_data_for_gpt(data):
    """Build one prompt string per row of ``data``.

    Args:
        data: DataFrame with a ``'Question'`` column.

    Returns:
        A list of prompts, in row order, each consisting of a fixed lead-in
        sentence followed by the row's question.
    """
    return [
        f"Please try your best to answer the following question.:\n\n{record['Question']}"
        for _, record in data.iterrows()
    ]

def submit_message_and_create_run(assistant_id, prompt):
    """Post ``prompt`` to a brand-new thread and start a run on it.

    Args:
        assistant_id: Id of the assistant that should process the thread.
        prompt: Text sent as the user message.

    Returns:
        A ``(run, thread)`` tuple.
    """
    # A new thread is created for every prompt on purpose: sharing one global
    # thread would append each answer to the ones before it.
    new_thread = client.beta.threads.create()
    client.beta.threads.messages.create(
        thread_id=new_thread.id,
        role="user",
        content=prompt,
    )
    started_run = client.beta.threads.runs.create(
        thread_id=new_thread.id,
        assistant_id=assistant_id,
    )
    return started_run, new_thread

def wait_on_run_and_get_response(run, thread):
    """Poll ``run`` until it leaves the pending states, then collect the replies.

    Args:
        run: Run object to poll.
        thread: Thread the run belongs to.

    Returns:
        A list with the first text content of every assistant message in the
        thread, in ascending order.
    """
    pending_states = ("queued", "in_progress")
    # Any other status ends the loop.
    # NOTE(review): that presumably includes failure states, which would then
    # yield whatever assistant messages exist (possibly none) - confirm.
    while run.status in pending_states:
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        time.sleep(0.5)

    replies = []
    for message in client.beta.threads.messages.list(thread_id=thread.id, order="asc"):
        if message.role == 'assistant':
            replies.append(message.content[0].text.value)
    return replies

# Id of the assistant created above; passed to every question-processing call.
ASSISTANT_ID = assistant.id

# Load the question sheet and turn each row into a prompt.
data = read_csv(INPUT_CSV_PATH)
prompts = process_data_for_gpt(data)

def ask_gpt4(prompt, ASSISTANT_ID):
    """Send ``prompt`` to the assistant and return its reply as one string.

    Args:
        prompt: Text sent as the user message.
        ASSISTANT_ID: Id of the assistant that should answer.

    Returns:
        The assistant messages joined with single spaces, with literal
        ``\\\\n`` sequences collapsed to ``\\n`` and surrounding whitespace stripped.
    """
    run, thread = submit_message_and_create_run(ASSISTANT_ID, prompt)
    response = wait_on_run_and_get_response(run, thread)
    if isinstance(response, list):
        response = ' '.join(map(str, response))
    response = response.replace("\\\\n", "\\n")
    response = response.strip()
    print(response)
    # Return the text to the caller. The previous version appended it to a
    # global `responses` list that this cell never defines and returned None,
    # so every later stage was built from "None".
    return response


def process_question_with_gpt4(original_question, assistant_id):
    """Run the three-stage dynamic evaluation for one question.

    Stages: (1) answer the original question, (2) ask what should change under
    a randomly drawn perturbation, (3) ask for a final analysis that also
    includes the knowledgebase.

    Args:
        original_question: Prompt text for the first stage.
        assistant_id: Id of the assistant used for all three stages.

    Returns:
        A dict with the question and response of each stage, keyed by the
        corresponding results-column names.
    """
    # Get initial response
    first_response = ask_gpt4(original_question, assistant_id)

    # Perturb the question and get the response
    category, (perturbation, description) = get_random_perturbation(perturbations)
    perturbed_qn = f"{original_question}\nResponse: {first_response}\nChange in circumstances: {perturbation} - {description}\n What should change in the response?"
    perturbed_response = ask_gpt4(perturbed_qn, assistant_id)

    # Evaluate sufficiency or suggest alternate course.
    # Uses original_question: the previous `{question}` was not defined in this
    # function and only resolved if another cell had leaked a global of that name.
    final_analysis_qn = f"Original Question: {original_question}\nOrig Response: {first_response} \nPerturbation ({category}): {perturbation} - {description}\n {perturbed_response}\nKnowledgebase Content: {knowledgebase}\n Now consider the knowlegebase, what else ought we to do?"
    final_analysis_response = ask_gpt4(final_analysis_qn, assistant_id)

    return {
        'Question': original_question,
        'Response': first_response,
        'Perturbed Question': perturbed_qn,
        'Perturbed Response': perturbed_response,
        'Final Analysis Question': final_analysis_qn,
        'Final Analysis Response': final_analysis_response
    }

# Column layout of the results sheet (same as the per-model results).
new_data_columns = ['Model', 'Question', 'Response', 'Perturbed Question', 'Perturbed Response', 'Final Analysis Question', 'Final Analysis Response']

# Process each question. Rows are collected in a list and the DataFrame is built
# once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row is quadratic anyway.
records = []
for prompt in prompts:
    processed_info = process_question_with_gpt4(prompt, ASSISTANT_ID)
    # Fill the 'Model' column, which was declared but never populated.
    records.append({'Model': GPT_MODEL, **processed_info})

results_df = pd.DataFrame(records, columns=new_data_columns)

# Save the results
results_df.to_excel(OUTPUT_CSV_PATH, index=False)
