In [None]:
!pip install together
from together import Together



In [None]:
import os
import sys
sys.path.append('/content/drive/MyDrive/CS 159')

from utils import extract_answer, evaluate_file, evaluate_across_files

Mounted at /content/drive


In [None]:
import re
import glob
import requests
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
from tqdm.notebook import tqdm


# Root folder of the SpatialEvalLLM raw data on the mounted Drive. Keep the
# trailing slash: file paths are built from it by plain string concatenation.
data_path = "/content/drive/MyDrive/CS 159/data/raw/SpatialEvalLLM/"

In [None]:
class BaseModelClient:
    """Thin wrapper around a Together chat-completions client bound to one model."""

    def __init__(self, api_key, model_path):
        """Create the Together client and remember which model to query.

        Args:
            api_key: Key passed to ``Together(api_key=...)``.
            model_path: Model identifier sent as ``model`` on every request.
        """
        self.model_path = model_path
        self.client = Together(api_key=api_key)

    def predict(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Text placed in the ``content`` field of the user message.

        Returns:
            The ``message.content`` of the first choice in the response.
        """
        user_turn = {"role": "user", "content": prompt}
        completion = self.client.chat.completions.create(
            model=self.model_path,
            messages=[user_turn],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content

class Llama8b(BaseModelClient):
    """Client preconfigured for the ``meta-llama/Llama-3-8b-chat-hf`` model."""

    def __init__(self, api_key):
        """Bind the client to the 8B chat model using ``api_key``."""
        super().__init__(api_key=api_key, model_path="meta-llama/Llama-3-8b-chat-hf")

class Llama70b(BaseModelClient):
    """Client preconfigured for the ``meta-llama/Llama-3-70b-chat-hf`` model."""

    def __init__(self, api_key):
        """Bind the client to the 70B chat model using ``api_key``."""
        super().__init__(api_key=api_key, model_path="meta-llama/Llama-3-70b-chat-hf")

# Together API key. Prefer the TOGETHER_API_KEY environment variable so the
# secret does not have to be stored in the notebook; the literal below is only
# a fallback used when the variable is not set.
api_key = os.environ.get("TOGETHER_API_KEY", "--redacted--")


In [None]:
import glob
import pandas as pd
import re
import concurrent.futures
from tqdm.notebook import tqdm
from pathlib import Path


def process_model_evaluation(model_name, model_instance, data_path, leading_prompt, trailing_prompt, suffix):
    """Evaluate one model/prompt configuration and write the scores to a CSV on Drive.

    Args:
        model_name: Label used in log lines and in the output file name.
        model_instance: Model client forwarded to ``evaluate_across_files`` as ``model``.
        data_path: Dataset location forwarded to ``evaluate_across_files``.
        leading_prompt: Forwarded unchanged to ``evaluate_across_files``.
        trailing_prompt: Forwarded unchanged; its first 30 characters are logged.
        suffix: Forwarded to ``evaluate_across_files`` and used in the output file name.

    Returns:
        None. The result of ``evaluate_across_files`` is written with ``to_csv``
        (presumably a pandas DataFrame -- confirm against ``utils``).
    """
    print(f"Starting evaluation for {model_name} with trailing prompt: {trailing_prompt[:30]}...")

    evaluation_kwargs = dict(
        model=model_instance,
        leading_prompt=leading_prompt,
        trailing_prompt=trailing_prompt,
        data_path=data_path,
        suffix=suffix,
        use_code=False,
        save=True,
    )
    results = evaluate_across_files(**evaluation_kwargs)

    # One summary file per (model, suffix) pair, saved next to the project folder root.
    output_csv = f"/content/drive/MyDrive/CS 159/{model_name}_evaluation_{suffix}.csv"
    results.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

# Model clients to evaluate; the keys are used in log lines and output file names.
models = {
    "Llama70b": Llama70b(api_key),
    "Llama8b": Llama8b(api_key)
}


leading_prompt = ""
suffixes = ["indexing"] #"baseline", "zero_shot_cot", "zero_shot_vot"]

trailing_prompts = [
    "\n\nAssign a numerical index to each element on the path and use this system to track your position.\n\nPlace the answer, in lower case, with asterisks before and after like *this*. No asterisks anywhere else in the response.",
    # "\n\nExplain your reasoning step-by-step.\n\nPlace the answer, in lower case, with asterisks before and after like *this*. No asterisks anywhere else in the response.",
    # "\n\nVisualize the state of the map after each step.\n\nVisualize the state after each step.\n\nPlace the answer, in lower case, with asterisks before and after like *this*. No asterisks anywhere else in the response."
]

# trailing_prompts and suffixes are paired positionally by zip() below, which
# silently drops the extras when one list is longer; fail loudly instead.
if len(trailing_prompts) != len(suffixes):
    raise ValueError(
        f"trailing_prompts has {len(trailing_prompts)} entries but suffixes has "
        f"{len(suffixes)}; they must have the same length"
    )

# One worker per (model, prompt) job. max(1, ...) because ThreadPoolExecutor
# rejects max_workers=0 when either collection is empty.
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, len(models) * len(suffixes))) as executor:
    job_labels = {
        executor.submit(process_model_evaluation, model_name, model_instance, data_path, leading_prompt, trailing_prompt, suffix): (model_name, suffix)
        for model_name, model_instance in models.items()
        for trailing_prompt, suffix in zip(trailing_prompts, suffixes)
    }
    futures = list(job_labels)

    # wait() alone never re-raises exceptions from the workers, so a failed job
    # would go unnoticed. Calling result() on each future surfaces them.
    failed_jobs = []
    for finished in concurrent.futures.as_completed(futures):
        try:
            finished.result()
        except Exception as exc:
            failed_jobs.append(job_labels[finished])
            print(f"Evaluation failed for {job_labels[finished]}: {exc!r}")

if failed_jobs:
    raise RuntimeError(f"{len(failed_jobs)} evaluation job(s) failed: {failed_jobs}")

print("All evaluations are complete.")


In [None]:
# internal representations
# Compare how different textual encodings of the 3x3 map state, requested in the
# trailing prompt, affect each model on one fixed dataset file.
models = {
    "Llama70b": Llama70b(api_key),
    "Llama8b": Llama8b(api_key)
}
leading_prompt = ""

# Trailing prompts keyed by the label used as the evaluation suffix and in the
# output file name. The strings are sent to the model verbatim.
prompts = {
    'base': '\nPresent reasoning step-by-step. Place the answer, in lower case, with asterisks before and after like *this*. No asterisks anywhere else in the response.',
    'grid': '''\nPresent reasoning step-by-step.\nAt each step, represent the current state (including your current location) as
+-----------+-----------+---------+
| Object 1 | Object 2 | Object 3 |
+-----------+-----------+-----------+
| Object 4 | Object 5* | Object 6 |
+----------+-----------+-----------+
| Object 7 | Object 8 | Object 9 |
+----------+-----------+----------+
where the asterisk * corresponds to the current location.
Place the answer, in lower case, with asterisks before and after like *this*. No asterisks anywhere else in the response.''',
    'csv': '''\nPresent reasoning step-by-step.\nAt each step, represent the current state (including your current location) as
Object 1, Object 2, Object 3
Object 4, Object 5*, Object 6
Object 7, Object 8, Object 9
where the asterisk * corresponds to the current location.
Place the answer, in lower case, with asterisks before and after like *this*. No asterisks anywhere else in the response.''',
    'coord': '''\nPresent reasoning step-by-step.\nAt each step, represent the current state (including your current location) as
(1, 1): Object 1, (1, 2): Object 2, (1, 3): Object 3, (2, 1): Object 4, (2, 2): Object 5*, (2, 3): Object 6, (3, 1): Object 7, (3, 2): Object 8, (3, 3): Object 9
where the asterisk * corresponds to the current location.
Place the answer, in lower case, with asterisks before and after like *this*. No asterisks anywhere else in the response.''',
    'colbycol': '''\nPresent reasoning step-by-step.\nAt each step, represent the current state (including your current location) as
Object 1, Object 4, Object 7
Object 2, Object 5*, Object 8
Object 3, Object 6, Object 9
where the asterisk * corresponds to the current location.
Place the answer, in lower case, with asterisks before and after like *this*. No asterisks anywhere else in the response.''',
}

# The dataset file and output folder do not change between iterations.
dataset_stem = 'type-square_size-3_steps-4_seed-3_n-100'
eval_file = data_path + 'map_global/' + dataset_stem + '.jsonl'
output_dir = '/content/drive/MyDrive/CS 159/internal_representations'

# Create the output folder up front so a missing directory cannot make to_csv
# fail after an entire evaluation run has already completed.
os.makedirs(output_dir, exist_ok=True)

for model_name, model_instance in models.items():
    for label, prompt in prompts.items():
        save_path = f'{output_dir}/{dataset_stem}_{model_name}_{label}.csv'
        print('Running', save_path)
        # Run the evaluation for this (model, prompt format) configuration.
        df = evaluate_file(
            eval_file,
            model_instance,
            leading_prompt,
            prompt,
            use_code=False,
            suffix=label,
            pbar=tqdm(),
            debug=False
        )
        df.to_csv(save_path)
