Import packages

In [43]:
import json
import wandb

from dotenv import load_dotenv
from pathlib import Path
from tqdm.auto import tqdm
from utils.commmon import build_dir_name

Specify the dataset, model, and run configurations.

In [44]:
# Run settings. Every value in this cell is copied into `run_config` and
# attached to the W&B run and artifacts as metadata.

# Used to locate the dataset directory and the per-model JSON files.
dataset = "multimedia"
llm = "meta-llama_Llama-3.3-70B-Instruct-Turbo"

# Passed to build_dir_name() to derive the predictions/metrics directory names.
use_demos = 2
reformat_by = "self"

# The remaining settings are only recorded as metadata by this notebook.
# NOTE(review): presumably they mirror the settings of the run that produced
# the predictions -- confirm against that run before uploading.
api_addr = "api.together.xyz"
api_port = 443
multiworker = 1
wait_time = 3.0

temperature = 0.2
top_p = 0.1
seed = 42
fraction = 1.0

reformat = True
tag = True
dependency_type = "resource"
log_first_detail = True

Build the path to the predictions file based on the run configurations.

In [45]:
# Name of the dataset's directory; it is looked up one level above the
# notebook's working directory when the paths below are built.
data_dir = f"data_{dataset}"

# The predictions directory name is derived from the run options by the
# project helper, so the options are collected first and passed as keywords.
dir_name_options = {
    "base_name": "predictions",
    "use_demos": use_demos,
    "reformat_by": reformat_by,
}
predictions_dir = build_dir_name(**dir_name_options)

# One JSON file per model inside the predictions directory.
predictions_path = Path(f"../{data_dir}/{predictions_dir}/{llm}.json")
print(f"Predictions path: {predictions_path}")

Predictions path: ..\data_multimedia\predictions_use_demos_2_reformat_by_self\meta-llama_Llama-3.3-70B-Instruct-Turbo.json


Read the dataset's user requests file to count the number of user requests. 

In [46]:
user_requests_path = Path(f"../{data_dir}/user_requests.json")

# The file is read line by line and every non-blank line is counted as one
# user request. Nothing is parsed as JSON here, so only a missing file is
# handled explicitly.
# `lines` is bound up front so the count below is well-defined even when the
# file cannot be opened (previously that path ended in a NameError).
lines = []
try:
    with open(user_requests_path, "r") as f:
        lines = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print(f"File not found: {user_requests_path}")

num_user_requests = len(lines)
print(f"Number of user requests: {num_user_requests}")

Number of user requests: 5584


Load the predictions from the JSON file. 
- The number of predictions should be the same as the number of user requests.
- If this isn't the case, then that means some of the API calls failed to generate predictions.

In [47]:
predictions = []

# Progress-bar settings; the expected total is the number of user requests
# counted in the previous cell.
kwargs = {
    "desc": "Loading predictions",
    "unit": "prediction",
    "total": num_user_requests,
}

# The predictions file holds one JSON document per line. Blank lines are
# skipped; lines that fail to parse are reported with their 1-based line
# number and otherwise ignored.
with open(predictions_path, "r") as f:
    for i, line in tqdm(enumerate(f, 1), **kwargs):
        line = line.strip()
        if not line:
            continue
        try:
            predictions.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Error parsing line {i}: {e}")

num_predictions = len(predictions)
print(f"Loaded {num_predictions} predictions")

if num_predictions != num_user_requests:
    print(
        f"Number of predictions: {num_predictions} does not match number "
        f"of user requests: {num_user_requests}"
    )
    difference = num_user_requests - num_predictions
    if difference > 0:
        # difference > 0 implies num_user_requests > 0, so the division
        # below cannot hit a zero denominator.
        percentage = difference / num_user_requests * 100
        print(f"There are {difference} user requests that do not have predictions")
        print(f"This is {percentage:.2f}% of the user requests")
    else:
        # More predictions than requests: report the surplus instead of a
        # negative number of "missing" predictions.
        print(f"There are {-difference} more predictions than user requests")

Loading predictions:   0%|          | 0/5584 [00:00<?, ?prediction/s]

Loaded 5571 predictions
Number of predictions: 5571 does not match number of user requests: 5584
There are 13 user requests that do not have predictions
This is 0.23% of the user requests


Initialize the [W&B](https://wandb.ai/mpgee-usc/Eval%20Project?nw=nwusermpgee) configurations.

In [48]:
# Snapshot of the run settings; used as the W&B run config and as the
# metadata of both artifacts (the same dict object is shared by all three).
run_config = dict(
    dataset=dataset,
    temperature=temperature,
    top_p=top_p,
    api_addr=api_addr,
    api_port=api_port,
    multiworker=multiworker,
    llm=llm,
    use_demos=use_demos,
    reformat=reformat,
    reformat_by=reformat_by,
    tag=tag,
    dependency_type=dependency_type,
    log_first_detail=log_first_detail,
    fraction=fraction,
    seed=seed,
    wait_time=wait_time,
)

# Keyword arguments for wandb.init(): how the upload run is named, tagged
# and grouped.
init_kwargs = dict(
    name=f"upload_{dataset}_{llm}_artifacts",
    tags=[f"dataset={dataset}", f"llm={llm}"],
    group="sae_stats",
    notes=f"Uploading artifacts for {dataset} with {llm}",
    job_type="artifact_upload",
    mode="online",
)

# Keyword arguments for the wandb.Artifact that carries the predictions file.
predictions_kwargs = dict(
    name=f"{llm}_{dataset}_predictions",
    type="predictions",
    description=f"{llm} predictions for TaskBench's {dataset.capitalize()} dataset",
    metadata=run_config,
)

# Keyword arguments for the wandb.Artifact that carries the metrics file.
metrics_kwargs = dict(
    name=f"{llm}_{dataset}_metrics",
    type="metrics",
    description=f"{llm} evaluation metrics for TaskBench's {dataset.capitalize()} dataset",
    metadata=run_config,
)

Upload the predictions and metrics artifacts to W&B.

In [None]:
# Load variables from a local .env file into the environment before W&B starts.
# NOTE(review): presumably this is where the W&B credentials come from -- confirm.
load_dotenv()

# Resolve the metrics file before starting the run; it sits next to the
# predictions file, in a directory named by the same run options.
metrics_dir = build_dir_name(
    base_name="metrics",
    use_demos=use_demos,
    reformat_by=reformat_by,
)
metrics_path = Path(f"../{data_dir}/{metrics_dir}/{llm}.json")

# Using the run as a context manager guarantees it is finished even if
# adding or logging an artifact raises (e.g. a missing file); previously a
# failure skipped run.finish() and left the run open in the kernel.
with wandb.init(**init_kwargs, config=run_config) as run:
    predictions_artifact = wandb.Artifact(**predictions_kwargs)
    metrics_artifact = wandb.Artifact(**metrics_kwargs)

    # add_file() documents its path argument as a string.
    predictions_artifact.add_file(str(predictions_path))
    metrics_artifact.add_file(str(metrics_path))

    run.log_artifact(predictions_artifact)
    run.log_artifact(metrics_artifact)

    # TODO: Change this so it logs the overall metrics to the summary

[34m[1mwandb[0m: Currently logged in as: [33mmpgee[0m ([33mmpgee-usc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.
