In [1]:
import wandb
import pandas as pd
import numpy as np
import yaml


# W&B connection settings, keyed by source name. fetch_data() reads the
# "entity", "project", "base_url" and "api_key" entries of the selected source.
# NOTE(review): yaml.FullLoader is kept unchanged; if wandb.cfg only contains
# plain mappings and scalars, yaml.safe_load would be the stricter choice - confirm.
with open("wandb.cfg", mode="r") as config_file:
    WANDB_CONFIG = yaml.load(config_file, Loader=yaml.FullLoader)

# Per-game metadata table: an "EnvName" column plus the game category columns.
GAME_INFOS = pd.read_csv(filepath_or_buffer="game_infos.csv")

def fetch_data(framework, src="cloud"):
    """Fetch the finished W&B runs of one framework and annotate them with game metadata.

    Args:
        framework: Value matched against each run's ``config.framework``.
        src: Key of the ``WANDB_CONFIG`` entry to connect with. That entry supplies
            ``entity``, ``project``, ``base_url`` and ``api_key``.

    Returns:
        A ``pandas.DataFrame`` with one row per run, sorted by ``game``. The columns are
        the run's summary values overlaid with its config values (the config wins on a
        name clash), plus one column per game category looked up in ``GAME_INFOS``.
        Games that are missing from ``GAME_INFOS`` are reported on stdout and get NaN in
        every category column. When no run provides a ``game`` value (e.g. no run matched
        the filters), the frame is returned as-is, without category columns.

    Raises:
        KeyError: If ``src`` is not in ``WANDB_CONFIG``, if the selected entry lacks one
            of the required keys, or if ``GAME_INFOS`` lacks a category column.
    """
    wandb_config = WANDB_CONFIG[src]
    entity = wandb_config["entity"]
    project = wandb_config["project"]
    api = wandb.Api(overrides={"base_url": wandb_config["base_url"]}, api_key=wandb_config["api_key"])
    runs = api.runs(
        f"{entity}/{project}",
        filters={
            "state": "finished",
            "config.framework": framework,
        }
    )

    # Convert the W&B runs to a pandas DataFrame (one record per run).
    runs_data = []
    for run in runs:
        run_data = dict(run.summary)
        run_data.update(run.config)
        runs_data.append(run_data)

    runs_df = pd.DataFrame(runs_data)

    # Nothing to sort or annotate when no run carries a "game" value.
    if "game" not in runs_df.columns:
        print(f"No run with a 'game' entry found for framework {framework!r}")
        return runs_df

    runs_df = runs_df.sort_values(by="game")

    # GAME_INFOS columns copied onto every run of the matching game.
    categories = [
        "Stochastic", "Dialog", "Darkness", "Nonstandard Action", "Inventory", "Maze",
        "Trivia", "Self-Maintenance", "Combat", "Time", "Mechanical Puzzle", "Persistence",
        "Difficulty",
    ]

    # Keep the first GAME_INFOS row of each environment so the lookup index is unique.
    game_infos = GAME_INFOS.drop_duplicates(subset="EnvName").set_index("EnvName")

    # Report unknown games instead of failing on them; their category values stay NaN.
    for game in runs_df["game"].unique():
        if game not in game_infos.index:
            print(f"Can't find {game} in game_infos.csv")

    for col in categories:
        runs_df[col] = runs_df["game"].map(game_infos[col])

    return runs_df


In [2]:
# Framework to analyse. Exactly one assignment is active; the alternatives stay commented
# out so that no value is assigned only to be overwritten on the next line.
#framework = "scienceworld"
#framework = "textworld_express"
#framework = "alfworld"
#framework = "jericho"
framework = "textworld"

# Fetch every finished run of that framework from the "pearls" W&B source.
data = fetch_data(framework, src="pearls")
print(len(data))

1671


In [3]:
import plotly.express as px

# Keep only the LLM-agent runs played in conversation mode (the llama-31-70b-instruct
# exclusion is not applied in this cell). Rows with a missing "agent" or "conversation"
# value are excluded instead of breaking the boolean mask, and `.copy()` makes `tmp` an
# independent frame so adding a column below does not raise SettingWithCopyWarning.
is_llm_agent = data["agent"].str.contains("LLMAgent", na=False)
is_conversation = data["conversation"].eq(True)
tmp = data[is_llm_agent & is_conversation].copy()

# Short display name of each model; literal (non-regex) replacement on every pandas version.
tmp["llm_short"] = (
    tmp["llm"]
    .str.replace("meta-llama/Llama-", "", regex=False)
    .str.replace("-Instruct", "", regex=False)
)

# Compute the average score and its standard error for each llm.
mean_stderror = tmp.groupby(["llm", "llm_short"])["final/Normalized Score"].agg(["mean", "sem"]).reset_index()

# Sort the llm groups by a predefined order; models not listed are appended alphabetically.
order = ["claude-3.5-sonnet-latest", "claude-3.5-sonnet", "claude-3.5-haiku", "gpt-4o", "gpt-4o-mini", "3.3-70B", "3.1-405B", "3.1-70B", "3.1-8B", "3.2-3B", "3.2-1B"]
order += sorted(set(tmp["llm_short"].dropna().unique().tolist()) - set(order))
mean_stderror["llm_short"] = pd.Categorical(mean_stderror["llm_short"], categories=order, ordered=True)
mean_stderror = mean_stderror.sort_values("llm_short", ascending=False)

# Display one bar per llm using plotly.
fig = px.bar(mean_stderror, x="llm_short", y="mean", color="llm", title=framework.title(), error_y="sem", width=1000)
fig.update_traces(width=0.4, textposition='outside')

# Axis titles, and a fixed y range so that figures are comparable across frameworks.
fig.update_yaxes(title_text="Avg. Score", range=[0, 1.02])
fig.update_xaxes(title_text="Model Size")

# The legend is positioned at the top but hidden (its title is cleared as well).
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="top",
        y=1.02,
        xanchor="right",
        x=1
    ),
    showlegend=False,
    legend_title_text='',
)

fig.show()

# Save the figure to png.
fig.write_image(f"{framework}_llm.png")


In [5]:
import plotly.express as px

# Filter out the llama-31-70b-instruct runs and keep the conversation-mode runs only.
# Missing "llm"/"conversation" values no longer break the boolean mask, and `.copy()`
# makes `tmp` an independent frame so adding columns below does not raise
# SettingWithCopyWarning.
is_excluded_llm = data["llm"].str.contains("llama-31-70b-instruct", na=False)
is_conversation = data["conversation"].eq(True)
tmp = data[~is_excluded_llm & is_conversation].copy()

# Create column agent_type: "zero-shot" when "LLMAgent" is in the agent name, "CoT" for
# every other agent.
tmp["agent_type"] = np.where(tmp["agent"].str.contains("LLMAgent", na=False), "zero-shot", "CoT")

# Short display name of each model; literal (non-regex) replacement on every pandas version.
tmp["llm_short"] = (
    tmp["llm"]
    .str.replace("meta-llama/Llama-", "", regex=False)
    .str.replace("-Instruct", "", regex=False)
)

# Compute the average score and its standard error for each (agent type, llm) pair.
mean_stderror = tmp.groupby(["agent_type", "llm", "llm_short"])["final/Normalized Score"].agg(["mean", "sem"]).reset_index()

# Sort the llm groups by a predefined order; models not listed are appended alphabetically.
order = ["3.1-405B", "3.1-70B", "3.1-8B", "3.2-3B", "3.2-1B"]
order += sorted(set(tmp["llm_short"].dropna().unique().tolist()) - set(order))
mean_stderror["llm_short"] = pd.Categorical(mean_stderror["llm_short"], categories=order, ordered=True)
mean_stderror = mean_stderror.sort_values("llm_short", ascending=False)

# Display a grouped bar chart (one group per agent type, one bar per llm) using plotly.
# The title follows the selected framework, like the per-llm chart does.
fig = px.bar(mean_stderror, x="agent_type", y="mean", color="llm", barmode="group", title=framework.title(), error_y="sem", text="llm_short", width=1000)

# Axis titles, and a fixed y range so that figures are comparable across frameworks.
fig.update_yaxes(title_text="Avg. Score", range=[0, 1.02])
fig.update_xaxes(title_text="Agent type")

# Put the legend at the top and hide its title.
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="top",
        y=1.02,
        xanchor="right",
        x=1
    ),
    legend_title_text='',
)

fig.update_traces(textposition='outside')
fig.show()

# Save the figure to png, named after the selected framework.
fig.write_image(f"{framework}_llm_agent_type.png")


In [30]:
import plotly.express as px

# Filter out the llama-31-70b-instruct runs and keep only the LLM-agent runs.
# Missing "llm"/"agent" values no longer break the boolean mask, and `.copy()` makes
# `tmp` an independent frame so adding a column below does not raise
# SettingWithCopyWarning.
is_excluded_llm = data["llm"].str.contains("llama-31-70b-instruct", na=False)
is_llm_agent = data["agent"].str.contains("LLMAgent", na=False)
tmp = data[~is_excluded_llm & is_llm_agent].copy()

# Short display name of each model; literal (non-regex) replacement on every pandas version.
tmp["llm_short"] = (
    tmp["llm"]
    .str.replace("meta-llama/Llama-", "", regex=False)
    .str.replace("-Instruct", "", regex=False)
)

# Compute the average score and its standard error for each (conversation, llm) pair.
mean_stderror = tmp.groupby(["conversation", "llm", "llm_short"])["final/Normalized Score"].agg(["mean", "sem"]).reset_index()

# Sort the llm groups by a predefined order; models not listed are appended alphabetically.
order = ["3.1-405B", "3.1-70B", "3.1-8B", "3.2-3B", "3.2-1B"]
order += sorted(set(tmp["llm_short"].dropna().unique().tolist()) - set(order))
mean_stderror["llm_short"] = pd.Categorical(mean_stderror["llm_short"], categories=order, ordered=True)
mean_stderror = mean_stderror.sort_values("llm_short", ascending=False)

# Display a grouped bar chart (one group per conversation mode, one bar per llm) using plotly.
# The title follows the selected framework, like the per-llm chart does.
fig = px.bar(mean_stderror, x="conversation", y="mean", color="llm", barmode="group", title=framework.title(), error_y="sem", text="llm_short", width=1000)

# Axis titles, and a fixed y range so that figures are comparable across frameworks.
fig.update_yaxes(title_text="Avg. Score", range=[0, 1.02])
fig.update_xaxes(title_text="Conversation Mode?")

# Put the legend at the top and hide its title.
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="top",
        y=1.02,
        xanchor="right",
        x=1
    ),
    legend_title_text='',
)

fig.update_traces(textposition='outside')
fig.show()

# Save the figure to png, named after the selected framework.
fig.write_image(f"{framework}_llm_conversation.png")




In [31]:
# Create a group bar chart with the average score per game for each model
import os
import plotly.express as px
import plotly.graph_objects as go


def make_grouped_bar_chart(df, groupby="agent", title=None):
    """Build a horizontal grouped bar chart of the mean normalized score per game.

    Args:
        df: Frame with one row per run. Must contain the columns ``game``,
            ``Difficulty``, ``final/Normalized Score`` and the column named by
            ``groupby``. It is not modified.
        groupby: Column whose values become the coloured bars within each game.
        title: Figure title. When omitted, it is built from the ``framework`` column if
            that column exists and holds a single value; otherwise the historical
            "ScienceWorld - Normalized Score" title is used.

    Returns:
        The plotly figure. Bars show the mean score with the standard error as error
        bars; games are ordered by difficulty (then by name) and their tick labels are
        coloured green / orange / red for Possible / Difficult / Extreme.

    Raises:
        ValueError: If an aggregated row has a difficulty other than "Possible",
            "Difficult" or "Extreme".
        KeyError: If a required column is missing from ``df``.
    """
    df = df.copy()

    # Strip the prefix shared by all game names. Only the leading characters are removed
    # (never a later occurrence), and the step is skipped when it would leave a name
    # empty, e.g. when the frame holds a single game.
    game_names = list(df["game"].dropna().unique())
    common_prefix = os.path.commonprefix(game_names)
    if common_prefix and all(len(name) > len(common_prefix) for name in game_names):
        df["game"] = df["game"].str[len(common_prefix):]

    if title is None:
        # NOTE(review): assumes the run config exposes the framework name in a
        # "framework" column - confirm against the fetched runs.
        frameworks = df["framework"].dropna().unique() if "framework" in df.columns else []
        if len(frameworks) == 1:
            title = f"{str(frameworks[0]).title()} - Normalized Score"
        else:
            title = "ScienceWorld - Normalized Score"

    # Aggregate per game and per `groupby` value, so the colour column always exists in
    # the plotted frame whatever column the caller groups by.
    keys = list(dict.fromkeys(["game", "Difficulty", groupby]))
    stats = df.groupby(keys)["final/Normalized Score"].agg(["mean", "sem"]).reset_index()

    difficulty_rank = {"Possible": 2, "Difficult": 1, "Extreme": 0}
    difficulty_color = {"Possible": "green", "Difficult": "orange", "Extreme": "red"}
    if not set(stats["Difficulty"].unique()) <= set(difficulty_rank):
        raise ValueError("Unknown difficulty")

    # Order the rows by difficulty after aggregating: groupby() sorts by its keys, so an
    # ordering applied before it would be lost.
    stats["order"] = stats["Difficulty"].map(difficulty_rank)
    stats = stats.sort_values(by=list(dict.fromkeys(["order", "game", groupby])), ascending=True)

    # Tick label of each game, coloured by the game difficulty.
    stats["color"] = [
        f"<span style='color:{difficulty_color[diff]}'>{game}</span>"
        for game, diff in zip(stats["game"], stats["Difficulty"])
    ]

    fig = px.bar(stats, x="mean", y="game", color=groupby, barmode="group", title=title, error_x="sem", height=2000)

    # Pin the y axis to the difficulty ordering and swap in the coloured labels.
    game_order = list(dict.fromkeys(stats["game"]))
    label_of = dict(zip(stats["game"], stats["color"]))
    fig.update_yaxes(
        categoryorder="array",
        categoryarray=game_order,
        tickmode="array",
        tickvals=game_order,
        ticktext=[label_of[game] for game in game_order],
    )

    # Make x axis range from 0 to 1.
    fig.update_xaxes(range=[0, 1.02])
    return fig


# Work-in-progress

In [15]:
# Fetch the finished "jericho" runs from the default ("cloud") W&B source and plot the
# per-game scores, grouped by the default "agent" column.
df = fetch_data("jericho")
fig = make_grouped_bar_chart(df)
fig.show()

In [16]:
# Fetch the finished "textworld" runs from the default ("cloud") W&B source and plot the
# per-game scores, grouped by the default "agent" column.
df = fetch_data("textworld")
fig = make_grouped_bar_chart(df)
fig.show()

In [16]:
# Reuses the `data` frame fetched earlier in the notebook.

# Filter out the llama-31-70b-instruct runs; keep only LLM-agent runs in conversation mode.
# Missing "llm"/"agent"/"conversation" values no longer break the boolean mask, and
# `.copy()` makes `tmp` an independent frame so adding a column below does not raise
# SettingWithCopyWarning.
is_excluded_llm = data["llm"].str.contains("llama-31-70b-instruct", na=False)
is_llm_agent = data["agent"].str.contains("LLMAgent", na=False)
is_conversation = data["conversation"].eq(True)
tmp = data[~is_excluded_llm & is_llm_agent & is_conversation].copy()

# Short display name of each model: drop the vendor prefixes and the "-Instruct" suffix,
# using literal (non-regex) replacement on every pandas version.
llm_short = tmp["llm"]
for fragment in ("meta-llama/", "mistralai/", "Qwen/", "-Instruct"):
    llm_short = llm_short.str.replace(fragment, "", regex=False)
tmp["llm_short"] = llm_short

# Per-game chart with one bar per model.
fig = make_grouped_bar_chart(tmp, groupby="llm_short")
fig.show()