# This is a notebook do display and analyze the model responses for the various experiments.
*Author: Max Mohr*

In [73]:
import os
import sys
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
from typing import Dict, List
from src.data.db_helpers import Database

# Add backend folder to path
parent_dir = os.path.dirname(
    os.path.realpath("/Users/mAx/Documents/Master/04/Master_Thesis/03_Codebase/src")
)
sys.path.append(parent_dir)


# Initialize the database
db = Database()
db.connect()

[1m[92mSuccessfully connected to database.[0m


(<connection object at 0x3191d3a00; dsn: 'user=postgres password=xxx dbname=mthesisdb host=193.196.52.142 port=5433', closed: 0>,
 <cursor object at 0x3228165c0; closed: 0>,
 Engine(postgresql+psycopg2://postgres:***@193.196.52.142:5433/mthesisdb))

## What are the model responses per bias scenario and how are they distributed?

In [83]:
responses_grouped = db.fetch_data(total_object="v_responses_grouped")
responses_grouped.describe()

Unnamed: 0,experiment_id,bias_id,model_id,count,max_updated_at
count,195.0,195.0,195.0,195.0,195
mean,44833.076923,447.682051,64.871795,34.779487,2024-09-22 12:35:52.617455104
min,10110.0,101.0,10.0,1.0,2024-09-16 20:27:42.648240
25%,20260.0,202.0,50.0,2.0,2024-09-23 14:16:55.741220096
50%,50170.0,501.0,80.0,10.0,2024-09-23 17:58:38.121818112
75%,60280.0,602.0,80.0,77.5,2024-09-23 18:16:57.588279040
max,80280.0,802.0,80.0,127.0,2024-09-23 18:44:17.709565
std,22689.329155,226.874282,21.808893,40.364928,


In [84]:
responses_grouped

Unnamed: 0,experiment_id,bias_id,model_id,response_type,response,count,max_updated_at
0,20170,201,70,choice,B,100,2024-09-16 20:27:42.648240
1,30180,301,80,choice,A,3,2024-09-18 14:27:43.959214
2,70180,701,80,choice,Failed prompt,6,2024-09-23 16:15:09.913320
3,60280,602,80,numerical,45,2,2024-09-23 18:09:07.368811
4,80220,802,20,choice,A,100,2024-09-18 23:34:33.452031
...,...,...,...,...,...,...,...
190,20150,201,50,choice,A,65,2024-09-18 16:02:43.588980
191,50180,501,80,numerical,50,96,2024-09-23 18:42:31.380119
192,60280,602,80,numerical,24,1,2024-09-23 18:09:07.368811
193,70250,702,50,choice,B,90,2024-09-18 16:03:51.721138


In [76]:
bias_dict: Dict[str, List[int]] = {
    "all": [101, 102, 201, 202, 301, 401, 402, 501, 502, 601, 602, 701, 702, 801, 802],
    "anchoring": [601, 602],
    "category size bias": [401, 402],
    "endowment effect": [101, 102],
    "framing effect": [801, 802],
    "gambler's fallacy": [501, 502],
    "loss aversion": [201, 202],
    "sunk cost fallacy": [301],
    "transaction utility": [701, 702],
}

model_dict: Dict[str, List[int]] = {
    "all": [10, 20, 30, 40, 50, 60, 70, 80],
    "gemma2": [10],
    "gemma2:27b": [20],
    "gpt-4o-mini": [30],
    "gpt-4o": [40],
    "llama3.1": [50],
    "llama3.1:70b": [60],
    "phi3.5": [70],
    "phi3:medium": [80],
}

In [90]:
app = Dash(__name__)

app.layout = html.Div(
    [
        html.H2("Response distributions"),
        html.H4("Select bias:"),
        dcc.Dropdown(
            id="bias",
            options=[key for key, _ in bias_dict.items()],
            value="all",
            clearable=False,
        ),
        html.H4("Select model:"),
        dcc.Dropdown(
            id="model",
            options=[key for key, _ in model_dict.items()],
            value="all",
            clearable=False,
        ),
        html.H4("Include failed prompts?"),
        dcc.Dropdown(
            id="failed_prompts",
            options=["Yes", "No"],
            value="Yes",
            clearable=False,
        ),
        dcc.Graph(id="graph"),
    ]
)


@app.callback(
    Output("graph", "figure"),
    Input("bias", "value"),
    Input("model", "value"),
    Input("failed_prompts", "value"),
)
def update_bar_chart(bias: str, model: str, failed_prompts: str):
    mask = (responses_grouped["bias_id"].isin(bias_dict[bias])) & (
        responses_grouped["model_id"].isin(model_dict[model])
    )
    if failed_prompts == "No":
        mask = mask & (responses_grouped["response"] != "Failed prompt")
    fig = px.bar(
        responses_grouped[mask],
        x="bias_id",
        y="count",
        color="response",
        barmode="group",
    )
    return fig


app.run_server(debug=True)

In [78]:
responses = db.fetch_data(total_object="t_responses")
responses[
    (responses["bias_id"] == 601)
    & (responses["model_id"] == 50)
    & (responses["correct_run"] == 0)
]

Unnamed: 0,experiment_id,bias_id,model_id,response_type,response,reason,correct_run,updated_at


## How many correct runs did we achieve on each model per bias scenario?

In [86]:
experiment_runs = db.fetch_data(total_object="v_experiments")
experiment_runs.describe()

Unnamed: 0,experiment_id,bias_id,model_id,part,parts_total,ran_date,correct_ran_loops,total_ran_loops
count,120.0,120.0,120.0,120.0,120.0,63,120.0,120.0
mean,46191.666667,461.466667,45.0,1.466667,1.933333,2024-09-21 07:26:45.948389632,43.833333,56.516667
min,10110.0,101.0,10.0,1.0,1.0,2024-09-16 20:27:42.648240,0.0,0.0
25%,20267.5,202.0,27.5,1.0,2.0,2024-09-18 16:20:03.717024512,0.0,0.0
50%,50145.0,501.0,45.0,1.0,2.0,2024-09-23 14:16:51.609127936,0.0,100.0
75%,70122.5,701.0,62.5,2.0,2.0,2024-09-23 16:15:18.205922304,100.0,100.0
max,80280.0,802.0,80.0,2.0,2.0,2024-09-23 18:44:17.709565,100.0,221.0
std,23423.962748,234.239514,23.00895,0.500979,0.25049,,48.388194,56.453146


In [87]:
# Group by
experiment_runs = (
    experiment_runs.groupby(
        [
            "experiment_id",
            "bias",
            "model",
            "experiment_type",
            "scenario",
            "response_type",
        ]
    )
    .agg({"correct_ran_loops": "sum", "total_ran_loops": "sum"})
    .reset_index()
)

experiment_runs["failed_ran_loops"] = (
    experiment_runs["total_ran_loops"] - experiment_runs["correct_ran_loops"]
)
experiment_runs

Unnamed: 0,experiment_id,bias,model,experiment_type,scenario,response_type,correct_ran_loops,total_ran_loops,failed_ran_loops
0,10110,endowment effect,gemma2,multi-scenario,0_normal,numerical,0,100,100
1,10120,endowment effect,gemma2:27b,multi-scenario,0_normal,numerical,0,0,0
2,10130,endowment effect,gpt-4o-mini,multi-scenario,0_normal,numerical,0,0,0
3,10140,endowment effect,gpt-4o,multi-scenario,0_normal,numerical,0,0,0
4,10150,endowment effect,llama3.1,multi-scenario,0_normal,numerical,100,100,0
...,...,...,...,...,...,...,...,...,...
115,80240,framing effect,gpt-4o,multi-scenario,0_normal,choice,0,0,0
116,80250,framing effect,llama3.1,multi-scenario,0_normal,choice,100,100,0
117,80260,framing effect,llama3.1:70b,multi-scenario,0_normal,choice,0,0,0
118,80270,framing effect,phi3.5,multi-scenario,0_normal,choice,100,100,0


In [88]:
app = Dash(__name__)

app.layout = html.Div(
    [
        html.H2("Response distributions"),
        html.H4("Select model:"),
        dcc.Dropdown(
            id="model",
            options=[key for key, _ in model_dict.items()],
            value="all",
            clearable=False,
        ),
        dcc.Graph(id="graph"),
    ]
)


@app.callback(Output("graph", "figure"), Input("model", "value"))
def update_bar_chart_2(model: str):
    mask = experiment_runs["model"] == model
    fig = px.bar(
        experiment_runs[mask],
        x="bias",
        y=["correct_ran_loops", "failed_ran_loops"],
    )
    return fig


app.run_server(debug=True)