# finRAG Study Results
By Thomas Flassbeck ([twitter](https://twitter.com/flashback_t) | [linkedin](https://www.linkedin.com/in/thomas-flassbeck-90aa72104/)) from Parsee.ai.

Last updated on May 15th, 2024, to add the GPT-4o ("Omni") results.

In [1]:
import os
import pandas as pd
from typing import *
import warnings
# Ignore all warnings raised from this point on in the session.
warnings.filterwarnings('ignore')

# Star import: the *_config helpers and make_df used in the cells below are not defined in this
# notebook, so they presumably come from this module — verify against run_evaluation.
from run_evaluation import *

In [2]:
# Run-wide configuration (v2 of the study).
# Token budget passed to every model config below, unless a cell overrides it for a single model.
token_limit = 8_000
# ID of the Parsee.ai template used for this run; it can be viewed (free log-in required) at
# https://app.parsee.ai/template/662a37cb080aaf6db5499923
template_id = "662a37cb080aaf6db5499923"

# RAG-text Dataset

Note: Snowflake Arctic is excluded from the RAG-text datasets because it was not able to return results consistently, even at a reduced context length of 3.5k tokens.

In [3]:
# Models evaluated on the RAG-text dataset, all configured with the shared token_limit.
# NOTE(review): the leading "n/a" / None argument looks like an API-key placeholder — confirm in run_evaluation.
# Note the argument order: gpt_config takes token_limit second, the other helpers take it last.
models = [
    anthropic_config("n/a", "claude-3-opus-20240229", token_limit),
    gpt_config("n/a", token_limit, "gpt-4-1106-preview"),
    gpt_config("n/a", token_limit, "gpt-4o"),
    replicate_config("n/a", "meta/meta-llama-3-70b-instruct", token_limit),
    together_config("n/a", "mistralai/Mixtral-8x22B-Instruct-v0.1", token_limit),
    mistral_api_config(None, "mistral-large-latest", token_limit),
    together_config("n/a", "databricks/dbrx-instruct", token_limit),
    cohere_config("n/a", "command-r-plus", token_limit),
]

In [4]:
# Build the results frame for the RAG-text answers, then show the overall score and the
# per-dataset "<name>_total" columns, best overall score first.
df, all_dataset_names = make_df(
    template_id,
    "../data/v2/answers_150524/rag-text",
    models,
)
(
    df[['model', 'total', *(f"{name}_total" for name in all_dataset_names)]]
    .sort_values('total', ascending=False)
)

Unnamed: 0,model,total,rag_text_100_rev_meta_total,rag_text_100_rev23_meta_total,rag_text_100_rev22_meta_total
2,gpt-4o,0.953037,0.89661,0.985,0.9775
0,claude-3-opus-20240229,0.871469,0.914407,0.9775,0.7225
1,gpt-4-1106-preview,0.827754,0.595763,0.9625,0.925
5,mistral-large-latest,0.765508,0.491525,0.955,0.85
7,command-r-plus,0.669364,0.435593,0.9025,0.67
3,meta/meta-llama-3-70b-instruct,0.516568,0.132203,0.6575,0.76
4,mistralai/Mixtral-8x22B-Instruct-v0.1,0.469195,0.255085,0.59,0.5625
6,databricks/dbrx-instruct,0.247175,0.191525,0.315,0.235


# Selection-text Dataset

In [5]:
# Models evaluated on the selection-text dataset.
# Every model keeps the 8k token limit except Snowflake Arctic, which has a max. context of
# 4k tokens; it is set to 3k here because it was throwing many errors at 4k context before.
# The limit is not really relevant for this dataset anyway: all prompts contain at most ~3k tokens.
models = [
    anthropic_config("n/a", "claude-3-opus-20240229", token_limit),
    gpt_config("n/a", token_limit, "gpt-4-1106-preview"),
    gpt_config("n/a", token_limit, "gpt-4o"),
    replicate_config("n/a", "meta/meta-llama-3-70b-instruct", token_limit),
    together_config("n/a", "mistralai/Mixtral-8x22B-Instruct-v0.1", token_limit),
    mistral_api_config(None, "mistral-large-latest", token_limit),
    together_config("n/a", "databricks/dbrx-instruct", token_limit),
    cohere_config("n/a", "command-r-plus", token_limit),
    # Reduced context for Snowflake Arctic (see the note above this list).
    together_config("n/a", "Snowflake/snowflake-arctic-instruct", 3000),
]

In [6]:
# Build the results frame for the selection-text answers, then show the overall score and the
# per-dataset "<name>_total" columns, best overall score first.
df, all_dataset_names = make_df(
    template_id,
    "../data/v2/answers_150524/selection-text",
    models,
)
(
    df[['model', 'total', *(f"{name}_total" for name in all_dataset_names)]]
    .sort_values('total', ascending=False)
)

Unnamed: 0,model,total,selection_text_100_rev23_meta_total,selection_text_100_rev_meta_total,selection_text_100_rev22_meta_total
2,gpt-4o,0.995523,1.0,0.994068,0.9925
0,claude-3-opus-20240229,0.973023,0.9925,0.994068,0.9325
4,mistral-large-latest,0.971554,0.9775,0.979661,0.9575
1,gpt-4-1106-preview,0.968573,0.97,0.94322,0.9925
8,meta/meta-llama-3-70b-instruct,0.838192,0.9,0.674576,0.94
6,command-r-plus,0.746073,0.8875,0.49322,0.8575
3,mistralai/Mixtral-8x22B-Instruct-v0.1,0.687288,0.75,0.661864,0.65
5,databricks/dbrx-instruct,0.53089,0.5775,0.410169,0.605
7,Snowflake/snowflake-arctic-instruct,0.280862,0.3475,0.155085,0.34


# Selection-image Dataset

In [7]:
# Models evaluated on the selection-image dataset, all with the shared 8k token limit.
# The limit is not really relevant for this dataset: all prompts contain at most ~1k tokens.
# NOTE(review): gpt-4o is configured without the trailing `True, 1` arguments that the other
# two models receive — confirm against run_evaluation that this is intended.
models = [
    anthropic_config("n/a", "claude-3-opus-20240229", token_limit, True, 1),
    gpt_config("n/a", token_limit, "gpt-4o"),
    gpt_config("n/a", token_limit, "gpt-4-1106-vision-preview", True, 1),
]

In [8]:
# Build the results frame for the selection-image answers, then show the overall score and the
# per-dataset "<name>_total" columns, best overall score first.
df, all_dataset_names = make_df(
    template_id,
    "../data/v2/answers_150524/selection-image",
    models,
)
(
    df[['model', 'total', *(f"{name}_total" for name in all_dataset_names)]]
    .sort_values('total', ascending=False)
)

Unnamed: 0,model,total,selection_image_100_rev23_meta_total,selection_image_100_rev22_meta_total,selection_image_100_rev_meta.csv_total
1,gpt-4o,0.952528,0.9625,0.94,0.955085
0,claude-3-opus-20240229,0.610395,0.6975,0.3625,0.771186
2,gpt-4-1106-vision-preview,0.581398,0.7975,0.445,0.501695
