In [13]:
import os
import pandas as pd

from getpass import getpass
from langchain.utilities import SQLDatabase
from langchain.agents import create_sql_agent, AgentType
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.qa import QAEvalChain
from langchain.callbacks import get_openai_callback

from evaluation_prompts import GRADING_PROMPT
from utils import CustomDatabase

# Prompt for the OpenAI key when it is missing *or* empty: an exported-but-blank
# variable (e.g. `OPENAI_API_KEY=` in a .env file) would otherwise be treated as
# configured and only surface later as an authentication failure.
# getpass keeps the key out of the notebook's saved output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API Key: ")

## Load evaluation dataset

In [2]:
# Databases to evaluate. Every entry supplies the three keys read by the
# loading loop below: a display name, the JSON evaluation set, and the SQLite file.
databases = [
    dict(
        name='architecture',
        eval_set_path='datasets/pampa_dataset/architecture_eval_dataset.json',
        db_path='datasets/pampa_dataset/architecture.sqlite',
    ),
    # To evaluate another database, add an entry with the same keys:
    # dict(
    #     name='<database name>',
    #     eval_set_path='<path to the evaluation dataset .json>',
    #     db_path='<path to the .sqlite file>',
    # ),
]

# Wrap each entry in a CustomDatabase holding the SQL connection and the
# evaluation set loaded as a DataFrame.
eval_databases = []
for database in databases:
    sql_connection = SQLDatabase.from_uri(f"sqlite:///{database['db_path']}")
    evaluation_frame = pd.read_json(database['eval_set_path'])
    eval_databases.append(
        CustomDatabase(
            name=database['name'],
            database=sql_connection,
            evaluation_dataset=evaluation_frame,
        )
    )

## Create your Agent
We need to create one agent for each database, because each agent's SQL toolkit is built from a single database.

In [3]:
# A single chat model (temperature=0) is shared by every agent and its toolkit.
llm = ChatOpenAI(temperature=0)

for eval_database in eval_databases:
    print(f"Creating Agent for {eval_database.name}...")
    # The toolkit is built from this database, so each database gets its own agent.
    sql_toolkit = SQLDatabaseToolkit(db=eval_database.database, llm=llm)
    eval_database.agent = create_sql_agent(
        llm=llm,
        toolkit=sql_toolkit,
        agent_type=AgentType.OPENAI_FUNCTIONS,
        verbose=False,
    )

Creating Agent for architecture...


## Evaluate the Agent

In [15]:
# Grading chain: reuses the same chat model together with the project's
# GRADING_PROMPT to judge each agent answer against its reference answer.
evaluate_chain = QAEvalChain.from_llm(
    llm=llm,
    prompt=GRADING_PROMPT,
)

In [25]:
# For every database: run its agent over the evaluation set, grade the answers
# with `evaluate_chain`, and report accuracy plus the tokens spent on each phase.
for eval_database in eval_databases:
    # Phase 1 - run the agent while counting the tokens it consumes.
    with get_openai_callback() as agent_cb:
        eval_database.run_agent()
    agent_tokens = agent_cb.total_tokens

    # The 'agent_results' column is read here right after run_agent();
    # presumably run_agent() is what fills it in - confirm in utils.CustomDatabase.
    df = eval_database.evaluation_dataset
    questions = list(df['question'])

    # Build the two parallel lists QAEvalChain expects: reference answers
    # (key 'answer') and the agent's predictions (key 'result').
    target_results = [
        {'question': question, 'answer': target}
        for question, target in zip(questions, df['nl_result'])
    ]
    predictions = [
        {'question': question, 'result': agent_result}
        for question, agent_result in zip(questions, df['agent_results'])
    ]

    # Phase 2 - grade the predictions while counting the grader's tokens.
    with get_openai_callback() as eval_cb:
        res = evaluate_chain.evaluate(
            target_results,
            predictions,
            question_key="question",
            prediction_key="result",
        )

    # Strip whitespace so a verdict such as "Correct\n" is still counted.
    n_correct = sum(1 for r in res if str(r['results']).strip() == 'Correct')
    # An empty evaluation set has no defined accuracy; report NaN instead of
    # crashing with ZeroDivisionError.
    accuracy = n_correct / len(res) if res else float('nan')

    print("---------------------------------------------")
    print(f"Finished evaluation for database: {eval_database.name}")
    print(f"Accuracy: {accuracy}")
    print(f"Agent run token cost: {agent_tokens}")
    print(f"Evaluation token cost: {eval_cb.total_tokens}")
    print("---------------------------------------------\n\n")

Running agent on DB architecture: 100%|██████████| 17/17 [01:44<00:00,  6.13s/it]


---------------------------------------------
Finished evaluation on architecture
Accuracy: 0.7058823529411765
Agent run cost: 53274
Evaluation cost: 3141 tokens
---------------------------------------------
