### Get Keys and instantiate model

In [11]:
import langchain, langchain_openai, langsmith
import os
from dotenv import load_dotenv
from langsmith import Client
import uuid
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage, RemoveMessage
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate

# Load variables from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')

# os.environ only accepts strings: assigning a missing key (None) would fail with
# an opaque TypeError. Fail early and name the variable(s) that are not set.
missing_keys = [
    key_name
    for key_name, key_value in (
        ('OPENAI_API_KEY', openai_api_key),
        ('LANGSMITH_API_KEY', LANGSMITH_API_KEY),
    )
    if not key_value
]
if missing_keys:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_keys)
        + '. Define them in the environment or in a .env file.'
    )

os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['LANGSMITH_API_KEY'] = LANGSMITH_API_KEY

# LangSmith tracing configuration, read from the environment by the LangSmith client.
os.environ['LANGSMITH_TRACING'] = "true"
os.environ['LANGSMITH_ENDPOINT'] = "https://api.smith.langchain.com"
langsmith_project_name = "db_agent_v1"
os.environ['LANGSMITH_PROJECT'] = langsmith_project_name

# Model used by the grader defined further down; temperature=0 is requested for the grading calls.
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model='gpt-4o',temperature=0)

In [2]:
from db_agent_v1 import objects_documentation, graph, create_config

### Create the questions & expected answers

In [7]:


### Q1
# Question sent to the agent, and the reference answer the grader compares against.
question_q1 = 'How many companies are there?'
expected_answer_q1 = '''There are 12 unique companies in the database.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company

🧮 Aggregations: • COUNT(DISTINCT company.company_name)'''

# Initial graph state for Q1: the imported documentation and the question,
# with every log/result collection starting out empty.
input_state_q1 = dict(
    objects_documentation=objects_documentation,
    messages_log=[],
    question=question_q1,
    sql_queries=[],
    llm_answer=[],
)

### Q2

### Q3


### Create testing datasets in langsmith

In [None]:
client = Client()
dataset_name = 'Evaluation Dataset db_agent_v1'

# Make the cell safe to re-run: creating a dataset whose name already exists is
# rejected by LangSmith, which would also leave `client`/`dataset` undefined for
# the cells below. Reuse the existing dataset instead, and only seed the examples
# when the dataset is newly created so re-runs do not duplicate them.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name)
    client.create_examples(
        dataset_id=dataset.id,
        examples=[
            {
                # Full initial graph state, passed to the target function as-is.
                'inputs': input_state_q1,
                # Reference answer, read by the evaluator as reference_outputs['answer'].
                'outputs': {'answer': expected_answer_q1},
            }
        ],
    )

### Grade by correctness

In [15]:
from langchain.schema.output_parser import StrOutputParser

def rate_corectness(inputs:dict, outputs: dict, reference_outputs: dict ):
    """Ask the LLM to grade an answer against a reference answer.

    Args:
        inputs: Example inputs; only ``inputs['question']`` is read.
        outputs: Output of the system under test; only ``outputs['response']`` is read.
        reference_outputs: Expected outputs; only ``reference_outputs['answer']`` is read.

    Returns:
        The model's reply as a string. The prompt asks for nothing but a rating
        between 1 and 10, but the reply is returned as-is, without validation
        or conversion to a number.
    """
    grading_prompt = ChatPromptTemplate.from_messages([
        ('system', '''Rate the corectness of answering a question, by comparing the actual answer to the reference answer.
                      
                      Rate based on the following factors:
                      - Precision of the data accuracy in relation to what is measured.
                      - The degree to which the same filters and transformations were used.

                      Rate between 1 and 10. In your response include just the rating.
                     ''' ),
        ('user',''' Question: 
                    {question} 

                    Actual answer:
                    {actual_answer} 

                    Reference answer:
                    {reference_answer} 
        ''')
    ])

    # Prompt -> model -> plain string.
    grader = grading_prompt | llm | StrOutputParser()

    # Values for the three placeholders of the user message.
    prompt_values = {
        'question': inputs['question'],
        'actual_answer': outputs['response'],
        'reference_answer': reference_outputs['answer'],
    }
    return grader.invoke(prompt_values)

In [19]:
# Manual check of the grader: the actual answer reports 100 companies while the
# reference reports 12, with identical tables and aggregations.
inputs = {
    'question': 'How many companies are there?',
}

outputs = {
    'response': '''There are 100 unique companies in the database.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company

🧮 Aggregations: • COUNT(DISTINCT company_name)''',
}

reference_outputs = {
    'answer': '''There are 12 unique companies in the database.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company

🧮 Aggregations: • COUNT(DISTINCT company_name)''',
}

# Left as the last expression so the rating is shown as the cell output.
rate_corectness(inputs, outputs, reference_outputs)


'1'

### Get actual answers by running the graph

In [9]:
# NOTE(review): this one config is captured by run_agent and reused for every call.
# If create_config ties it to a conversation thread, all evaluated examples would
# share that thread — confirm this is intended.
config, thread_id = create_config('Run Agent',True)

def run_agent(input_state: dict):
    """Run the agent graph on one input state and return its final answer.

    Streams the graph in "updates" mode and keeps the content of ``llm_answer``
    from the update emitted under the ``generate_answer`` key.

    Args:
        input_state: Initial graph state (e.g. ``input_state_q1``).

    Returns:
        ``{'response': <answer text>}``, the shape read by ``rate_corectness``.

    Raises:
        RuntimeError: If the stream ends without a ``generate_answer`` update.
    """
    result = None
    for step in graph.stream(input_state, config = config, stream_mode="updates"):
        # Check every entry of the update rather than only the first one, so an
        # empty update or an update carrying several keys is handled as well.
        for step_name, output in step.items():
            if step_name == 'generate_answer':
                result = output['llm_answer'].content

    if result is None:
        # Previously this surfaced as an UnboundLocalError on the return line.
        raise RuntimeError(
            "The graph finished without emitting a 'generate_answer' update; "
            "no answer is available."
        )
    return {'response': result}

In [None]:
#run_agent(input_state_q1)

### Run evaluation

In [16]:
# Run the agent on every example of the dataset and grade each answer.
#   run_agent         - the system under test
#   data              - name of the dataset to predict and grade over
#   evaluators        - functions that score each result
#   experiment_prefix - prefix that makes the experiment easy to find
experiment_results = client.evaluate(
    run_agent,
    data = dataset_name,
    evaluators = [rate_corectness],
    experiment_prefix = "prefix",
)

View the evaluation results for experiment: 'prefix-093cada0' at:
https://smith.langchain.com/o/a016dac4-a501-45c3-b12f-e746a3b726f9/datasets/2f60a23a-c8ff-47db-a359-96657c488063/compare?selectedSessions=8d62e1cb-59a2-4001-9da3-6eb9fbdf6fe5




0it [00:00, ?it/s]

🚀 Executing query 1/1...


1it [00:06,  6.67s/it]


In [None]:

# Hand-written conversation history: alternating user questions and agent answers.
messages_log = [
    HumanMessage(content = 'How many companies are there?',id=1),
    AIMessage(content = '''📣 Final Answer:

There are 12 unique companies listed in the data. Keep in mind, though, that if there are any variations in how the company names are written (like different spellings or capitalization), it might affect the count slightly.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company

🧮 Aggregations: • COUNT(DISTINCT company.company_name)''',id=2),

    HumanMessage(content = 'What can you tell me about the dataset?',id=3),
    AIMessage(content = '''📣 Final Answer:

The dataset provides some interesting insights:

1. There are 12 unique companies listed in the dataset.
2. On average, these companies have an annual revenue of about $26.3 trillion USD.
3. There are 413,898 unique feedback entries, which means a lot of feedback has been collected.
4. The average feedback rating is approximately 3.84 out of a possible scale (not specified here).
5. There are 8,145 unique products in the dataset.

These numbers give a broad overview of the dataset, showing a diverse range of companies, a significant amount of feedback, and a large variety of products. However, it doesn't dive into specifics like industry types, regional data, or detailed company attributes.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company • feedback • products

🧮 Aggregations: • COUNT(DISTINCT company.company_name) • AVG(company.annual_revenue_usd) • COUNT(DISTINCT feedback.feedback_id) • AVG(feedback.feedback_rating) • COUNT(DISTINCT products.product_id)''',id=4),
    # NOTE(review): a misplaced quote used to fold the source text of a user turn
    # ("tell me a joke about rum refering to my name") into the answer above.
    # The answer now ends where it should; the message count and ids are unchanged.
    # Add that turn as its own HumanMessage with a unique id if it is wanted.

    HumanMessage(content = 'Can you share the average feedback rating per company?',id=5),
    AIMessage(content = '''📣 Final Answer:

Sure! Here's the average feedback rating for each company based on the data we have:

- Adidas: 4.06
- Apple: 3.85
- AT&T: 3.66
- Cisco: 3.37
- Google: 3.56
- Microsoft: 3.91
- Nike: 3.95
- Samsung: 3.87
- Sony: 3.83
- Target: 3.33
- Verizon: 3.84
- Walmart: 3.31

These numbers represent the average feedback ratings from customers for each company. Keep in mind that this is a straightforward average and doesn't account for things like the number of feedback entries or any unusual ratings that might affect the average.

I analyzed data based on the following filters and transformations:
...

🧮 Aggregations: • AVG(feedback.feedback_rating)

📦 Groupings: • feedback.product_company_name''',id=6),
    
    HumanMessage(content = 'fine, and how many products each company has?',id=7),
    AIMessage(content = '''📣 Final Answer:

Sure! Here's a quick rundown of how many products each company has:

- Adidas has 181 products.
- Apple offers 1,178 products.
- AT&T has 134 products.
- Cisco has 9 products.
- Google has 459 products.
- Microsoft offers 75 products.
- Nike has 115 products.
- Samsung has a whopping 4,801 products.
- Sony offers 780 products.
- Target has 2 products.
- Verizon has 405 products.
- Walmart has 6 products.

This list includes all companies, even those with no products, thanks to the way the data was gathered.

I analyzed data based on the following filters and transformations:
...

🧮 Aggregations: • COUNT(products.product_id)

📦 Groupings: • company.company_name''',
              # Only the last answer carries token-usage metadata.
              response_metadata = { 'token_usage' : {'total_tokens' : 1200 } },
              id=8  ) 

]

In [None]:
# Same conversation history as the previous cell, followed by a follow-up question
# that refers back to the last answer in the log.
messages_log = [
    HumanMessage(content = 'How many companies are there?',id=1),
    AIMessage(content = '''📣 Final Answer:

There are 12 unique companies listed in the data. Keep in mind, though, that if there are any variations in how the company names are written (like different spellings or capitalization), it might affect the count slightly.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company

🧮 Aggregations: • COUNT(DISTINCT company.company_name)''',id=2),

    HumanMessage(content = 'What can you tell me about the dataset?',id=3),
    AIMessage(content = '''📣 Final Answer:

The dataset provides some interesting insights:

1. There are 12 unique companies listed in the dataset.
2. On average, these companies have an annual revenue of about $26.3 trillion USD.
3. There are 413,898 unique feedback entries, which means a lot of feedback has been collected.
4. The average feedback rating is approximately 3.84 out of a possible scale (not specified here).
5. There are 8,145 unique products in the dataset.

These numbers give a broad overview of the dataset, showing a diverse range of companies, a significant amount of feedback, and a large variety of products. However, it doesn't dive into specifics like industry types, regional data, or detailed company attributes.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company • feedback • products

🧮 Aggregations: • COUNT(DISTINCT company.company_name) • AVG(company.annual_revenue_usd) • COUNT(DISTINCT feedback.feedback_id) • AVG(feedback.feedback_rating) • COUNT(DISTINCT products.product_id)''',id=4),
    # NOTE(review): a misplaced quote used to fold the source text of a user turn
    # ("tell me a joke about rum refering to my name") into the answer above.
    # The answer now ends where it should; the message count and ids are unchanged.
    # Add that turn as its own HumanMessage with a unique id if it is wanted.

    HumanMessage(content = 'Can you share the average feedback rating per company?',id=5),
    AIMessage(content = '''📣 Final Answer:

Sure! Here's the average feedback rating for each company based on the data we have:

- Adidas: 4.06
- Apple: 3.85
- AT&T: 3.66
- Cisco: 3.37
- Google: 3.56
- Microsoft: 3.91
- Nike: 3.95
- Samsung: 3.87
- Sony: 3.83
- Target: 3.33
- Verizon: 3.84
- Walmart: 3.31

These numbers represent the average feedback ratings from customers for each company. Keep in mind that this is a straightforward average and doesn't account for things like the number of feedback entries or any unusual ratings that might affect the average.

I analyzed data based on the following filters and transformations:
...

🧮 Aggregations: • AVG(feedback.feedback_rating)

📦 Groupings: • feedback.product_company_name''',id=6),
    
    HumanMessage(content = 'fine, and how many products each company has?',id=7),
    AIMessage(content = '''📣 Final Answer:

Sure! Here's a quick rundown of how many products each company has:

- Adidas has 181 products.
- Apple offers 1,178 products.
- AT&T has 134 products.
- Cisco has 9 products.
- Google has 459 products.
- Microsoft offers 75 products.
- Nike has 115 products.
- Samsung has a whopping 4,801 products.
- Sony offers 780 products.
- Target has 2 products.
- Verizon has 405 products.
- Walmart has 6 products.

This list includes all companies, even those with no products, thanks to the way the data was gathered.

I analyzed data based on the following filters and transformations:
...

🧮 Aggregations: • COUNT(products.product_id)

📦 Groupings: • company.company_name''',
              # Only the last answer carries token-usage metadata.
              response_metadata = { 'token_usage' : {'total_tokens' : 1200 } },
              id=8  ) 

]

# Follow-up that can only be answered from the conversation history above.
question = 'what is the first company you listed in your previous response?'