### Get Keys and instantiate model

In [2]:
import langchain, langchain_openai, langsmith
import os
from dotenv import load_dotenv
from langsmith import Client
import uuid
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage, RemoveMessage
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate

# Load variables from a local .env file into the process environment.
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')

# Fail early with a readable message: writing None into os.environ below would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
missing_keys = [
    name
    for name, value in (
        ('OPENAI_API_KEY', openai_api_key),
        ('LANGSMITH_API_KEY', LANGSMITH_API_KEY),
    )
    if value is None
]
if missing_keys:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_keys)
    )

os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['LANGSMITH_API_KEY'] = LANGSMITH_API_KEY

# LangSmith tracing settings; runs are grouped under the project name below.
os.environ['LANGSMITH_TRACING'] = "true"
os.environ['LANGSMITH_ENDPOINT'] = "https://api.smith.langchain.com"
langsmith_project_name = "db_agent_v1"
os.environ['LANGSMITH_PROJECT'] = langsmith_project_name

from langchain_openai import ChatOpenAI

# Chat model used by the correctness grader defined later in this notebook.
llm = ChatOpenAI(model='gpt-4o', temperature=0)

In [3]:
from db_agent_v1 import objects_documentation, graph, create_config

### Placeholder code to test vector stores

In [None]:
vector_store = create_or_retrieve_vector_store()

# Queries used to seed the vector store. Each one is explained, executed and
# stored as a single document by the loop below.
seed_queries = [
    'SELECT COUNT(DISTINCT company.company_name) FROM company;',
    'SELECT AVG(company.annual_revenue_usd) FROM company;',
    'SELECT COUNT(DISTINCT feedback.feedback_id) FROM feedback;',
    'SELECT AVG(feedback.feedback_rating) FROM feedback;',
    'SELECT products.product_name, products.product_average_rating FROM products ORDER BY products.product_average_rating DESC LIMIT 1;',
    'SELECT distinct product_name FROM products WHERE product_average_rating = 5;',
    '''WITH CTE as (
SELECT product_id, AVG(feedback_rating) as avg_rating
FROM feedback 
GROUP BY product_id
) 

select count(distinct product_id ) as nr_products
from CTE 
where avg_rating = (select max(avg_rating) from CTE)''',
]

# The SQL tool does not depend on the query, so build it once.
sql_query_tool = QuerySQLDataBaseTool(db=db)

for query in seed_queries:
    sql_query_explanation = create_sql_query_explanation(query)
    sql_query_result = sql_query_tool.invoke(query)
    # The limitation text only feeds the insight; it is not stored in the
    # document metadata itself.
    sql_query_limitation = create_sql_query_limitation(query)
    sql_query_insight = create_sql_query_insight(query, sql_query_result, sql_query_limitation)
    sql_query_metadata = create_query_metadata(query)
    doc = [Document(
        # The id is derived from the current store size, so it must be computed
        # right before each insert.
        id=len(vector_store.store)+1,
        # The natural-language explanation is the document text.
        page_content=sql_query_explanation,
        metadata={"query": query,
                  "result": sql_query_result,
                  "insight": sql_query_insight,
                  "metadata": sql_query_metadata
        }
    )]
    vector_store.add_documents(documents=doc)



### Create the questions & expected answers

In [4]:
### Q1
# Single-turn question with no prior conversation and no earlier SQL queries.
question_q1 = 'How many companies are there?'

# Reference answer stored as the expected output of the dataset example.
expected_answer_q1 = '''There are 12 unique companies in the database.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company

🧮 Aggregations: • COUNT(DISTINCT company.company_name)'''

# Input state stored as the inputs of the dataset example for Q1.
input_state_q1 = {
    'objects_documentation': objects_documentation,
    'messages_log': [],
    'question': question_q1,
    'sql_queries': [],
    'llm_answer': [],
}

In [5]:
### Q2
# Open-ended question with no prior conversation and no earlier SQL queries.
question_q2 = 'What can you tell me about the dataset?'

# Reference answer stored as the expected output of the dataset example.
expected_answer_q2 = '''The dataset provides a snapshot of various aspects of a business environment. Here's a quick overview:

1. **Companies**: There are 12 unique companies listed. This means there are 12 different company names, though it's possible that some companies might have similar names due to variations in spelling or formatting.

2. **Revenue**: On average, these companies have a massive annual revenue of about $26.3 trillion USD. This figure is an overall average and doesn't break down into specific categories or time periods.

3. **Feedback**: There are 413,898 unique feedback entries. This suggests a substantial amount of feedback data, though we don't have details on what the feedback is about or when it was given.

4. **Feedback Ratings**: The average feedback rating is approximately 3.84 out of a possible scale (likely 5). This gives a general sense of customer satisfaction or product/service quality.

5. **Products**: The dataset includes 8,145 unique products. This count gives an idea of the variety of products available, but doesn't specify categories or availability.

Overall, the dataset offers a broad view of company performance, customer feedback, and product diversity, but lacks detailed insights into specific segments or trends.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company • feedback • products

🧮 Aggregations: • COUNT(DISTINCT company.company_name) • AVG(company.annual_revenue_usd) • COUNT(DISTINCT feedback.feedback_id) • AVG(feedback.feedback_rating) • COUNT(DISTINCT products.product_id)'''

# Input state stored as the inputs of the dataset example for Q2.
input_state_q2 = {
    'objects_documentation': objects_documentation,
    'messages_log': [],
    'question': question_q2,
    'sql_queries': [],
    'llm_answer': [],
}

#### Chain of prompts

In [None]:
### Q3
# First turn of a two-question chain; Q4 follows up on this answer.
question_q3 = 'What is the product with the highest ranking?'

# Reference answer for Q3 (note the literal ends with a newline).
expected_answer_q3 = '''The product with the highest ranking is the "Samsung Galaxy S21 Ultra Official LED View Flip Cover Gray," which has an average rating of 5.0.

I analyzed data based on the following filters and transformations:

🧊 Tables: • products
'''

# Input state for Q3: empty conversation log and no earlier SQL queries.
input_state_q3 = {
    'objects_documentation': objects_documentation,
    'messages_log': [],
    'question': question_q3,
    'sql_queries': [],
    'llm_answer': [],
}


In [None]:
### Q4
# Follow-up question: "this rating" refers to the Q3 exchange replayed in
# messages_log below.
question_q4 = 'How many products have this rating?'

# No reference answer has been written yet; the literal is a single space.
expected_answer_q4 = ''' '''

# Conversation so far: the Q3 question followed by the recorded answer.
messages_log_q4 = [
    HumanMessage(content='What is the product with the highest ranking?', additional_kwargs={}, response_metadata={}),
    AIMessage(content='The product with the highest ranking is the "Samsung Galaxy S21 Ultra Official LED View Flip Cover Gray," which has an average rating of 5.0.\n\nI analyzed data based on the following filters and transformations:\n\n🧊 Tables: • products', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 288, 'total_tokens': 339, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_f5bdcc3276', 'id': 'chatcmpl-BWIA7s4SqyAeWrsc9jhvVzrKHooY7', 'finish_reason': 'stop', 'logprobs': None}),
]

# SQL query recorded for the Q3 turn, with its explanation and result.
sql_queries_q4 = [
    {
        'query': 'SELECT products.product_name, products.product_average_rating FROM products ORDER BY products.product_average_rating DESC LIMIT 1;',
        'explanation': 'This SQL query retrieves the name and average rating of the product with the highest average rating from the "products" table. It sorts all products by their average rating in descending order and then limits the result to just the top entry, effectively selecting the single product with the highest rating. \n\nKey points to note:\n- There are no filters applied, so it considers all products in the table.\n- There are no aggregations or groupings; it simply orders the existing data.\n- The query assumes that the "product_average_rating" field is correctly calculated and stored in the table.\n- A limitation is that if multiple products share the highest rating, only one will be returned, and it’s not specified which one.',
        'result': "[('Samsung Galaxy S21 Ultra Official LED View Flip Cover Gray', 5.0)]",
    },
]

# Input state for Q4, pre-populated with the Q3 conversation and query.
input_state_q4 = {
    'objects_documentation': objects_documentation,
    'messages_log': messages_log_q4,
    'question': question_q4,
    'sql_queries': sql_queries_q4,
    'llm_answer': [],
}



### Create testing datasets in langsmith

In [6]:
client = Client()
dataset_name = 'Evaluation Dataset db_agent_v1'

# Re-running this cell must not fail or duplicate data: creating a dataset
# under a name that already exists is rejected, so reuse the existing dataset
# and upload the examples only when the dataset is newly created.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    created_examples = None
else:
    dataset = client.create_dataset(dataset_name)
    created_examples = client.create_examples(
        dataset_id=dataset.id,
        examples=[
            {
                'inputs': input_state_q1,
                'outputs': {'answer': expected_answer_q1},
            },
            {
                'inputs': input_state_q2,
                'outputs': {'answer': expected_answer_q2},
            },
        ],
    )

# Show the upload summary (None when the existing dataset was reused).
created_examples

{'example_ids': ['a11f454a-6e69-4a7f-bb8f-6c8cb7f092fb',
  '31de6fcf-d8f3-4016-be41-1c870ec33a50'],
 'count': 2}

### Grade by correctness

In [7]:
from langchain.schema.output_parser import StrOutputParser

def rate_corectness(inputs: dict, outputs: dict, reference_outputs: dict):
    """Ask the LLM to grade an actual answer against the reference answer.

    Args:
        inputs: Example inputs; only ``inputs['question']`` is read.
        outputs: Output of the system under test; ``outputs['response']`` is read.
        reference_outputs: Expected outputs; ``reference_outputs['answer']`` is read.

    Returns:
        The model's reply as a string (the prompt asks for a bare 1-10 rating).
    """
    # Grading instructions.
    system_message = ('system', '''Rate the corectness of answering a question, by comparing the actual answer to the reference answer.
                      
                      Rate based on the following factors:
                      - Precision of the data accuracy in relation to what is measured.
                      - The degree to which the same filters and transformations were used.

                      Rate between 1 and 10. In your response include just the rating.
                     ''')
    # Material to grade: the question, the actual answer and the reference answer.
    user_message = ('user', ''' Question: 
                    {question} 

                    Actual answer:
                    {actual_answer} 

                    Reference answer:
                    {reference_answer} 
        ''')

    grading_prompt = ChatPromptTemplate.from_messages([system_message, user_message])
    grading_chain = grading_prompt | llm | StrOutputParser()

    prompt_variables = {
        'question': inputs['question'],
        'actual_answer': outputs['response'],
        'reference_answer': reference_outputs['answer'],
    }
    return grading_chain.invoke(prompt_variables)

### Get actual answers by running the graph

In [8]:
# One config / thread id is created up front and reused by every run_agent call.
# NOTE(review): if the graph keeps per-thread state, consecutive dataset
# examples may share it; confirm against create_config.
config, thread_id = create_config('Run Agent',True)

def run_agent(input_state: dict):
    """Stream the graph for one input state and return the generated answer.

    Args:
        input_state: Initial graph state (the ``inputs`` of a dataset example).

    Returns:
        ``{'response': <answer text>}``, taken from the last ``generate_answer``
        update emitted by the graph.

    Raises:
        RuntimeError: If the stream ends without any ``generate_answer`` update.
    """
    answered = False
    result = None
    for step in graph.stream(input_state, config=config, stream_mode="updates"):
        # Only the first (node name, output) pair of each update is inspected.
        step_name, output = list(step.items())[0]
        if step_name == 'generate_answer':
            result = output['llm_answer'].content
            answered = True

    # Without this guard a run that never reaches generate_answer would fail
    # with an UnboundLocalError on the return statement.
    if not answered:
        raise RuntimeError("The graph finished without a 'generate_answer' step.")
    return {'response': result}

### Run evaluation

In [None]:
# Run every example of the dataset through the agent and grade each response.
#   run_agent          - the AI system under test
#   data               - the dataset to predict and grade over
#   evaluators         - the evaluators that score the results
#   experiment_prefix  - prefix that makes the experiment names easy to identify
experiment_results = client.evaluate(
    run_agent,
    data=dataset_name,
    evaluators=[rate_corectness],
    experiment_prefix="regresion-test",
)

In [None]:

# Hand-written conversation history: alternating human questions and AI
# answers with sequential ids 1-8.
messages_log = [
    HumanMessage(content = 'How many companies are there?',id=1),
    AIMessage(content = '''📣 Final Answer:

There are 12 unique companies listed in the data. Keep in mind, though, that if there are any variations in how the company names are written (like different spellings or capitalization), it might affect the count slightly.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company

🧮 Aggregations: • COUNT(DISTINCT company.company_name)''',id=2),

    HumanMessage(content = 'What can you tell me about the dataset?',id=3),
    # The literal must close right after the aggregation list. When the closing
    # quotes sit one line lower, the reply absorbs a HumanMessage(...) source
    # line as plain text.
    AIMessage(content = '''📣 Final Answer:

The dataset provides some interesting insights:

1. There are 12 unique companies listed in the dataset.
2. On average, these companies have an annual revenue of about $26.3 trillion USD.
3. There are 413,898 unique feedback entries, which means a lot of feedback has been collected.
4. The average feedback rating is approximately 3.84 out of a possible scale (not specified here).
5. There are 8,145 unique products in the dataset.

These numbers give a broad overview of the dataset, showing a diverse range of companies, a significant amount of feedback, and a large variety of products. However, it doesn't dive into specifics like industry types, regional data, or detailed company attributes.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company • feedback • products

🧮 Aggregations: • COUNT(DISTINCT company.company_name) • AVG(company.annual_revenue_usd) • COUNT(DISTINCT feedback.feedback_id) • AVG(feedback.feedback_rating) • COUNT(DISTINCT products.product_id)''',id=4),
    # Optional extra turn, kept disabled so the ids stay unique and sequential:
    # HumanMessage(content = 'tell me a joke about rum refering to my name'),

    HumanMessage(content = 'Can you share the average feedback rating per company?',id=5),
    AIMessage(content = '''📣 Final Answer:

Sure! Here's the average feedback rating for each company based on the data we have:

- Adidas: 4.06
- Apple: 3.85
- AT&T: 3.66
- Cisco: 3.37
- Google: 3.56
- Microsoft: 3.91
- Nike: 3.95
- Samsung: 3.87
- Sony: 3.83
- Target: 3.33
- Verizon: 3.84
- Walmart: 3.31

These numbers represent the average feedback ratings from customers for each company. Keep in mind that this is a straightforward average and doesn't account for things like the number of feedback entries or any unusual ratings that might affect the average.

I analyzed data based on the following filters and transformations:
...

🧮 Aggregations: • AVG(feedback.feedback_rating)

📦 Groupings: • feedback.product_company_name''',id=6),
    
    HumanMessage(content = 'fine, and how many products each company has?',id=7),
    AIMessage(content = '''📣 Final Answer:

Sure! Here's a quick rundown of how many products each company has:

- Adidas has 181 products.
- Apple offers 1,178 products.
- AT&T has 134 products.
- Cisco has 9 products.
- Google has 459 products.
- Microsoft offers 75 products.
- Nike has 115 products.
- Samsung has a whopping 4,801 products.
- Sony offers 780 products.
- Target has 2 products.
- Verizon has 405 products.
- Walmart has 6 products.

This list includes all companies, even those with no products, thanks to the way the data was gathered.

I analyzed data based on the following filters and transformations:
...

🧮 Aggregations: • COUNT(products.product_id)

📦 Groupings: • company.company_name''',
              # Only the last AI message carries token-usage metadata.
              response_metadata = { 'token_usage' : {'total_tokens' : 1200 } },
              id=8  ) 

]

In [None]:
# Hand-written conversation history: alternating human questions and AI
# answers with sequential ids 1-8, followed by a question that refers back
# to the last answer.
messages_log = [
    HumanMessage(content = 'How many companies are there?',id=1),
    AIMessage(content = '''📣 Final Answer:

There are 12 unique companies listed in the data. Keep in mind, though, that if there are any variations in how the company names are written (like different spellings or capitalization), it might affect the count slightly.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company

🧮 Aggregations: • COUNT(DISTINCT company.company_name)''',id=2),

    HumanMessage(content = 'What can you tell me about the dataset?',id=3),
    # The literal must close right after the aggregation list. When the closing
    # quotes sit one line lower, the reply absorbs a HumanMessage(...) source
    # line as plain text.
    AIMessage(content = '''📣 Final Answer:

The dataset provides some interesting insights:

1. There are 12 unique companies listed in the dataset.
2. On average, these companies have an annual revenue of about $26.3 trillion USD.
3. There are 413,898 unique feedback entries, which means a lot of feedback has been collected.
4. The average feedback rating is approximately 3.84 out of a possible scale (not specified here).
5. There are 8,145 unique products in the dataset.

These numbers give a broad overview of the dataset, showing a diverse range of companies, a significant amount of feedback, and a large variety of products. However, it doesn't dive into specifics like industry types, regional data, or detailed company attributes.

I analyzed data based on the following filters and transformations:

🧊 Tables: • company • feedback • products

🧮 Aggregations: • COUNT(DISTINCT company.company_name) • AVG(company.annual_revenue_usd) • COUNT(DISTINCT feedback.feedback_id) • AVG(feedback.feedback_rating) • COUNT(DISTINCT products.product_id)''',id=4),
    # Optional extra turn, kept disabled so the ids stay unique and sequential:
    # HumanMessage(content = 'tell me a joke about rum refering to my name'),

    HumanMessage(content = 'Can you share the average feedback rating per company?',id=5),
    AIMessage(content = '''📣 Final Answer:

Sure! Here's the average feedback rating for each company based on the data we have:

- Adidas: 4.06
- Apple: 3.85
- AT&T: 3.66
- Cisco: 3.37
- Google: 3.56
- Microsoft: 3.91
- Nike: 3.95
- Samsung: 3.87
- Sony: 3.83
- Target: 3.33
- Verizon: 3.84
- Walmart: 3.31

These numbers represent the average feedback ratings from customers for each company. Keep in mind that this is a straightforward average and doesn't account for things like the number of feedback entries or any unusual ratings that might affect the average.

I analyzed data based on the following filters and transformations:
...

🧮 Aggregations: • AVG(feedback.feedback_rating)

📦 Groupings: • feedback.product_company_name''',id=6),
    
    HumanMessage(content = 'fine, and how many products each company has?',id=7),
    AIMessage(content = '''📣 Final Answer:

Sure! Here's a quick rundown of how many products each company has:

- Adidas has 181 products.
- Apple offers 1,178 products.
- AT&T has 134 products.
- Cisco has 9 products.
- Google has 459 products.
- Microsoft offers 75 products.
- Nike has 115 products.
- Samsung has a whopping 4,801 products.
- Sony offers 780 products.
- Target has 2 products.
- Verizon has 405 products.
- Walmart has 6 products.

This list includes all companies, even those with no products, thanks to the way the data was gathered.

I analyzed data based on the following filters and transformations:
...

🧮 Aggregations: • COUNT(products.product_id)

📦 Groupings: • company.company_name''',
              # Only the last AI message carries token-usage metadata.
              response_metadata = { 'token_usage' : {'total_tokens' : 1200 } },
              id=8  ) 

]

# Follow-up that can only be answered from the previous AI message in the log.
question = 'what is the first company you listed in your previous response?'