In [2]:
import matplotlib.pyplot as plt
from pyvis.network import Network
import networkx as nx
from pathlib import Path
import pandas as pd
import numpy as np

import nest_asyncio

from llama_index.core import Response
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    DatasetGenerator,
    RelevancyEvaluator,
    EvaluationResult,
    CorrectnessEvaluator,
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
    BatchEvalRunner,
    RetrieverEvaluator,
    SemanticSimilarityEvaluator,
    QueryResponseDataset,
)
from tqdm import tqdm
from notebooks.helpers.bot.promtps import  (ANSWER_REL_EVAL_TEMPLATE,
    question_gen_query,
    EVALUATION_CORRECTNESS_SYSTEM_TEMPLATE,
    FAITH_EVAL_TEMPLATE,
    CONTEXT_REL_PROMPT, EVALUATION_CORRECTNESS_SYSTEM_TEMPLATE)
from evaluate_pairings import prepare_evalution_qa, parse_evalutions, get_qr_pairs, evaluate_correctness, evaluate_faithfulness, default_parser, evaluate_ans_relevancy, evaluate_context_relevancy, evaluate_relevancy
from app.notebooks.helpers.bot.kg_generation import create_kg_triplets
from notebooks.helpers.bot.bot import (
    get_chat_engine,
    get_query_engine,
    load_llm,
    load_embedding_model,
    setup_index_and_storage,
    generate_pairings_documents,
    service,
)


nest_asyncio.apply()
%matplotlib inline

In [None]:
def display_eval_df(query: str, response: Response, eval_result: Response) -> None:
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": (' '.join(eval_result.contexts)[:1000] + "..."),
            "Evaluation Result": eval_result.passing,
        },
        index = [0]
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)



In [None]:
def parse_triplets(text):
    triplets = []
    rows = text.split('\n')
    for row in rows[:-1]:
        triplet = row.split('**')
        triplets.append((triplet[0], triplet[1], triplet[2]))
    return triplets

kg_triplets = []

KG = create_kg_triplets(sample_size=10, for_model=False)
G = nx.DiGraph()
for _, row in KG.iterrows():
    triplets = parse_triplets(row['triplets'])
    for triplet in triplets:
        G.add_edge(triplet[0], triplet[2], label=triplet[1])

In [None]:
# KG = create_kg_triplets()
# kg_pairings = KG.apply(generate_pairings_documents, axis=1)

In [None]:
# len(kg_pairings)

In [None]:
# print(kg_pairings.iloc[0].get_content(metadata_mode='llm'))

In [39]:
llm = load_llm('openai3.5')
embed_model = load_embedding_model("openai3")
service_context = service(llm=llm, embed_model=embed_model)

In [40]:
storage_context, kg_index = setup_index_and_storage(
    service=service_context,
    kg_pairings=None,
    show_progress=True,
    force=False,
)


Task exception was never retrieved
future: <Task finished name='Task-1242' coro=<tqdm_asyncio.gather.<locals>.wrap_awaitable() done, defined at /home/carlosjosuna/.local/share/virtualenvs/wine_parings-Y4sTaGQc/lib/python3.8/site-packages/tqdm/asyncio.py:75> exception=RateLimitError("Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-0125 in organization org-rSW8VTGvcLlOXkwlKwR4NiVf on tokens per min (TPM): Limit 60000, Used 59640, Requested 3577. Please try again in 3.217s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}")>
Traceback (most recent call last):
  File "/usr/lib/python3.8/asyncio/tasks.py", line 280, in __step
    result = coro.send(None)
  File "/home/carlosjosuna/.local/share/virtualenvs/wine_parings-Y4sTaGQc/lib/python3.8/site-packages/tqdm/asyncio.py", line 76, in wrap_awaitable
    return i, await f
  File "/home/carlosjosuna/.local/share/virtualenvs/wine

In [41]:

llm_eval = load_llm("openai3.5")
embed_model_eval = load_embedding_model("openai3")
service_context_eval = service(llm=llm_eval, embed_model=embed_model_eval)
relevancy_eval = RelevancyEvaluator(service_context=service_context_eval)
faithfulness_eval = FaithfulnessEvaluator(
    service_context=service_context, eval_template=FAITH_EVAL_TEMPLATE
)
semantic_eval = SemanticSimilarityEvaluator(service_context=service_context_eval)
answer_eval = AnswerRelevancyEvaluator(
    service_context=service_context_eval,
    eval_template=ANSWER_REL_EVAL_TEMPLATE,
    score_threshold=3.0,
)
context_eval = ContextRelevancyEvaluator(
    service_context=service_context_eval, eval_template = CONTEXT_REL_PROMPT
)
correctness_eval = CorrectnessEvaluator(
    service_context=service_context_eval,
    parser_function=default_parser,
    eval_template=EVALUATION_CORRECTNESS_SYSTEM_TEMPLATE,
)


CHAT_MODE = "context"
RETRIEVER_MODE = "hybrid"
RESPONSE_MODE = "compact"

In [42]:
if CHAT_MODE != 'simple':
    runner = BatchEvalRunner(
        {
            "semantic": semantic_eval,
            "answer_relevancy": answer_eval,
            "context_relevancy": context_eval,
            "relevancy": relevancy_eval,
            # "correctness": correctness_eval,

        },
        workers=5,
        show_progress=True,
    )
else:
    runner = BatchEvalRunner(
    {
        "semantic": semantic_eval,
        "answer_relevancy": answer_eval,
        "relevancy": relevancy_eval,
        # "correctness": correctness_eval,
    },
    workers=6,
    show_progress=True,
)


In [43]:
query_engine = get_query_engine(
    kg_index,
    chat_mode=CHAT_MODE,
    retriver_mode=RETRIEVER_MODE,
    response_mode=RESPONSE_MODE,
    use_global_node_triplets=False,
    max_keywords_per_query=10,
    num_chunks_per_query=10,
    similarity_top_k=4,
    graph_store_query_depth=2,
    include_text=False,  # Do not include text of the node into the model
    verbose = False
)

In [44]:
queries, references = get_qr_pairs()

if CHAT_MODE == 'simple':
    responses_strs = [
        llm.complete(query).text
        for query in tqdm(queries, total=len(queries), desc="Responses from Model")
    ]
    contexts = [['']] * len(responses_strs)
else:
    responses = [
        query_engine.query(query)
        for query in tqdm(queries, total=len(queries), desc="Querying Knowledge Graph")
    ]
    responses_strs = [response_ty.response for response_ty in responses]
    contexts = [[node.get_content()] for response in responses for node in response.source_nodes]

Querying Knowledge Graph: 100%|██████████| 162/162 [10:56<00:00,  4.05s/it]


In [45]:
eval_results = runner.evaluate_response_strs(
    queries=queries,
    response_strs = responses_strs,
    contexts_list = contexts,
    reference=references,  # type: ignore
)

 54%|█████▍    | 350/648 [01:49<01:52,  2.65it/s]Retrying llama_index.llms.openai.base.OpenAI._achat in 0.6063655929045642 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-0125 in organization org-rSW8VTGvcLlOXkwlKwR4NiVf on tokens per min (TPM): Limit 60000, Used 57091, Requested 3496. Please try again in 587ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
 62%|██████▏   | 402/648 [02:11<01:54,  2.14it/s]Retrying llama_index.llms.openai.base.OpenAI._achat in 0.0576832651751904 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-0125 in organization org-rSW8VTGvcLlOXkwlKwR4NiVf on tokens per min (TPM): Limit 60000, Used 56881, Requested 3585. Please try again in 466ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': Non

In [46]:
def evaluate_faithfulness(faithfulness_eval, queries, contexts, responses):
    results = []
    for query, context, response in tqdm(
        zip(queries, contexts, responses),
        total=len(responses),
        desc="Calculating Faithfulness",
    ):
        try:
            evaluation = faithfulness_eval.evaluate(
                query=query, response=response, contexts=context
            )
            results.append(evaluation)
        except:
            results.append(None)
            continue
    return results

def evaluate_correctness(correctness_eval, queries, references, responses):
    results = []
    for query, reference, response in tqdm(
        zip(queries, references, responses),
        total=len(responses),
        desc="Calculating Correctness",
    ):
        try:
            evaluation = correctness_eval.evaluate(
                query=query, response=response, referece=reference
            )
            results.append(evaluation)
        except:
            results.append(None)
            continue
    return results

eval_results["faithfulness"] = evaluate_faithfulness(
    faithfulness_eval=faithfulness_eval,
    queries=queries,
    contexts = contexts,
    responses=responses_strs,
)
eval_results["correctness"] = evaluate_correctness(
    correctness_eval,
    queries,
    references,
    responses=responses_strs,
)

Calculating Faithfulness: 100%|██████████| 162/162 [01:27<00:00,  1.84it/s]
Calculating Correctness: 100%|██████████| 162/162 [04:08<00:00,  1.53s/it]


In [47]:
parse_evalutions(
    eval_results=eval_results,
    model="gpt-3.5",
    embedding_model="gpt-3.5",
    chat_mode=CHAT_MODE,
    retriever_mode=RETRIEVER_MODE,
    response_mode=RESPONSE_MODE,
    queries=queries,
    responses=references,
)

semantic Score: 0.8921377954665135
answer_relevancy Score: 0.9938271604938271
context_relevancy Score: 0.5
relevancy Score: 0.9567901234567902
faithfulness Score: 0.9876543209876543
correctness Score: 0.6296296296296297


In [69]:

chat_engine = get_chat_engine(
    kg_index,
    response_mode='compact',
    retriver_mode='hybrid',
    chat_mode='context',
    use_global_node_triplets=True,
    max_keywords_per_query=10,
    num_chunks_per_query=10,
    similarity_top_k=4,
    graph_store_query_depth=2,
    include_text=False,
)

chat_engine.reset()

def chat_with_query(query):
    response = chat_engine.chat(query)
    eval_response = evaluator.evaluate_response(query, response)
    return eval_response, query

def print_chat_interface(response, query):
    interface = f"""
    QUERY: {query}\n
    --------------------------\n
    Context:\n {" ".join(response.contexts)}\n
    ---------------------------\n
    Response: {response.response.replace('**', '')}
    """
    print(interface)


In [60]:
import random


queries, responses = get_qr_pairs()


In [55]:
indx = random.randint(a=0, b=160)
query = queries[indx]
reference = responses[indx]
print(query)

How much does the Casaloste 2011 Chianti Classico wine cost?


In [None]:
embed_model.get_query_embedding("What three wines best pair with a ribeye steak and as a side dish baked potatoes with butter and mushrooms?")

In [70]:

eval_response, query = chat_with_query("What three wines best pair with a ribeye steak and as a side dish baked potatoes with butter and mushrooms?")

[1;3;32mExtracted keywords: ['wines', 'potatoes', 'baked', 'ribeye', 'ribeye steak', 'baked potatoes', 'side', 'mushrooms', 'pair', 'steak', 'three wines', 'side dish', 'dish', 'butter', 'three']
[0m[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('steak roquefort', 'pairs_with', 'Pinot Noir San Francisco Bay, Central Coast, California, USA')
('steak and drunken mushroom', 'pairs_with', 'White Blend Sierra Foothills, California, USA')
('steak and drunken mushroom', 'pairs_with', 'White Blend Columbia Valley, Washington, USA')
('steak roquefort', 'pairs_with', 'White Blend Southwest France, France')
[0m

In [71]:
print_chat_interface(eval_response, query)


    QUERY: What three wines best pair with a ribeye steak and as a side dish baked potatoes with butter and mushrooms?

    --------------------------

    Context:
 The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('steak roquefort', 'pairs_with', 'Pinot Noir San Francisco Bay, Central Coast, California, USA')
('steak and drunken mushroom', 'pairs_with', 'White Blend Sierra Foothills, California, USA')
('steak and drunken mushroom', 'pairs_with', 'White Blend Columbia Valley, Washington, USA')
('steak roquefort', 'pairs_with', 'White Blend Southwest France, France')

    ---------------------------

    Response: When pairing wines with a ribeye steak and baked potatoes with butter and mushrooms, you want to consider the rich and savory flavors of the dish. Here are three wine options that would complement this meal:

1. Cabernet Sauvignon: 
   - Why: Cabernet Sauvign

In [23]:
queries = [query]
response_strs = [eval_response.response]
contexts_list = [eval_response.contexts]
references = ["For a duck dish cooked in a traditional Japanese 'donabe' hot pot with a collection of vegetables, you'll want a wine that can complement the rich and savory flavors of the duck while also harmonizing with the umami notes from the vegetables and broth. One excellent wine pairing for this dish would be a Pinot Noir from Burgundy, France. Pinot Noir from Burgundy is known for its elegant and delicate flavors, with notes of red fruit, earthiness, and a silky texture. The wine's medium body and balanced acidity can complement the richness of the duck while also enhancing the flavors of the vegetables in the hot pot. Another great option would be a Riesling from the Mosel region, Germany. A crisp and slightly off-dry Riesling can provide a refreshing contrast to the savory duck and umami-rich vegetables. The wine's acidity and hint of sweetness can balance the flavors of the dish and cleanse the palate between bites. These wine choices can elevate the dining experience by creating a harmonious balance of flavors between the duck hot pot and the wines, enhancing the overall enjoyment of the meal with a touch of elegance and sophistication."
]

eval_results = runner.evaluate_response_strs(
    queries= queries,
    response_strs = response_strs,
    contexts_list = contexts_list,
    reference=references,  # type: ignore
)


eval_results["faithfulness"] = evaluate_faithfulness(
    faithfulness_eval=faithfulness_eval,
    queries=queries,
    contexts = contexts_list,
    responses=response_strs,
)
eval_results["correctness"] = evaluate_correctness(
    correctness_eval,
    queries,
    references,
    responses=response_strs,
)

100%|██████████| 4/4 [00:02<00:00,  1.96it/s]
Calculating Faithfulness: 100%|██████████| 1/1 [00:00<00:00, 732.25it/s]
Calculating Correctness: 100%|██████████| 1/1 [00:01<00:00,  1.89s/it]


In [27]:
for key, val in eval_results.items():
    resp =  dict(val[0])
    print(key, resp['score'], resp['passing'])

semantic 0.9202498699259001 True
answer_relevancy 1.0 None
context_relevancy None None
relevancy 0.0 False
faithfulness 0.0 False
correctness 5.0 True


In [None]:
eval_response, query = chat_with_query("From these wines which one is the most expensive?")


In [None]:
print_chat_interface(eval_response, query)

In [None]:
eval_response, query = chat_with_query("From the Bordeaux wine provided which one is considered a budget wine?")


In [None]:
print_chat_interface(eval_response, query)

In [None]:
G = kg_index.get_networkx_graph(limit=1000)

In [None]:
net = Network(notebook = True, cdn_resources = "remote",
                # bgcolor = "#222222",
                font_color = "black",
                height = "750px",
                width = "100%",
                select_menu = True,
                filter_menu = True,
)
net.show_buttons(filter_="physics")
net.from_nx(G)
net.show("nx.html")

In [None]:
pos = nx.spring_layout(G, seed=42, k=1.5)
labels = nx.get_edge_attributes(G, 'label')
plt.figure(figsize=(20, 20))
nx.draw(G, pos, font_size=8, node_size=200, node_color='lightblue', edge_color='gray', alpha=0.6)
# nx.draw_networkx_edge_labels(G, pos, font_size=3, label_pos=0.3, verticalalignment='baseline')
plt.title('Knowledge Graph')
plt.show()

In [None]:
for edge in G.edges(data=True):
    print(edge)

In [None]:
print(G.number_of_nodes())

In [None]:

eval_dataset_path = Path("./app/data/evaluation/evaluation_evolved.json")

data_generator = DatasetGenerator.from_documents(
    kg_pairings.sample(n=3000),
    service_context=service_context,
    question_gen_query=question_gen_query,
    num_questions_per_chunk=2,
    show_progress = True
)
eval_dataset = data_generator.generate_dataset_from_nodes(130)
eval_dataset.save_json(eval_dataset_path)