In [6]:
import matplotlib.pyplot as plt
from pyvis.network import Network
import networkx as nx
from pathlib import Path
import pandas as pd
import numpy as np

import nest_asyncio

from llama_index.core import Response
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    DatasetGenerator,
    RelevancyEvaluator,
    EvaluationResult,
    CorrectnessEvaluator,
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
    BatchEvalRunner,
    RetrieverEvaluator,
    SemanticSimilarityEvaluator,
    QueryResponseDataset,
)

from notebooks.helpers.bot.promtps import question_gen_query,  EVALUATION_CORRECTNESS_SYSTEM_TEMPLATE
from evaluate_pairings import prepare_evalution_qa, parse_evalutions, get_qr_pairs, evaluate_correctness, evaluate_faithfulness
from app.notebooks.helpers.bot.kg_generation import create_kg_triplets
from notebooks.helpers.bot.bot import (
    get_chat_engine,
    get_query_engine,
    load_llm,
    load_embedding_model,
    setup_index_and_storage,
    generate_pairings_documents,
    service,
)


# Patch asyncio so nested event loops are allowed inside the notebook kernel.
nest_asyncio.apply()
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
def display_eval_df(query: str, response: Response, eval_result: EvaluationResult) -> None:
    """Render one query/response/evaluation triple as a styled one-row table.

    Args:
        query: The question that was asked.
        response: The engine's answer; it is rendered with ``str(response)``.
        eval_result: The evaluator's verdict. Its ``contexts`` are joined into
            the "Source" column and ``passing`` fills "Evaluation Result".

    Returns:
        None. The styled table is shown through IPython's ``display``.
    """
    # Evaluation results can carry ``contexts=None`` (the correctness results
    # in this notebook do); treat that as "no source text" instead of letting
    # ``str.join`` raise TypeError.
    source_text = " ".join(eval_result.contexts or [])
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            # Cap the source excerpt at 1000 characters to keep the cell readable.
            "Source": source_text[:1000] + "...",
            "Evaluation Result": eval_result.passing,
        },
        index=[0],
    )
    styled = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"],
    )
    display(styled)



In [3]:
def parse_triplets(text):
    """Parse newline-separated ``subject**relation**object`` rows into tuples.

    Args:
        text: Rows separated by newlines; inside a row the fields are
            separated by ``**``.

    Returns:
        A list of ``(subject, relation, object)`` tuples, one per row.
        Fields after the third are ignored.

    Raises:
        IndexError: If a row other than the last has fewer than three fields.
    """
    rows = text.split('\n')
    last_position = len(rows) - 1
    triplets = []
    for position, row in enumerate(rows):
        fields = row.split('**')
        # The final row is usually the empty remainder after a trailing
        # newline, so an incomplete final row is skipped as before. A complete
        # final row (text without a trailing newline) is kept instead of being
        # silently dropped.
        if position == last_position and len(fields) < 3:
            continue
        triplets.append((fields[0], fields[1], fields[2]))
    return triplets

# Placeholder list; nothing in this cell fills it.
kg_triplets = []

# Build a directed graph from the rows returned by
# create_kg_triplets(sample_size=10, for_model=False).
KG = create_kg_triplets(sample_size=10, for_model=False)
G = nx.DiGraph()
for _, kg_row in KG.iterrows():
    for head, relation, tail in parse_triplets(kg_row['triplets']):
        # First and third fields become nodes; the middle field labels the edge.
        G.add_edge(head, tail, label=relation)

Size of KG is:  (10362, 2)


In [7]:
# Rebuild KG with create_kg_triplets' default arguments (this rebinds the
# sampled KG from the earlier cell).
KG = create_kg_triplets()
# Turn every KG row into a pairing document (axis=1 passes one row per call).
kg_pairings = KG.apply(generate_pairings_documents, axis=1)

Size of KG is:  (10362, 2)


In [12]:
# Number of pairing documents produced from KG.
len(kg_pairings)

5000

In [8]:
# Show the first pairing document as rendered with metadata_mode='llm'.
print(kg_pairings.iloc[0].get_content(metadata_mode='llm'))

Metadata: first=>mineral
second=>lime
third=>apple
fourth=>peach
fifth=>lemon
-----
Content: The object variety, **Riesling Rheinhessen, Germany** has a relationship of **has_aroma_of** with the object **mineral
The object variety, **Riesling Rheinhessen, Germany** has a relationship of **has_aroma_of** with the object **lime
The object variety, **Riesling Rheinhessen, Germany** has a relationship of **has_aroma_of** with the object **apple
The object variety, **Riesling Rheinhessen, Germany** has a relationship of **has_aroma_of** with the object **peach
The object variety, **Riesling Rheinhessen, Germany** has a relationship of **has_aroma_of** with the object **lemon


In [9]:
# Load the chat model and the embedding model by their project aliases and
# bundle them into the service context shared by the index and evaluators.
llm = load_llm('openai3.5')
embed_model = load_embedding_model("openai3")
service_context = service(llm=llm, embed_model=embed_model)

In [10]:
# Obtain the storage context and knowledge-graph index.
# NOTE(review): kg_pairings=None with force=False presumably loads a previously
# persisted index instead of rebuilding from documents — confirm in
# setup_index_and_storage.
storage_context, kg_index = setup_index_and_storage(
    service=service_context,
    kg_pairings=None,
    show_progress=False,
    force=False,
)


In [14]:
# default_parser was passed to CorrectnessEvaluator below without ever being
# imported, which raises NameError on a fresh kernel.
# NOTE(review): assumed to be llama_index's stock parser — confirm that no
# project-specific parser was intended.
from llama_index.core.evaluation.eval_utils import default_parser

# Retrieval metrics handed to RetrieverEvaluator.from_metric_names.
metrics = ["mrr", "hit_rate"]

# LLM-judged evaluators; all share the notebook's service context.
relevancy_eval = RelevancyEvaluator(service_context=service_context)
faithfulness_eval = FaithfulnessEvaluator(service_context=service_context)
semantic_eval = SemanticSimilarityEvaluator(service_context=service_context)
answer_eval = AnswerRelevancyEvaluator(service_context=service_context)
context_eval = ContextRelevancyEvaluator(service_context=service_context)
# Correctness uses the project's own system template and its own LLM handle.
correctness_eval = CorrectnessEvaluator(
    llm=load_llm("openai3.5"),
    parser_function=default_parser,
    eval_template=EVALUATION_CORRECTNESS_SYSTEM_TEMPLATE,
)
retriever_eval = RetrieverEvaluator.from_metric_names(
    metrics,
    service_context=service_context,
    retriever=kg_index.as_retriever(retriever_mode="hybrid"),
)

# Batch runner for the evaluators that can be run together; faithfulness and
# correctness are evaluated separately in later cells.
runner = BatchEvalRunner(
    {
        "relevancy": relevancy_eval,
        "answer_relevancy": answer_eval,
        "semantic": semantic_eval,
        "context_relevancy": context_eval,
    },
    workers=4,
)


# Run configuration: passed to get_query_engine and recorded by parse_evalutions.
CHAT_MODE = "simple"
RETRIEVER_MODE = "embedding"
RESPONSE_MODE = "compact"

In [3]:
# Build the query engine from the run configuration constants.
# `retriver_mode` is the keyword spelling used at every call site in this
# notebook; presumably it matches the helper's parameter name.
query_engine = get_query_engine(
    kg_index,
    chat_mode=CHAT_MODE,
    retriver_mode=RETRIEVER_MODE,
    response_mode=RESPONSE_MODE,
    use_global_node_triplets=True,
    max_keywords_per_query=10,
    num_chunks_per_query=10,
    similarity_top_k=4,
    graph_store_query_depth=4,
    include_text=False,  # Do not include text of the node into the model
)

# Fetch the evaluation queries with their reference answers, then answer each
# query sequentially with the engine.
queries, references = get_qr_pairs()
responses = [query_engine.query(query) for query in queries]

In [20]:
# Run every evaluator registered on `runner` over the engine responses; the
# reference answers are forwarded to the evaluators as `reference`.
eval_results = runner.evaluate_responses(
    responses = responses,
    queries=queries,
    reference=references,  # type: ignore
    # contexts=references if CHAT_MODE == "simple" else None,  # type: ignore
)

In [21]:
# Faithfulness is evaluated outside the batch runner and stored alongside the
# batch results under its own key.
eval_results["faithfulness"] = evaluate_faithfulness(
    faithfulness_eval=faithfulness_eval,
    queries = queries,
    references = references,
    responses = responses,
)

In [22]:
# Correctness is also evaluated outside the batch runner.
# The key is spelled "correctnes"; the following cell reads it back with the
# same spelling, so both must change together if it is ever corrected.
eval_results["correctnes"] = evaluate_correctness(
    correctness_eval,
    queries,
    references,
    responses = responses,
)

5.0
4.0
5.0
4.0
5.0
4.0
5.0
5.0
3.0
4.5


In [23]:
# Inspect the raw correctness EvaluationResult objects.
eval_results["correctnes"] 

[EvaluationResult(query='What is the alcohol content of the Giant Steps 2014 Applejack Vineyard Pinot Noir from Yarra Valley?', contexts=None, response='Based on my own knowledge, the alcohol content of a typical Pinot Noir from Yarra Valley, such as the Giant Steps 2014 Applejack Vineyard Pinot Noir, is usually around 13-14% ABV.', passing=True, feedback='REASONING:', score=5.0, pairwise_source=None, invalid_result=False, invalid_reason=None),
 EvaluationResult(query='Which winery produced the Pinot Noir wine from Victoria, Australia?', contexts=None, response='Based on my own knowledge, one of the wineries in Victoria, Australia known for producing Pinot Noir wine is Yarra Yering.', passing=True, feedback='REASONING:', score=4.0, pairwise_source=None, invalid_result=False, invalid_reason=None),
 EvaluationResult(query='In which province is the Yarra Valley region located?', contexts=None, response='The Yarra Valley region is located in the state of Victoria, Australia.', passing=True

In [24]:
# Report the collected evaluation results together with the run configuration.
# NOTE(review): `responses` receives the reference answers, not the engine
# responses — confirm this is what parse_evalutions expects.
# NOTE(review): embedding_model is labelled "gpt-3.5" although the embedding
# model was loaded with the alias "openai3" — confirm the label.
parse_evalutions(
    eval_results=eval_results,
    model="gpt-3.5",
    embedding_model="gpt-3.5",
    chat_mode=CHAT_MODE,
    retriever_mode=RETRIEVER_MODE,
    response_mode=RESPONSE_MODE,
    queries=queries,
    responses=references,
)

relevancy Score: 0.9
answer_relevancy Score: 1.0
semantic Score: 0.8
context_relevancy Score: 0.0
faithfulness Score: 0.8
correctnes Score: 0.9


In [None]:
# Faithfulness evaluator used by chat_with_query to judge each chat reply.
evaluator = FaithfulnessEvaluator(service_context=service_context)

# Chat engine over the knowledge-graph index; unlike the query engine above it
# uses the "hybrid" retriever, "context" chat mode, and top-k/depth of 3.
chat_engine = get_chat_engine(
    kg_index,
    response_mode="compact",
    retriver_mode="hybrid",
    chat_mode="context",
    use_global_node_triplets=True,
    max_keywords_per_query=10,
    num_chunks_per_query=10,
    similarity_top_k=3,
    graph_store_query_depth=3,
    include_text=False,
)

def chat_with_query(query):
    """Send ``query`` to the chat engine and evaluate the reply.

    Relies on the notebook-level ``chat_engine`` and ``evaluator`` objects.

    Args:
        query: The question to ask.

    Returns:
        A ``(evaluation, query)`` tuple, where ``evaluation`` is what
        ``evaluator.evaluate_response`` returned for the reply and ``query``
        is the question passed in.
    """
    reply = chat_engine.chat(query)
    evaluation = evaluator.evaluate_response(query, reply)
    return evaluation, query

def print_chat_interface(response, query):
    """Print the query, the evaluation contexts and the answer as a text panel.

    Args:
        response: Object exposing ``contexts`` and ``response`` attributes.
        query: The question that produced ``response``.
    """
    print(
        f"""
    QUERY: {query}\n
    --------------------------\n
    Context: {response.contexts}\n
    ---------------------------\n
    Response: {response.response}
    """
    )


In [None]:
import random


# Draw one (query, reference) pair at random and send that query through the
# chat engine. Note: `responses` holds the reference answers returned by
# get_qr_pairs here, not engine responses.
queries, responses = get_qr_pairs(num=70)
# randrange excludes its upper bound, so the index is always valid.
# random.randint(0, 70) is inclusive and could return 70, one past the end of
# a 70-item list.
indx = random.randrange(len(queries))
query = queries[indx]
reference = responses[indx]
# Ask the sampled query, not a hard-coded one, so that `reference` is the
# answer to the question that was actually asked.
eval_response, query = chat_with_query(query)

In [None]:
# Show the query, retrieved contexts and answer for the chat above.
print_chat_interface(eval_response, query)

In [None]:
# Score the chat answer against the reference answer.
# This rebinds `correctness_eval` to an evaluator with default settings.
correctness_eval = CorrectnessEvaluator(service_context=service_context)
# `eval` shadows the builtin; the name is kept because the next cell reads
# `eval.score`.
eval = correctness_eval.evaluate(
    query=query,
    response=eval_response.response,
    # The keyword must be `reference`: the misspelled `referece` was absorbed
    # as an extra keyword argument, so the answer was scored without any
    # reference.
    reference=reference,
)

In [None]:
# Correctness score of the single evaluated answer.
eval.score

In [None]:
# Ask a fixed question; this rebinds both `eval_response` and `query`.
eval_response, query = chat_with_query("What is the average price for 1990 Chateau Petrus?")


In [None]:
# Show the query, retrieved contexts and answer for the fixed question.
print_chat_interface(eval_response, query)

In [None]:
# Export the index as a networkx graph (limit=1000); this rebinds the `G`
# built from parsed triplets earlier in the notebook.
G = kg_index.get_networkx_graph(limit=1000)

In [None]:
# Interactive pyvis view of G, written to nx.html and shown in the notebook.
net = Network(notebook = True, cdn_resources = "remote",
                # bgcolor = "#222222",
                font_color = "black",
                height = "750px",
                width = "100%",
                select_menu = True,
                filter_menu = True,
)
# Expose only the physics section of the configuration panel.
net.show_buttons(filter_="physics")
net.from_nx(G)
net.show("nx.html")

In [None]:
# Static matplotlib rendering of G with a seeded spring layout, so node
# positions are repeatable between runs.
pos = nx.spring_layout(G, seed=42, k=1.5)
# Edge labels are collected but not drawn; the call that would draw edge
# labels is commented out below.
labels = nx.get_edge_attributes(G, 'label')
plt.figure(figsize=(20, 20))
nx.draw(G, pos, font_size=8, node_size=200, node_color='lightblue', edge_color='gray', alpha=0.6)
# nx.draw_networkx_edge_labels(G, pos, font_size=3, label_pos=0.3, verticalalignment='baseline')
plt.title('Knowledge Graph')
plt.show()

In [None]:
# Print every edge with its attribute dict; output grows with the graph size.
for edge in G.edges(data=True):
    print(edge)

In [None]:
# Number of nodes in the exported graph.
print(G.number_of_nodes())

In [14]:

# Generate a question/answer evaluation set from a sample of the pairing
# documents and persist it as JSON.
eval_dataset_path = Path("./app/data/evaluation/evaluation_evolved.json")
# Create the output directory up front so the run cannot fail at save time
# after the LLM calls have already been made.
eval_dataset_path.parent.mkdir(parents=True, exist_ok=True)

data_generator = DatasetGenerator.from_documents(
    # A fixed random_state makes the 3000-document sample repeatable across
    # runs; tolist() hands the generator a plain list instead of a Series.
    kg_pairings.sample(n=3000, random_state=42).tolist(),
    service_context=service_context,
    question_gen_query=question_gen_query,
    num_questions_per_chunk=2,
    show_progress = True
)
# 130 is the positional argument passed to generate_dataset_from_nodes
# (presumably the number of questions to generate — confirm against the API).
eval_dataset = data_generator.generate_dataset_from_nodes(130)
eval_dataset.save_json(eval_dataset_path)

Parsing nodes:   0%|          | 0/3000 [00:00<?, ?it/s]

  return cls(
 38%|███▊      | 49/130 [00:12<00:15,  5.20it/s]Retrying llama_index.llms.openai.base.OpenAI._achat in 0.646428048293021 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-0125 in organization org-rSW8VTGvcLlOXkwlKwR4NiVf on tokens per min (TPM): Limit 60000, Used 57796, Requested 2365. Please try again in 161ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5547895815040618 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-0125 in organization org-rSW8VTGvcLlOXkwlKwR4NiVf on tokens per min (TPM): Limit 60000, Used 57720, Requested 2390. Please try again in 110ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
R