In [None]:
import matplotlib.pyplot as plt
from pyvis.network import Network
import networkx as nx
from pathlib import Path
import pandas as pd
import numpy as np

import nest_asyncio

from llama_index.core import Response
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    DatasetGenerator,
    RelevancyEvaluator,
    EvaluationResult,
    CorrectnessEvaluator,
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
    BatchEvalRunner,
    RetrieverEvaluator,
    SemanticSimilarityEvaluator,
    QueryResponseDataset,
)

from notebooks.helpers.bot.promtps import  (ANSWER_REL_EVAL_TEMPLATE,
    question_gen_query,
    EVALUATION_CORRECTNESS_SYSTEM_TEMPLATE,
    FAITH_EVAL_TEMPLATE,
    CONTEXT_REL_PROMPT, EVALUATION_CORRECTNESS_SYSTEM_TEMPLATE)
from evaluate_pairings import prepare_evalution_qa, parse_evalutions, get_qr_pairs, evaluate_correctness, evaluate_faithfulness, default_parser
from app.notebooks.helpers.bot.kg_generation import create_kg_triplets
from notebooks.helpers.bot.bot import (
    get_chat_engine,
    get_query_engine,
    load_llm,
    load_embedding_model,
    setup_index_and_storage,
    generate_pairings_documents,
    service,
)


# Patch asyncio so an event loop can be re-entered from inside the notebook's
# already-running loop (presumably needed by the async evaluation calls below).
nest_asyncio.apply()
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
def display_eval_df(query: str, response: Response, eval_result: EvaluationResult) -> None:
    """Render one query/response/evaluation triple as a styled single-row table.

    Args:
        query: The question that was asked.
        response: The engine's answer; it is shown via ``str(response)``.
        eval_result: Evaluation outcome; its ``contexts`` and ``passing``
            attributes are read.

    The joined contexts are cut to their first 1000 characters and always
    suffixed with "...".
    """
    # ``contexts`` can be unset on an evaluation result; treat that as "no
    # sources" instead of letting ``str.join`` fail on ``None``.
    contexts = eval_result.contexts or []
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": (' '.join(contexts)[:1000] + "..."),
            "Evaluation Result": eval_result.passing,
        },
        index = [0]
    )
    # Constrain the two long-text columns so they wrap instead of stretching the table.
    styled = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(styled)



In [None]:
def parse_triplets(text):
    """Parse newline-separated ``subject**predicate**object`` rows into tuples.

    Args:
        text: Raw triplet text, one triplet per line, fields separated by ``**``.

    Returns:
        A list of ``(subject, predicate, object)`` tuples in input order. Blank
        lines and rows with fewer than three ``**``-separated fields are
        skipped; fields beyond the third are ignored.
    """
    triplets = []
    for row in text.split('\n'):
        # Skip blank rows (such as the empty tail after a trailing newline)
        # rather than unconditionally dropping the last row, which would lose
        # the final triplet whenever the text does not end with a newline.
        if not row.strip():
            continue
        fields = row.split('**')
        if len(fields) < 3:
            # Malformed row: not enough fields to form a triplet.
            continue
        triplets.append((fields[0], fields[1], fields[2]))
    return triplets

# Placeholder list; nothing in this cell fills it.
kg_triplets = []

# Build a directed graph from a triplet table requested with sample_size=10:
# each edge runs subject -> object and carries the predicate as its "label".
KG = create_kg_triplets(sample_size=10, for_model=False)
G = nx.DiGraph()
for _, kg_row in KG.iterrows():
    for subject, predicate, obj in parse_triplets(kg_row['triplets']):
        G.add_edge(subject, obj, label=predicate)

In [None]:
# Rebuild the triplet table with default arguments (this rebinds `KG`) and map
# every row through generate_pairings_documents.
KG = create_kg_triplets()
kg_pairings = KG.apply(generate_pairings_documents, axis=1)

In [None]:
# Number of entries produced by the row-wise apply.
len(kg_pairings)

In [None]:
# Show the first pairing's content as rendered with metadata_mode='llm'.
print(kg_pairings.iloc[0].get_content(metadata_mode='llm'))

In [None]:
# Load the LLM and the embedding model by their helper aliases and bundle them
# into the service context that the index and the evaluators below receive.
llm = load_llm('openai3.5')
embed_model = load_embedding_model("openai3")
service_context = service(llm=llm, embed_model=embed_model)

In [None]:
# Obtain the storage context and the knowledge-graph index.
# NOTE(review): kg_pairings=None with force=False presumably loads a previously
# persisted index instead of rebuilding it — confirm in setup_index_and_storage.
storage_context, kg_index = setup_index_and_storage(
    service=service_context,
    kg_pairings=None,
    show_progress=False,
    force=False,
)


In [None]:
# One evaluator per metric. All but the correctness evaluator share the
# notebook's service context; two of them use project-specific prompt templates.
relevancy_eval = RelevancyEvaluator(service_context=service_context)
faithfulness_eval = FaithfulnessEvaluator(
    service_context=service_context, eval_template=FAITH_EVAL_TEMPLATE
)
semantic_eval = SemanticSimilarityEvaluator(service_context=service_context)
answer_eval = AnswerRelevancyEvaluator(
    service_context=service_context,
    eval_template=ANSWER_REL_EVAL_TEMPLATE,
    score_threshold=3.0,
)
context_eval = ContextRelevancyEvaluator(
    service_context=service_context,
)
# The correctness evaluator gets its own freshly loaded LLM (same alias as `llm`)
# plus the project's output parser and system template.
correctness_eval = CorrectnessEvaluator(
    llm=load_llm("openai3.5"),
    parser_function=default_parser,
    eval_template=EVALUATION_CORRECTNESS_SYSTEM_TEMPLATE,
)

# The batch runner covers four of the six metrics; faithfulness and correctness
# are evaluated through their own helper calls in later cells.
runner = BatchEvalRunner(
    {
        "relevancy": relevancy_eval,
        "answer_relevancy": answer_eval,
        "semantic": semantic_eval,
        "context_relevancy": context_eval,
    },
    workers=6,
    show_progress=True,
)


# Engine modes for this run; also passed to parse_evalutions when results are recorded.
CHAT_MODE = "context"
RETRIEVER_MODE = "keyword"
RESPONSE_MODE = "compact"

In [None]:
# Build the query engine under evaluation from the knowledge-graph index.
query_engine = get_query_engine(
    kg_index,
    chat_mode=CHAT_MODE,
    retriver_mode=RETRIEVER_MODE,
    response_mode=RESPONSE_MODE,
    use_global_node_triplets=False,
    max_keywords_per_query=10,
    num_chunks_per_query=10,
    similarity_top_k=4,
    graph_store_query_depth=2,
    include_text=True,  # NOTE(review): text inclusion is ON here, while the chat engine cell passes include_text=False — confirm which is intended
)


# Fetch the query/reference pairs and answer every query sequentially.
queries, references = get_qr_pairs()
# queries = queries[:10]
responses = [query_engine.query(query) for query in queries]

In [None]:
# Run every evaluator registered on the batch runner over the collected responses;
# `references` is forwarded under the `reference` keyword.
eval_results = runner.evaluate_responses(
    responses = responses,
    queries=queries,
    reference=references,  # type: ignore
)

In [None]:
# Faithfulness is computed by the project helper and stored next to the batch results.
eval_results["faithfulness"] = evaluate_faithfulness(
    faithfulness_eval=faithfulness_eval,
    queries = queries,
    references = references,
    responses = responses,
)

In [None]:
# Correctness is computed by the project helper and stored next to the batch results.
# NOTE(review): the key is spelled "correctnes"; confirm whether parse_evalutions
# relies on this exact spelling before renaming it.
eval_results["correctnes"] = evaluate_correctness(
    correctness_eval,
    queries,
    references,
    responses = responses,
)

In [None]:
# Hand the collected results and the run configuration to the project's parser.
# NOTE(review): `responses` receives the reference answers, not the engine
# responses — confirm this is what parse_evalutions expects.
# NOTE(review): embedding_model is labelled "gpt-3.5" although the embedding
# model was loaded with the alias "openai3" — confirm the label.
parse_evalutions(
    eval_results=eval_results,
    model="gpt-3.5",
    embedding_model="gpt-3.5",
    chat_mode=CHAT_MODE,
    retriever_mode=RETRIEVER_MODE,
    response_mode=RESPONSE_MODE,
    queries=queries,
    responses=references,
)

In [None]:
# Faithfulness evaluator used by chat_with_query; unlike `faithfulness_eval`
# it is created without a custom eval_template.
evaluator = FaithfulnessEvaluator(service_context=service_context)

# Chat engine for interactive use. Its settings are hard-coded here and differ
# from the query engine cell (hybrid retriever, global node triplets, depth 3,
# top-k 3, no node text).
chat_engine = get_chat_engine(
    kg_index,
    response_mode="compact",
    retriver_mode="hybrid",
    chat_mode="context",
    use_global_node_triplets=True,
    max_keywords_per_query=10,
    num_chunks_per_query=10,
    similarity_top_k=3,
    graph_store_query_depth=3,
    include_text=False,
)

def chat_with_query(query):
    """Send ``query`` to the chat engine and evaluate the reply.

    Uses the notebook-level ``chat_engine`` and ``evaluator`` objects.

    Returns:
        A ``(evaluation, query)`` tuple: the evaluator's result for the reply,
        followed by the query that was passed in.
    """
    reply = chat_engine.chat(query)
    return evaluator.evaluate_response(query, reply), query

def print_chat_interface(response, query):
    """Print the query, the contexts and the response text as a plain-text panel.

    ``response`` must expose ``contexts`` and ``response`` attributes.
    """
    print(f"""
    QUERY: {query}\n
    --------------------------\n
    Context: {response.contexts}\n
    ---------------------------\n
    Response: {response.response}
    """)


In [None]:
import random


# Fetch query/reference pairs (requested with num=70) and pick one at random.
# NOTE: this rebinds the notebook-level `queries` and `responses`; here
# `responses` holds the second element returned by get_qr_pairs.
queries, responses = get_qr_pairs(num=70)
# randrange excludes its upper bound, so the index always stays inside the list;
# random.randint(0, 70) is inclusive and could return 70, one past a 70-item list.
indx = random.randrange(len(queries))
query = queries[indx]
reference = responses[indx]
# NOTE(review): the hard-coded question below replaces `query` (chat_with_query
# returns the query it was given), so `reference` no longer belongs to `query`.
eval_response, query = chat_with_query("What is the average price for 1990 Chateau Petrus?")

In [None]:
# Print the query, contexts and response text of the evaluated chat reply.
print_chat_interface(eval_response, query)

In [None]:
# Score the chat answer with a default-configured correctness evaluator
# (this rebinds the notebook-level `correctness_eval`).
correctness_eval = CorrectnessEvaluator(service_context=service_context)
# The reference answer must be passed under the keyword `reference`; under any
# other spelling it does not reach the evaluator's reference parameter.
# `eval` shadows the builtin but is kept because the next cell reads `eval.score`.
eval = correctness_eval.evaluate(query = query, response=eval_response.response, reference=reference)

In [None]:
# Display the score of the correctness evaluation (`eval` is the notebook variable, not the builtin).
eval.score

In [None]:
# Ask the same hard-coded question again and evaluate the new reply.
eval_response, query = chat_with_query("What is the average price for 1990 Chateau Petrus?")


In [None]:
# Print the query, contexts and response text of the second chat reply.
print_chat_interface(eval_response, query)

In [None]:
# Export the index as a networkx graph (limit=1000); this rebinds `G`, replacing
# the sample graph built from parsed triplets earlier in the notebook.
G = kg_index.get_networkx_graph(limit=1000)

In [None]:
# Interactive pyvis view of `G`, written to nx.html.
net = Network(notebook = True, cdn_resources = "remote",
                # bgcolor = "#222222",
                font_color = "black",
                height = "750px",
                width = "100%",
                select_menu = True,
                filter_menu = True,
)
# Expose only the physics controls in the rendered page.
net.show_buttons(filter_="physics")
# Copy nodes and edges from the networkx graph, then render.
net.from_nx(G)
net.show("nx.html")

In [None]:
# Static matplotlib rendering of `G`; the layout is seeded so positions repeat between runs.
pos = nx.spring_layout(G, seed=42, k=1.5)
# Edge labels are collected but only consumed by the commented-out call below.
labels = nx.get_edge_attributes(G, 'label')
plt.figure(figsize=(20, 20))
nx.draw(G, pos, font_size=8, node_size=200, node_color='lightblue', edge_color='gray', alpha=0.6)
# nx.draw_networkx_edge_labels(G, pos, font_size=3, label_pos=0.3, verticalalignment='baseline')
plt.title('Knowledge Graph')
plt.show()

In [None]:
# Print every edge as a (source, target, attribute-dict) tuple, one per line.
for edge in G.edges(data=True):
    print(edge)

In [None]:
# Number of nodes in `G`.
print(G.number_of_nodes())

In [None]:

# Output location for the generated evaluation set (relative to the working directory).
eval_dataset_path = Path("./app/data/evaluation/evaluation_evolved.json")

# Generate questions from a random sample of the pairing documents.
# NOTE(review): sample(n=3000) is unseeded, so the sampled documents differ on
# every run, and it presumably fails when fewer than 3000 pairings exist — confirm.
data_generator = DatasetGenerator.from_documents(
    kg_pairings.sample(n=3000),
    service_context=service_context,
    question_gen_query=question_gen_query,
    num_questions_per_chunk=2,
    show_progress = True
)
# Build the dataset (called with 130) and write it to disk as JSON.
eval_dataset = data_generator.generate_dataset_from_nodes(130)
eval_dataset.save_json(eval_dataset_path)