In [22]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered: the notebook kernel already
# runs a loop, and libraries that start their own would otherwise fail.
nest_asyncio.apply()

In [23]:
import getpass
import os

import pandas as pd
from llama_index.llms.openai import OpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Turn on LangChain tracing for the components used in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Credentials are never hard-coded. A key that is already exported in the
# environment is kept as-is; a missing (or empty) one is requested through a
# no-echo prompt so it never lands in the saved notebook. Previously both keys
# were overwritten with "", which clobbered any exported value.
for _secret_name in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass.getpass(f"Enter {_secret_name}: ")

In [24]:
from llama_index.core import SimpleDirectoryReader
# Read every file found under the local "data" directory into documents.
documents = SimpleDirectoryReader(input_dir="data").load_data()

In [25]:
# Load the question set; only the "Question" column is used as queries.
df = pd.read_csv(filepath_or_buffer="questions/Lyft2021_100_questions.csv")
queries = df.loc[:, "Question"].to_list()

In [26]:
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter

# Sentence-window parser: each node keeps its surrounding sentences in the
# "window" metadata entry and its own sentence in "original_text"
# (window_size=3 controls how many neighbouring sentences are captured).
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Baseline chunker with library defaults, used for the comparison index.
text_splitter = SentenceSplitter()

# temperature=0.0 keeps generation and LLM-based evaluation as repeatable as
# the API allows.
llm = OpenAI(model="gpt-4o-mini", temperature=0.0)

# The LangChain HuggingFaceEmbeddings model rejects undeclared fields, so the
# former `max_length=512` argument raised a validation error at construction.
# NOTE(review): truncation length now falls back to the model's own default —
# confirm that is acceptable for these documents.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Register the shared defaults so indexes and query engines built later pick
# them up without being passed explicitly.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

In [None]:
from llama_index.core import VectorStoreIndex

# Parse the same documents twice: once with the sentence-window parser and
# once with the plain sentence splitter.
nodes = node_parser.get_nodes_from_documents(documents)
base_nodes = text_splitter.get_nodes_from_documents(documents)

# One vector index per node set.
sentence_index = VectorStoreIndex(nodes=nodes)
base_index = VectorStoreIndex(nodes=base_nodes)

In [None]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

In [None]:
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator

# Number of questions to evaluate; each one triggers several LLM calls.
NUM_EVAL_QUERIES = 10

# NOTE(review): this cell used to reference `vector_index_chunk` and
# `query_engine_chunk`, neither of which is defined anywhere in the notebook,
# so it failed with NameError on a fresh kernel. The engine is now built from
# `base_index` (the SentenceSplitter chunks) — confirm this is the index the
# evaluation is meant to target.
query_engine_chunk = base_index.as_query_engine()

# Both evaluators are LLM judges: they are configured with an LLM, not with an
# index (the retrieved context is read from the response itself).
f_evaluator = FaithfulnessEvaluator(llm=llm)
r_evaluator = RelevancyEvaluator(llm=llm)
f_scores = []
r_scores = []

for query in queries[:NUM_EVAL_QUERIES]:
    response = query_engine_chunk.query(query)

    # Without retrieved source nodes there is no context to judge against.
    if not response or not response.source_nodes:
        print("skipping")
        continue

    f_score = f_evaluator.evaluate_response(response=response).score
    r_score = r_evaluator.evaluate_response(query=query, response=response).score

    # The score field is optional; skip the pair rather than crash on None so
    # both lists always stay the same length.
    if f_score is None or r_score is None:
        print("skipping")
        continue

    # Scores are scaled by 100 so the averages read as percentages.
    f_scores.append(f_score * 100)
    r_scores.append(r_score * 100)

# Report NaN instead of raising ZeroDivisionError when every query was skipped.
avg_f = sum(f_scores) / len(f_scores) if f_scores else float("nan")
avg_r = sum(r_scores) / len(r_scores) if r_scores else float("nan")

print("Average Faithfulness Score:", avg_f)
print("Average Relevancy Score:", avg_r)