In [2]:
from dotenv import load_dotenv
# Load environment variables through python-dotenv before any client is built.
# NOTE(review): later cells construct OpenAI() without explicit credentials, so
# they presumably rely on the variables loaded here — confirm.
load_dotenv()

True

# Create vector store

In [11]:
from llama_index.core import Settings
from qdrant_client import QdrantClient
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.embeddings import resolve_embed_model
from utils import load_documents, get_nodes, create_index

# Set the embedding model globally: indexing and retrieval must embed text with
# the same model, otherwise the stored vectors and query vectors are not comparable.
Settings.embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")
DOCUMENTS_PATH = "./test_data"
DB_PATH = '../qdrant_data'
DB_COLLECTION_NAME = "paul_graham"

# Load the source documents
documents = load_documents(DOCUMENTS_PATH)

# Create document nodes from the loaded documents
nodes = get_nodes(documents)

# Create the Qdrant client, backed by local storage at DB_PATH
client = QdrantClient(path=DB_PATH)

# Create the vector store and index (embeds documents and stores them).
# The collection name comes from DB_COLLECTION_NAME so it is defined in one place.
# NOTE(review): only a synchronous client is supplied. Code paths that retrieve
# asynchronously presumably also need `aclient=`; verify whether local on-disk
# storage allows a second (async) client before adding one.
vector_store = QdrantVectorStore(
    DB_COLLECTION_NAME, client=client, enable_hybrid=True, batch_size=20
)
index = create_index(nodes, vector_store)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

# Configure database retriever and query engine

In [12]:
from llama_index.llms.ollama import Ollama
from utils import create_retriever, configure_query_engine

# Number of nodes the retriever returns per query. Kept as a named constant
# because the evaluation summary label is built from SIMILARITY_TOP_K as well.
SIMILARITY_TOP_K = 5

# Choose Generation LLM
llm = Ollama(model="gemma:2b", request_timeout=30.0)

# Build the retriever and the query engine on top of the index
retriever = create_retriever(index, similarity_top_k=SIMILARITY_TOP_K)
query_engine = configure_query_engine(index, llm, retriever)

In [13]:
# Ask the same question twice: first through the retriever alone (to inspect
# the top retrieved node), then through the full query engine.
question = "What are the problems with HN?"

retrieved_nodes = list(retriever.retrieve(question))
print(retrieved_nodes[0])

response = query_engine.query(question)
print(response)

Node ID: 5d3fcfaf-1e7d-4c4b-a6cb-af4184d3ae8a
Text: I had not originally intended YC to be a full-time job. I was
going to do three things: hack, write essays, and work on YC. As YC
grew, and I grew more excited about it, it started to take up a lot
more than a third of my attention. But for the first few years I was
still able to work on other things.  In the summer of 2006, Robert and
I started...
Score:  0.634

The problems with HN were a bizarre edge case that occurs when you both write essays and run a forum. When you run a forum, you're assumed to see if not every conversation, at least every conversation involving you. And when you write essays, people post highly imaginative misinterpretations of them on forums. Individually these two phenomena are tedious but bearable, but the combination is disastrous.


# Pipeline Evaluation

### Retrieval Evaluation

In [6]:
import os
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

# TODO: Upgrade to gpt-4 for final evaluation
evaluation_llm = OpenAI()

qa_dataset_path = "qa_dataset_2.json"

# Generate the question/context pairs only when no saved dataset exists yet,
# so re-running the cell does not call the evaluation LLM again.
dataset_is_cached = os.path.exists(qa_dataset_path)
if not dataset_is_cached:
    generate_question_context_pairs(
        nodes, llm=evaluation_llm, num_questions_per_chunk=2
    ).save_json(qa_dataset_path)

# Always read the dataset back from the JSON file, whether it was just
# generated or already present.
qa_dataset = EmbeddingQAFinetuneDataset.from_json(qa_dataset_path)


In [7]:
# Peek at the first (query id, query text) pair of the dataset
first_query_item = list(qa_dataset.queries.items())[0]
print(first_query_item)

('bb7e4ad4-6955-4e1d-aea4-08e9b0b0b2a1', "How did the author's experience with writing short stories in their youth compare to their experience with programming on the IBM 1401 in 9th grade?")


In [10]:
import nest_asyncio
from llama_index.core.evaluation import RetrieverEvaluator

# Patch asyncio via nest_asyncio before running the evaluator in the notebook
nest_asyncio.apply()

# Use the first query of the dataset as a smoke test for the evaluator
first_item = list(qa_dataset.queries.items())[0]
sample_id, sample_query = first_item
sample_expected = qa_dataset.relevant_docs[sample_id]
for value in (sample_expected[0], sample_id, sample_query):
    print(value)

# Build an evaluator that scores the retriever on MRR and hit rate
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

# NOTE(review): the recorded run of this cell printed the sample above and then
# failed with "AttributeError: 'NoneType' object has no attribute 'search_batch'".
# Presumably the evaluator retrieves asynchronously and the vector store has no
# async client configured — confirm against the vector store setup.
eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)

40705139-1516-4761-91e1-9c4803314e39
bb7e4ad4-6955-4e1d-aea4-08e9b0b0b2a1
How did the author's experience with writing short stories in their youth compare to their experience with programming on the IBM 1401 in 9th grade?


AttributeError: 'NoneType' object has no attribute 'search_batch'

In [None]:
# Testing retriever evaluator on entire dataset
import nest_asyncio
nest_asyncio.apply()
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [None]:
import pandas as pd

# TODO: Create function that takes top-k as an input and returns the results with modified retriever

def display_results(name, eval_results, metrics=("hit_rate", "mrr")):
    """Summarise retrieval evaluation results as a one-row DataFrame.

    Args:
        name: Label for the evaluated retriever configuration; stored in the
            "retrievers" column.
        eval_results: Iterable of result objects, each exposing a
            ``metric_vals_dict`` mapping of metric name to value.
        metrics: Names of the metrics to average, in output column order.
            Defaults to ``("hit_rate", "mrr")``, which reproduces the original
            fixed summary; pass extra names to report additional metrics.

    Returns:
        A pandas DataFrame with a single row: the "retrievers" column followed
        by one column per requested metric holding that metric's mean.

    Raises:
        KeyError: If a requested metric is missing from the results, which
            includes the case where ``eval_results`` is empty.
    """
    # One row per evaluated query, one column per metric.
    full_df = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    columns = {"retrievers": [name]}
    for metric in metrics:
        columns[metric] = [full_df[metric].mean()]

    return pd.DataFrame(columns)

# Label the summary row with the retriever's top-k setting; SIMILARITY_TOP_K
# must be defined in an earlier cell.
results_label = f"top-{SIMILARITY_TOP_K} eval"
display_results(results_label, eval_results)