In [None]:
import sys

# Make the local project checkout importable. The membership scan keeps
# repeated runs of this cell from appending the same entry twice.
if not any(entry == '/Users/nehiljain/code/find-your-mate-ai/src' for entry in sys.path):
    sys.path.append('/Users/nehiljain/code/find-your-mate-ai/src')

import nest_asyncio
# Applied once, up front. Presumably needed so the async evaluation call
# (`await ...aevaluate_dataset`) further down can run inside the notebook's
# already-running event loop -- TODO confirm it is still required.
nest_asyncio.apply()

In [None]:
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from find_your_mate_ai.config import settings
from find_your_mate_ai.data_ingestion import *

# Fetch the stored nodes from MongoDB and keep only the first five, so the
# rest of the notebook works on a small sample.
nodes = load_nodes_from_mongodb(settings.MONGO_URI)[:5]

In [None]:
# Display the node sample this notebook works with.
nodes

In [None]:
from find_your_mate_ai.data_ingestion import *
# Generate synthetic questions for the sampled nodes.
# NOTE(review): "/tmp" is presumably the directory the helper writes its
# output to -- confirm against find_your_mate_ai.data_ingestion.
qa_df = generate_synthetic_questions_data(nodes, "/tmp")
qa_df


In [None]:
import os

from llama_index.llms.ollama import Ollama

# Export the API key *before* any OpenAI-backed object is created. The original
# cell assigned `openai.api_key`, but `openai` is never imported in this
# notebook (it only worked if a star import happened to leak the name), and the
# assignment came after `OpenAI()` had already been constructed.
# llama_index's OpenAI LLM and embedding clients fall back to the
# OPENAI_API_KEY environment variable when no key is passed explicitly.
if settings.OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = settings.OPENAI_API_KEY

# llm = Ollama(model="llama3")
llm = OpenAI()

# Index the sampled nodes and expose a retriever returning the 5 most similar
# nodes per query.
vector_index = VectorStoreIndex(nodes)
retriever = vector_index.as_retriever(similarity_top_k=5)

In [None]:
# Smoke-test the retriever with one sample query; the hits are rendered by the
# display loop in the next cell.
retrieved_nodes = retriever.retrieve("Who are some candidates who have worked at facebook, meta, google or linkedin?")


In [None]:
from llama_index.core.response.notebook_utils import display_source_node

# For every hit: print the originating file name, render the node (passing
# source_length=1000), then close with a two-line horizontal rule.
for node in retrieved_nodes:
    print(node.metadata['file_name'])
    display_source_node(node, source_length=1000)
    print("-"*100, "-"*100, sep="\n")



In [None]:
# Peek at the first node's text and id -- the same two attributes the curated
# dataset code below uses to map contexts back to nodes.
nodes[0].text, nodes[0].doc_id



In [None]:
# Importing necessary libraries
import pandas as pd
from thefuzz import fuzz
import uuid

# Define the path to the labeled data from Label Studio
# NOTE(review): absolute path into one user's Downloads folder -- this cell
# only runs on the machine that holds that export.
labeled_data_path = '/Users/nehiljain/Downloads/project-1-at-2024-05-10-23-13-9a0696b7.json'

# Load the labeled data into a DataFrame
# The code below reads the 'annotations' and 'data' columns of this frame, and
# the 'answer_contents' key inside each 'data' entry.
df = pd.read_json(labeled_data_path)

# Collect the annotator-written questions of one labeled task
def extract_curated_questions(row):
    """Gather every 'question' annotation text of a row into its 'data' entry.

    Walks ``row['annotations'][*]['result'][*]`` and keeps the entries whose
    ``from_name`` is ``'question'``. Their ``value['text']`` may be a single
    string or a list of strings; both forms are flattened into one list, in
    encounter order.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an
            'annotations' sequence and a mutable 'data' mapping.

    Returns:
        The same ``row`` object; ``row['data']['curated_questions']`` is set
        in place to the collected list (empty when nothing matched).
    """
    collected = []
    for annotation in row['annotations']:
        for result in annotation['result']:
            # Only results produced by the 'question' control are relevant.
            if result['from_name'] != 'question':
                continue
            texts = result['value']['text']
            if isinstance(texts, list):
                collected.extend(texts)
            else:
                collected.append(texts)

    row['data']['curated_questions'] = collected
    return row

# Run the extractor over every labeled task (one task per DataFrame row)
df = df.apply(extract_curated_questions, axis=1)

# Pair each curated question of a task with each answer context of that same
# task: one {'question', 'context'} record per combination
curated_data = [
    {'question': curated_question, 'context': answer_context}
    for task in df['data']
    for curated_question in task['curated_questions']
    for answer_context in task['answer_contents']
]

# Exact node text -> node id lookup; the fuzzy comparison against these texts
# happens later, in find_best_matching_doc_id
context_to_doc_id = {node.text: node.doc_id for node in nodes}

# Fuzzy lookup from a labeled context snippet to the id of a known text
def find_best_matching_doc_id(context, context_to_doc_id):
    """Return the doc id whose text scores highest against ``context``.

    Each key of ``context_to_doc_id`` is scored with ``fuzz.partial_ratio``.
    Only a strictly higher score replaces the current winner, so the first of
    several equally scored texts is kept. The scan stops at the first score
    above 95, which means a later, even better match is not considered.

    Args:
        context: Text to look up.
        context_to_doc_id: Mapping of candidate text -> doc id.

    Returns:
        The doc id of the winning text, or ``None`` when the mapping is empty
        or no text scores above 0.
    """
    winner, top_score = None, 0
    for candidate_text, candidate_id in context_to_doc_id.items():
        score = fuzz.partial_ratio(context, candidate_text)
        if score <= top_score:
            continue
        winner, top_score = candidate_id, score
        if top_score > 95:
            # Good enough -- skip the remaining candidates.
            return winner
    return winner

# Imported here so this cell does not depend on a later cell (or a star
# import) having brought the class into scope -- otherwise a fresh
# "Restart & Run All" can fail with a NameError at the constructor call below.
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# Tag every curated pair with the id of the node whose text best matches its
# context (None when nothing matched at all)
for item in curated_data:
    item['context_id'] = find_best_matching_doc_id(item['context'], context_to_doc_id)

# Prepare data structures for the EmbeddingQAFinetuneDataset
queries = {}        # query id -> question text
corpus = {}         # doc id -> context text
relevant_docs = {}  # query id -> list of doc ids expected to be retrieved

# Populate the data structures from the first 15 curated pairs
for item in curated_data[:15]:
    doc_id = item['context_id']
    if doc_id is None:
        # No node matched this context; a None id cannot be a valid retrieval
        # target, so leave the pair out of the dataset.
        continue
    # Each pair gets a fresh id, so there is never an existing entry to extend.
    query_id = uuid.uuid4().hex
    queries[query_id] = item['question']
    corpus[doc_id] = item['context']
    relevant_docs[query_id] = [doc_id]

# Create an instance of EmbeddingQAFinetuneDataset with the prepared data
embedding_qa_dataset = EmbeddingQAFinetuneDataset(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs
)

# Display the dataset
embedding_qa_dataset


In [None]:
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
# llama3 = Ollama(model="llama3")
# Have GPT-4 write two questions per node and collect them, together with the
# node contexts, into a synthetic question/context dataset.
llm = OpenAI(model="gpt-4")
qa_dataset = generate_question_context_pairs(nodes, llm=llm, num_questions_per_chunk=2)

In [None]:
# `Path` is not imported anywhere else in this notebook; import it here so the
# cell does not depend on a star import leaking the name.
from pathlib import Path

# NOTE: this rebinds `queries`, which an earlier cell used for the curated
# dataset's id -> question dict.
queries = qa_dataset.queries.values()

# Persist the generated dataset so it can be reloaded without calling the LLM again.
output_path = Path("/tmp/qa_dataset.json")
qa_dataset.save_json(output_path)
qa_dataset



In [None]:
# Reload the dataset from the file the save cell writes
# ("/tmp/qa_dataset.json"). The previous target, "pg_eval_dataset.json", is
# never produced by this notebook.
qa_dataset = EmbeddingQAFinetuneDataset.from_json("/tmp/qa_dataset.json")


In [None]:
include_cohere_rerank = False
from llama_index.core.evaluation import RetrieverEvaluator
from find_your_mate_ai.config import settings
import os

# MRR and hit rate are always scored; the Cohere rerank relevancy metric is
# appended only when the flag above is switched on.
metrics = ["mrr", "hit_rate"] + (
    ["cohere_rerank_relevancy"]  # requires COHERE_API_KEY environment variable to be set
    if include_cohere_rerank
    else []
)

retriever_evaluator = RetrieverEvaluator.from_metric_names(metrics, retriever=retriever)

In [None]:

# Score the retriever against the hand-curated dataset (top-level `await`).
# Despite the singular name, `eval_result` is treated as a collection of
# per-query results by the cells that follow (iterated and sorted).
eval_result = await retriever_evaluator.aevaluate_dataset(embedding_qa_dataset)
print(eval_result)

In [None]:
# try it out on an entire dataset
# eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
import pandas as pd


def display_results(name, eval_results):
    """Summarise per-query retrieval metrics as a one-row DataFrame.

    Args:
        name: Label for the retriever, stored in the "retrievers" column.
        eval_results: Iterable of objects exposing ``metric_vals_dict``, a
            mapping of metric name -> value for one query.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns "retrievers",
        "hit_rate" and "mrr" (the latter two are means over all results),
        plus "cohere_rerank_relevancy" when that metric is present.

    Raises:
        KeyError: if the results carry no "hit_rate" or "mrr" values, which
            includes an empty ``eval_results``.
    """
    full_df = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    columns = {
        "retrievers": [name],
        "hit_rate": [full_df["hit_rate"].mean()],
        "mrr": [full_df["mrr"].mean()],
    }

    # Decide from the data itself rather than from a notebook-level flag, so
    # the function gives the right answer no matter which cells ran before it.
    if "cohere_rerank_relevancy" in full_df.columns:
        columns["cohere_rerank_relevancy"] = [full_df["cohere_rerank_relevancy"].mean()]

    return pd.DataFrame(columns)

In [None]:
# The retriever under evaluation is built with similarity_top_k=5, so label the
# row accordingly (it was previously labelled "top-2").
display_results("top-5 eval", eval_result)

In [None]:
# Let's investigate one of the weakest queries
import textwrap
from pprint import pprint

# Order the per-query results by MRR ascending; among equal MRR values the
# higher hit rate comes first. Then inspect the second entry of that order.
sorted_eval_result = list(eval_result)
sorted_eval_result.sort(
    key=lambda res: (res.metric_vals_dict['mrr'], -res.metric_vals_dict['hit_rate'])
)
eval_result_item = sorted_eval_result[1]
wrapped_query = textwrap.fill(eval_result_item.query, width=80)
pprint(eval_result_item.dict())
# print("-"*100)
# print(wrapped_query)
# print("-"*100)
# print(textwrap.fill(eval_result_item.retrieved_texts[-1], width=80))
# # Sorting eval_result based on MRR in ascending order and then HIT Rate in descending order
