# Exploration of Evaluation Dataset

### Execute code from root directory

In [20]:
import os
import sys

# Go from src/notebooks/ → RAGRepo/
# Resolve the root only once per kernel session: os.getcwd() changes after the
# chdir below, so recomputing "../../" when this cell is re-run would climb two
# more levels and leave the notebook outside the project.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "../../"))
os.chdir(PROJECT_ROOT)  # sets the working directory for every later cell

# Optional: make sure you can import src.*
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [24]:
# Display the kernel's current working directory to confirm the chdir took effect.
%pwd

'/home/mallahova/code/basics/projects/interview/RAGRepo'

In [30]:

from langchain_community.vectorstores import FAISS
from src.core.config_loader import load_config
from src.core.component_registry import (
    EMBEDDINGS,
)
import json

### Retrieve all filenames stored in FAISS

In [None]:
# Build the embedding model named in the config; FAISS.load_local is given this
# embeddings object together with the saved index directory.
config = load_config("config/base.yaml")
embedding_cfg = config["embedding"]
embedding_cls = EMBEDDINGS[embedding_cfg["class"]]
embedding_model = embedding_cls(model_name=embedding_cfg["name"])

# NOTE(review): github_url is not read by any cell shown in this notebook —
# presumably the repository the index was built from; confirm before removing.
github_url = "https://github.com/viarotel-org/escrcpy.git"
index_dir = "src/data/index"
# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# index data, which can execute arbitrary code. Only load an index that you
# built yourself or otherwise trust.
faiss_index = FAISS.load_local(
    index_dir, embeddings=embedding_model, allow_dangerous_deserialization=True
)

# _dict is a private attribute: the docstore's internal dict of documents.
# NOTE(review): private API, may change between langchain versions.
documents = faiss_index.docstore._dict

# Collect the distinct "source" metadata values of all indexed documents
filenames_faiss = {doc.metadata["source"] for doc in documents.values()}

### Retrieve all filenames stored in evaluation dataset

In [35]:
# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's default locale encoding.
with open("src/data/eval/escrcpy-commits-generated.json", encoding="utf-8") as f:
    eval_data = json.load(f)

In [47]:
# Union of every path listed under "files" across all evaluation entries
eval_filenames = {path for entry in eval_data for path in entry["files"]}

### Check if FAISS stores every file from the evaluation dataset

In [49]:
# True only if every evaluation file path is also present in the FAISS index
eval_filenames.issubset(filenames_faiss)

False

In [52]:
# Evaluation file paths that have no document in the FAISS index
not_in_faiss = eval_filenames - filenames_faiss
not_in_faiiss = not_in_faiss  # original (misspelled) name kept as an alias
print("Not in FAISS index:")
# Sets have no stable iteration order; sort so the listing is the same on every run
for filename in sorted(not_in_faiss):
    print(filename)

Not in FAISS index:
electron/resources/extra/linux/scrcpy/scrcpy.1


This folder doesn't exist; it was probably removed from the repository in newer commits.

In [57]:
# List what .temp_repo currently holds under electron/resources/extra/linux,
# to see whether the scrcpy/ folder is there.
%ls .temp_repo/electron/resources/extra/linux

[0m[01;34mtray[0m/


For a correct evaluation, I removed this file from the evaluation dataset.

### Explore evaluation dataset

In [59]:
# Number of entries in the evaluation dataset
dataset_size = len(eval_data)
print(f'Evaluation dataset length: {dataset_size}')

Evaluation dataset length: 34


In [60]:
# Mean length of the "files" list over all evaluation entries
total_files = sum(len(query["files"]) for query in eval_data)
avg_files_per_query = total_files / len(eval_data)
print(f'Average number of files per query: {avg_files_per_query}')

Average number of files per query: 2.1176470588235294


The dataset is quite small, so splitting it isn’t practical. I will use the whole dataset to compare the model configurations.