In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.schema import MetadataMode
from llama_index.core.node_parser import MarkdownNodeParser
from typing import Any
from pathlib import Path
import hashlib
import ollama

In [2]:
def generate_doc_id(metadata: dict, section: str) -> str:
    """Build a short, deterministic identifier for a document section.

    The identifier is the first 10 hexadecimal characters of the SHA-256
    digest of ``"<section>-<first 20 characters of metadata['text']>"``.

    Args:
        metadata: Mapping that must contain a ``"text"`` entry.
        section: Section label placed in front of the text prefix.

    Returns:
        A 10-character hexadecimal string.

    Raises:
        KeyError: If ``metadata`` has no ``"text"`` entry.
    """
    text_prefix = metadata["text"][:20]
    key = "{}-{}".format(section, text_prefix)
    digest = hashlib.sha256(key.encode()).hexdigest()
    return digest[:10]

In [3]:
# Load every file below the local documentation folder (sub-folders included).
pth = Path("../weaviate_loader/documents").resolve()
documents = SimpleDirectoryReader(str(pth), recursive=True).load_data()

# Split the loaded documents into nodes with the Markdown parser.
splitter = MarkdownNodeParser()
nodes = splitter.get_nodes_from_documents(documents, show_progress=True)

# Node content with metadata excluded; computed once and reused in the loop below.
texts = [node.get_content(metadata_mode=MetadataMode.NONE) for node in nodes]

metadata_lst: list[dict[str, Any]] = []
word = "documents"
url = "https://docs.pola.rs/user-guide"

for node, text in zip(nodes, texts):
    file_path = node.metadata["file_path"]
    # The section is whatever follows the first occurrence of `word` in the
    # path, minus the last 8 characters.
    # NOTE(review): the 8 presumably strips a trailing "index.md" -- confirm.
    start_idx = file_path.find(word) + len(word)
    section = file_path[start_idx:-8]
    link = url + section
    meta = node.metadata | {"text": text, "link": link}
    doc_id = generate_doc_id(meta, section)
    meta["id"] = doc_id
    metadata_lst.append(meta)

Parsing nodes:   0%|          | 0/61 [00:00<?, ?it/s]

In [4]:
# Prompt used to generate evaluation questions for one documentation section.
# The `{text}` placeholder is filled in with `str.format(text=...)`; the model
# is asked to reply with a bare JSON array of three question strings.
prompt_template = """
You are a programmer familiar with the Polars user guide.
Formulate 3 questions that you might ask based on this section of documentation.
The section should contain the answer to the question and the questions should be complete and not too short.
If possible, use as few words as possible from the section.

The section:
text {text}

Provide the answer in parsable json without using code blocks

["question1","question2","question3"]
""".strip()



In [51]:
import json
client = ollama.AsyncClient(host="http://localhost:11434")
output_lst = []

for i in range(0,40):
    print(i)
    meta = metadata_lst[i]
    prompt = prompt_template.format(text=meta["text"])
    resp = await client.generate(model="gemma2:2b",prompt=prompt)
    ans = resp["response"].replace("```json","").replace("```","")
    try:
        questions = json.loads(f"{ans}")
        output_lst.append(meta | {"question":questions})
    except Exception as e:
        print(e)
        print(i)
        continue

0
1
2
3
Extra data: line 2 column 521 (char 521)
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [52]:
# Persist the generated questions (with their section metadata) as JSON.
with open("results.json", mode="w") as results_file:
    json.dump(output_lst, fp=results_file)

In [None]:
# store in vector db

# retrieve the question

# compute relevance

# compute metric

In [11]:
from itertools import islice
from typing import List

from langchain_core.embeddings import Embeddings
from pydantic import BaseModel
import httpx


def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    The final tuple may be shorter than ``n``; an empty iterable yields
    nothing. Because this is a generator, the size check runs when iteration
    starts, not when the function is called.

    Args:
        iterable: Any iterable to split into chunks.
        n: Maximum number of items per chunk; must be at least 1.

    Yields:
        Tuples holding consecutive items from ``iterable``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            return
        yield chunk


class TextEmbeddingsInference(BaseModel, Embeddings):
    """Embeddings implementation that posts texts to ``{url}/embed``."""

    url: str
    """Url of text embeddings inference server"""
    # Sent as the "normalize" field of every request payload.
    normalize: bool = True
    # Number of texts sent per request (previously hard-coded to 8).
    batch_size: int = 8

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a Text Embeddings Inference server.

        Texts are sent in batches of ``batch_size`` over one shared HTTP
        connection, with ``"truncate": True`` set on every request.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
            ValueError: If ``batch_size`` is smaller than 1.
        """
        embeddings: List[List[float]] = []
        # One client for all batches so the connection is reused.
        with httpx.Client() as http:
            for batch in batched(texts, self.batch_size):
                payload = {
                    "inputs": list(batch),
                    "normalize": self.normalize,
                    "truncate": True,
                }
                response = http.post(f"{self.url}/embed", json=payload)
                # Fail loudly on an error status. Without this check an error
                # body (a JSON object) would be extended into the result list
                # as its keys and silently corrupt the embeddings.
                response.raise_for_status()
                embeddings.extend(response.json())

        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a Text Embeddings Inference server.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]


In [53]:
import weaviate

# Connection settings for the local Weaviate instance and the text embeddings
# inference (TEI) server.
WEAVIATE_HOST = "localhost"
WEAVIATE_PORT = 8090
TEI_HOST = "localhost"
TEI_PORT = "8080"

# Vector database client.
client = weaviate.connect_to_local(
    host=WEAVIATE_HOST,
    port=WEAVIATE_PORT,
)

# Embedding client pointed at the TEI server.
tei_url = f"http://{TEI_HOST}:{TEI_PORT}"
embeddings = TextEmbeddingsInference(url=tei_url, normalize=True)


In [37]:
collection = client.collections.get("document_collection")
relevance_lst = []
# Path marker used to derive the section of a retrieved object; it is loop
# invariant, so it is set once here.
word = "documents"

# Evaluate the first 3 questions of the first 20 entries. Slices are used
# instead of fixed index ranges so shorter inputs do not raise IndexError.
for doc in output_lst[:20]:
    for query in doc["question"][:3]:
        vec = embeddings.embed_query(query)
        result = collection.query.hybrid(
            query,
            alpha=0.5,
            vector=vec,
            limit=3,
        )
        # One flag per retrieved object, in ranking order: True when the id
        # recomputed from the object's properties equals the id of the entry
        # the question was generated from.
        relevance = []
        for obj in result.objects:
            p = obj.properties
            file_path = p["file_path"]
            start_idx = file_path.find(word) + len(word)
            retrieved_id = generate_doc_id(p, file_path[start_idx:-8])
            relevance.append(retrieved_id == doc["id"])
        relevance_lst.append(relevance)

        

In [54]:
def hit_rate(relevance):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance: One sequence of booleans per query, each flag telling
            whether the result at that position is relevant.

    Returns:
        The share of queries whose flags contain ``True``, as a float in
        [0, 1]. An empty ``relevance`` returns 0.0 instead of raising
        ZeroDivisionError.
    """
    if len(relevance) == 0:
        return 0.0
    hits = sum(1 for flags in relevance if True in flags)
    return hits / len(relevance)

In [55]:
def mrr(relevance):
    """Return the mean reciprocal rank over all queries.

    For each query the reciprocal rank is ``1 / position`` of the first
    ``True`` flag (positions start at 1), or 0 when no flag is ``True``.

    Args:
        relevance: One sequence of booleans per query, ordered by rank.

    Returns:
        The average reciprocal rank. An empty ``relevance`` returns 0.0
        instead of raising ZeroDivisionError.
    """
    if len(relevance) == 0:
        return 0.0
    reciprocal_ranks = [
        1 / (flags.index(True) + 1) if True in flags else 0
        for flags in relevance
    ]
    return sum(reciprocal_ranks) / len(relevance)



In [56]:
collection = client.collections.get("document_collection")

# Path marker used to derive the section of a retrieved object (loop invariant).
word = "documents"

# (expected id, question, query vector) for the first 3 questions of the first
# 20 entries. The vector does not depend on alpha, so each question is embedded
# once here instead of once per alpha. Slices are used instead of fixed index
# ranges so shorter inputs do not raise IndexError.
eval_queries = [
    (doc["id"], question, embeddings.embed_query(question))
    for doc in output_lst[:20]
    for question in doc["question"][:3]
]

# One (alpha, hit rate, MRR) tuple per alpha value.
hit_rate_lst = []
for alpha in [0.5, 0.7, 0.8, 0.9, 1]:
    relevance_lst = []
    for expected_id, query, vec in eval_queries:
        result = collection.query.hybrid(
            query,
            alpha=alpha,
            vector=vec,
            limit=3,
        )
        # One flag per retrieved object, in ranking order.
        relevance = []
        for obj in result.objects:
            p = obj.properties
            file_path = p["file_path"]
            start_idx = file_path.find(word) + len(word)
            retrieved_id = generate_doc_id(p, file_path[start_idx:-8])
            relevance.append(retrieved_id == expected_id)
        relevance_lst.append(relevance)
    hit_rate_lst.append((alpha, hit_rate(relevance_lst), mrr(relevance_lst)))
        



In [57]:
# Each tuple is (alpha, hit rate, MRR) for one alpha value of the sweep.
hit_rate_lst

[(0.5, 0.9, 0.7972222222222223),
 (0.7, 0.9166666666666666, 0.8222222222222223),
 (0.8, 0.9, 0.7972222222222223),
 (0.9, 0.9, 0.7583333333333333),
 (1, 0.8666666666666667, 0.7472222222222222)]

In [58]:
# `relevance_lst` is rebuilt for every alpha of the sweep, so this shows the
# per-query flags of the last alpha that was evaluated.
relevance_lst

[[True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [False, False, True],
 [False, False, False],
 [False, False, True],
 [False, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [False, False, False],
 [True, False, False],
 [True, False, False],
 [False, False, False],
 [False, True, False],
 [False, False, False],
 [False, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [False, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [False, False, True],
 [False, False, False],
 [False, True, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [T