In [13]:

from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter, SemanticSplitterNodeParser

# A single sample abstract wrapped in a Document; the cells below run different
# node parsers over `docs`.
# The text is written as adjacent string literals (one sentence each), which
# Python concatenates into one string.
docs = [
    Document(
        text=(
            "Virological and serological methods were used in examination of 28 patients suffering from subacute thyroiditis de Quervain. "
            "Attempts to isolate a presumed viral agent from 8 patients were performed by inoculation of serum, urine, and aspiration biopsies of thyroid glands taken at different stages of the illness, into tissue cultures of different types of human and animal cells. "
            "Recovery of a cytopathic viral agent on cells of a rabbit lung continuous line was successful in 5 cases. "
            "Serological cross reactions exist between the isolated viruses and patient serum but not with serum of healthy people. "
            "Cases with the acquired illness and positive antibodies against the isolated viruses who had been in close and prolonged contact with patients suffering from subacute thyroiditis de Quervain were also investigated."
        )
    )
]


In [16]:
# TokenTextSplitter configured with chunk_size=50 and no overlap between chunks.
splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=0)
splits = splitter.get_nodes_from_documents(docs)

# For each chunk, print the summed length of its whitespace-separated words
# (i.e. characters excluding whitespace - not a token count) followed by its text.
for node in splits:
    char_count = sum(map(len, node.text.split()))
    print(char_count, node.text)

Metadata length (2) is close to chunk size (50). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
200 Virological and serological methods were used in examination of 28 patients suffering from subacute thyroiditis de Quervain. Attempts to isolate a presumed viral agent from 8 patients were performed by inoculation of serum, urine, and
204 aspiration biopsies of thyroid glands taken at different stages of the illness, into tissue cultures of different types of human and animal cells. Recovery of a cytopathic viral agent on cells of a rabbit lung continuous line was successful in
242 5 cases. Serological cross reactions exist between the isolated viruses and patient serum but not with serum of healthy people. Cases with the acquired illness and positive antibodies against the isolated viruses who had been in close and prolonged contact with patients suffering from
50 subacute thyroiditis de Quervain were a

In [25]:
# SentenceSplitter configured with chunk_size=128 and chunk_overlap=50.
splitter = SentenceSplitter(chunk_size=128, chunk_overlap=50)
splits = splitter.get_nodes_from_documents(docs)

# Print, per chunk, the number of non-whitespace characters and the chunk text.
for chunk in splits:
    words = chunk.text.split()
    print(sum(len(word) for word in words), chunk.text)

512 Virological and serological methods were used in examination of 28 patients suffering from subacute thyroiditis de Quervain. Attempts to isolate a presumed viral agent from 8 patients were performed by inoculation of serum, urine, and aspiration biopsies of thyroid glands taken at different stages of the illness, into tissue cultures of different types of human and animal cells. Recovery of a cytopathic viral agent on cells of a rabbit lung continuous line was successful in 5 cases. Serological cross reactions exist between the isolated viruses and patient serum but not with serum of healthy people.
372 Recovery of a cytopathic viral agent on cells of a rabbit lung continuous line was successful in 5 cases. Serological cross reactions exist between the isolated viruses and patient serum but not with serum of healthy people. Cases with the acquired illness and positive antibodies against the isolated viruses who had been in close and prolonged contact with patients suffering from su

In [26]:
# Same SentenceSplitter setup as a chunk_size=128 run, but with chunk_overlap=0.
splitter = SentenceSplitter(chunk_size=128, chunk_overlap=0)
splits = splitter.get_nodes_from_documents(docs)

# Print, per chunk, the number of non-whitespace characters and the chunk text.
for chunk in splits:
    non_space_chars = 0
    for word in chunk.text.split():
        non_space_chars += len(word)
    print(non_space_chars, chunk.text)

512 Virological and serological methods were used in examination of 28 patients suffering from subacute thyroiditis de Quervain. Attempts to isolate a presumed viral agent from 8 patients were performed by inoculation of serum, urine, and aspiration biopsies of thyroid glands taken at different stages of the illness, into tissue cultures of different types of human and animal cells. Recovery of a cytopathic viral agent on cells of a rabbit lung continuous line was successful in 5 cases. Serological cross reactions exist between the isolated viruses and patient serum but not with serum of healthy people.
184 Cases with the acquired illness and positive antibodies against the isolated viruses who had been in close and prolonged contact with patients suffering from subacute thyroiditis de Quervain were also investigated.


In [21]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
import os
from dotenv import load_dotenv
load_dotenv()

# Azure OpenAI connection settings are read from environment variables
# (load_dotenv() is called above - presumably these come from a local .env file).
# A missing variable raises KeyError here.
azure_settings = {
    "api_key": os.environ['AZURE_OPENAI_API_KEY'],
    "azure_endpoint": os.environ['AZURE_OPENAI_ENDPOINT'],
    "api_version": os.environ['AZURE_OPENAI_API_VERSION'],
}

# Embedding model handed to the semantic splitter via its `embed_model` argument.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    **azure_settings,
)

# SemanticSplitterNodeParser configured with buffer_size=1 and a breakpoint
# percentile threshold of 50, using the embedding model defined above.
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=50,
    embed_model=embed_model,
)
splits = splitter.get_nodes_from_documents(docs)

# Print, per chunk, the number of non-whitespace characters and the chunk text.
for node in splits:
    word_lengths = [len(word) for word in node.text.split()]
    print(sum(word_lengths), node.text)

411 Virological and serological methods were used in examination of 28 patients suffering from subacute thyroiditis de Quervain. Attempts to isolate a presumed viral agent from 8 patients were performed by inoculation of serum, urine, and aspiration biopsies of thyroid glands taken at different stages of the illness, into tissue cultures of different types of human and animal cells. Recovery of a cytopathic viral agent on cells of a rabbit lung continuous line was successful in 5 cases. 
101 Serological cross reactions exist between the isolated viruses and patient serum but not with serum of healthy people. 
184 Cases with the acquired illness and positive antibodies against the isolated viruses who had been in close and prolonged contact with patients suffering from subacute thyroiditis de Quervain were also investigated.


In [28]:

from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter, SemanticSplitterNodeParser
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.evaluation.retrieval.metrics import resolve_metrics, HitRate, MRR
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from datasets import load_dataset
import pandas as pd
import Stemmer
import os
from dotenv import load_dotenv
load_dotenv()

# Load dataset
# Passage corpus: the 'passages' split converted to a DataFrame indexed by 'id'.
ds = (
    load_dataset("rag-datasets/rag-mini-bioasq", "text-corpus")['passages']
    .to_pandas()
    .set_index('id', drop=True)
)

# Question / relevant-passage pairs; `queries` is the result of take(5) on the test split.
query_set = load_dataset("rag-datasets/rag-mini-bioasq", "question-answer-passages")
queries = query_set['test'].take(5)

# embeddings are required for semantic splitting
# Connection settings are read from the environment; a missing variable raises KeyError.
azure_api_key = os.environ['AZURE_OPENAI_API_KEY']
azure_endpoint = os.environ['AZURE_OPENAI_ENDPOINT']
azure_api_version = os.environ['AZURE_OPENAI_API_VERSION']

embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    api_key=azure_api_key,
    azure_endpoint=azure_endpoint,
    api_version=azure_api_version,
)

# Get required passages
# Each 'relevant_passage_ids' value is handled as a bracketed, ", "-separated
# string (apparently of the form "[1, 2, 3]"): the first and last characters are
# dropped and every remaining piece is converted to int.
# NOTE(review): ids are gathered from take(15) of the test split while `queries`
# uses take(5) - confirm the extra passages are intentional.
passages_required = set()
for id_list_str in query_set['test'].take(15)['relevant_passage_ids']:
    passages_required.update([int(piece) for piece in id_list_str[1:-1].split(", ")])

# Create documents
# One Document per required passage; the passage id is kept in metadata, reused
# as the document id, and listed in excluded_llm_metadata_keys.
docs = []
for passage_id in passages_required:
    doc = Document(text=ds.loc[passage_id].passage, metadata={'id': passage_id})
    doc.doc_id = str(doc.metadata['id'])
    doc.excluded_llm_metadata_keys = ['id']
    docs.append(doc)

# Define different splitting strategies
# Each spec is (label, splitter class, chunk_size, chunk_overlap).
splitter_specs = [
    ('token_512', TokenTextSplitter, 512, 100),
    ('token_50', TokenTextSplitter, 50, 0),
    ('sentance_512', SentenceSplitter, 512, 100),
    ('sentance_512_0', SentenceSplitter, 512, 0),
    ('sentance_50', SentenceSplitter, 50, 0),
    ('sentance_128', SentenceSplitter, 128, 0),
]
splitters = {
    label: splitter_cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    for label, splitter_cls, chunk_size, chunk_overlap in splitter_specs
}
# Optional semantic strategies (disabled). Warning : expensive both in time and credits
# splitters['semantic_95'] = SemanticSplitterNodeParser(buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model)
# splitters['semantic_80'] = SemanticSplitterNodeParser(buffer_size=1, breakpoint_percentile_threshold=80, embed_model=embed_model)

# Evaluation metrics
# Metric classes resolved by name are instantiated with defaults; HitRate and MRR
# are added explicitly with their granular flags enabled.
metric_names = ["precision", "recall", "ap", "ndcg"]
metrics = [metric_cls() for metric_cls in resolve_metrics(metric_names)]
metrics.append(HitRate(use_granular_hit_rate=True))
metrics.append(MRR(use_granular_mrr=True))

# Evaluate each splitting strategy
# For every splitter: chunk the documents, index the chunks with BM25, run each
# query and score the retrieved passage ids against the relevant passage ids.
# `results` maps splitter name -> DataFrame with one row per query.
results = {}
for splitter_name, splitter in splitters.items():
    nodes = splitter.get_nodes_from_documents(docs)

    # Create BM25 retriever
    # TODO for participants : Try other retrievers / choice of K.
    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=5,
        stemmer=Stemmer.Stemmer("english"),
        language="english",
    )

    # Evaluate
    results_data = []
    for row in queries:
        query = row['question']
        # Relevant ids arrive as a bracketed, ", "-separated string; parse them
        # into a list of id strings without modifying the dataset row.
        relevant_ids = row['relevant_passage_ids'][1:-1].split(', ')

        retrieved_nodes = bm25_retriever.retrieve(query)
        # Retrieval is done on chunks, but relevance is judged per passage.
        # Several chunks of one passage would otherwise show up as repeated ids
        # and be counted more than once by the list-based metrics, so collapse
        # them to the first occurrence while keeping rank order.
        retrieved_passage_ids = list(
            dict.fromkeys(str(node.metadata['id']) for node in retrieved_nodes)
        )

        metric_dict = {
            metric.metric_name: metric.compute(
                query, relevant_ids, retrieved_passage_ids,
            ).score
            for metric in metrics
        }

        results_data.append({
            'splitter': splitter_name,
            'query': query,
            'retrieved_ids': retrieved_passage_ids,
            'relevant_ids': relevant_ids,
            **metric_dict
        })

    results[splitter_name] = pd.DataFrame(results_data)

# Combine all results
# Stack the per-splitter DataFrames into one frame (one row per splitter/query pair).
final_results = pd.concat(results.values())

print("\nAggregated Results by Splitting Strategy:")

# Drop the per-query columns, average the remaining metric columns per splitter
# and order the strategies by mean recall, highest first.
per_query_columns = ['query', 'retrieved_ids', 'relevant_ids']
metric_scores = final_results.drop(per_query_columns, axis=1)
mean_by_splitter = metric_scores.groupby('splitter').mean()
print(mean_by_splitter.sort_values(by='recall', ascending=False))

Metadata length (8) is close to chunk size (50). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (8) is close to chunk size (50). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (8) is close to chunk size (50). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (8) is close to chunk size (50). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (8) is close to chunk size (50). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (8) is close to chunk size (50). Resulting chunks are less than 50 tokens. Cons

In [None]:
# Best config: BM25 retriever (k=5) with TokenTextSplitter(chunk_size=512, chunk_overlap=100)

# Possible next steps: try larger chunk sizes and K values