In [25]:
from pathlib import Path
import re, openai, yaml, os
import http.client as httplib
from llama_index.llms import AzureOpenAI
from llama_index.schema import MetadataMode
from llama_index.llm_predictor import LLMPredictor
from llama_index import set_global_service_context
from sentence_transformers import SentenceTransformer
from llama_index.embeddings import HuggingFaceEmbedding
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from llama_index.node_parser import SimpleNodeParser, SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.finetuning import (
                                    generate_qa_embedding_pairs,
                                    EmbeddingQAFinetuneDataset,
                                    SentenceTransformersFinetuneEngine
                                    )

In [2]:
# Path to the YAML file holding the API credentials. The default is the original
# author-local location; export LLMPRO_CREDENTIALS_PATH to point somewhere else so
# the notebook can run on another machine without editing this cell.
credentials_path = os.environ.get(
    'LLMPRO_CREDENTIALS_PATH',
    '/Users/1zuu/Desktop/LLM RESEARCH/LLMPro/cadentials.yaml',
)

with open(credentials_path) as f:
    # NOTE(review): FullLoader is kept unchanged; yaml.safe_load is probably enough
    # for a flat key/value credentials file — confirm before switching.
    credentials = yaml.load(f, Loader=yaml.FullLoader)

# Mirror the two secrets into the process environment.
os.environ['AD_OPENAI_API_KEY'] = credentials['AD_OPENAI_API_KEY']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = credentials['HUGGINGFACEHUB_API_TOKEN']

In [3]:
# Directory of training documents; passed to load_corpus and SimpleDirectoryReader below.
train_dir = 'data/Camel Papers Train/'
# Directory of validation documents; passed to load_corpus below.
val_dir = 'data/Camel Papers Test/'

In [4]:
def load_corpus(directory, verbose=False):
    """Read the files under ``directory`` and split the documents into nodes.

    Args:
        directory: Folder handed to ``SimpleDirectoryReader``.
        verbose: When true, print progress messages and ask the node parser
            to show its progress bar.

    Returns:
        The nodes produced by ``SimpleNodeParser.from_defaults()`` for the
        loaded documents.
    """
    if verbose:
        print(f"Loading files in {directory}")

    documents = SimpleDirectoryReader(directory).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Default parser settings; the progress bar follows the verbose flag.
    splitter = SimpleNodeParser.from_defaults()
    corpus_nodes = splitter.get_nodes_from_documents(
        documents,
        show_progress=verbose,
    )
    if verbose:
        print(f"Parsed {len(corpus_nodes)} nodes")

    return corpus_nodes

# Configure LLMs

In [9]:
# Embedding model used by the default service context.
embedding_llm = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Azure OpenAI model; every connection setting is read from the credentials mapping.
llm = AzureOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model=credentials['AD_ENGINE'],
    api_key=credentials['AD_OPENAI_API_KEY'],
    api_version=credentials['AD_OPENAI_API_VERSION'],
    azure_endpoint=credentials['AD_OPENAI_API_BASE'],
)
chat_llm = LLMPredictor(llm)

# Bundle the embedder and the LLM predictor, then register the bundle globally.
service_context = ServiceContext.from_defaults(
    embed_model=embedding_llm,
    llm_predictor=chat_llm,
)
set_global_service_context(service_context)

# Creating Dataset

In [6]:
# Parse both corpora into nodes; verbose=True prints the progress lines shown in the output.
train_nodes = load_corpus(train_dir, verbose=True)
val_nodes = load_corpus(val_dir, verbose=True)

Loading files in data/Camel Papers Train/
Loaded 91 docs


Parsing nodes: 100%|██████████| 91/91 [00:00<00:00, 354.53it/s]


Parsed 156 nodes
Loading files in data/Camel Papers Test/
Loaded 9 docs


Parsing nodes: 100%|██████████| 9/9 [00:00<00:00, 245.19it/s]

Parsed 17 nodes





In [11]:
# Create the output folder before the expensive step: save_json opens the target
# path directly, so a missing "generated/" directory would otherwise only fail
# after the LLM-backed generation (minutes of API calls) has already finished.
Path("generated").mkdir(parents=True, exist_ok=True)

# Generate question/context pairs for the training nodes with the LLM and persist them.
train_dataset = generate_qa_embedding_pairs(train_nodes, llm=llm)
train_dataset.save_json("generated/train_dataset.json")

100%|██████████| 156/156 [02:59<00:00,  1.15s/it]


In [12]:
# Make sure the output folder exists before generating, so save_json cannot fail
# on a missing "generated/" directory once the LLM calls have completed.
Path("generated").mkdir(parents=True, exist_ok=True)

# Generate question/context pairs for the validation nodes with the LLM and persist them.
val_dataset = generate_qa_embedding_pairs(val_nodes, llm=llm)
val_dataset.save_json("generated/val_dataset.json")

100%|██████████| 17/17 [00:20<00:00,  1.18s/it]


# Finetuning

In [13]:
# Reload both datasets from the JSON files saved by the generation cells (same paths).
train_dataset = EmbeddingQAFinetuneDataset.from_json("generated/train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("generated/val_dataset.json")

In [15]:
# Fine-tune the same checkpoint that `embedding_llm` uses, for two epochs, with the
# validation dataset attached. "bge-small-finetuned" is the output path that the
# evaluation and HuggingFaceEmbedding cells later load from.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en-v1.5",
    model_output_path="bge-small-finetuned",
    val_dataset=val_dataset,
    epochs=2,
)

In [16]:
# Run the fine-tuning job configured in the previous cell (epochs=2).
finetune_engine.finetune()

Iteration: 100%|██████████| 31/31 [02:40<00:00,  5.19s/it]
Iteration: 100%|██████████| 31/31 [02:39<00:00,  5.13s/it]
Epoch: 100%|██████████| 2/2 [05:24<00:00, 162.05s/it]


In [17]:
# Keep a handle to the fine-tuned model returned by the engine.
# NOTE(review): this variable is not referenced again in the cells shown here; later
# cells load the model from the "bge-small-finetuned" path instead.
finetuned_embedding_llm = finetune_engine.get_finetuned_model()

# Finetuned Embedding Evaluation

In [18]:
def evaluate_st(dataset, model_id, name):
    """Score a SentenceTransformer model on a QA dataset with an IR evaluator.

    Args:
        dataset: Object exposing ``corpus``, ``queries`` and ``relevant_docs``.
        model_id: Model name or local path passed to ``SentenceTransformer``.
        name: Label forwarded to ``InformationRetrievalEvaluator``.

    Returns:
        Whatever the evaluator call returns (a single float in the runs shown
        in this notebook).
    """
    corpus_map, query_map, relevance_map = (
        dataset.corpus,
        dataset.queries,
        dataset.relevant_docs,
    )

    ir_evaluator = InformationRetrievalEvaluator(
        query_map,
        corpus_map,
        relevance_map,
        name=name,
    )
    st_model = SentenceTransformer(model_id)

    # The evaluator is given this directory as its output path, so create it first.
    results_dir = "results/"
    Path(results_dir).mkdir(parents=True, exist_ok=True)
    return ir_evaluator(st_model, output_path=results_dir)

In [19]:
# Baseline: score the checkpoint that was used as the fine-tuning starting point.
evaluate_st(val_dataset, "BAAI/bge-small-en-v1.5", name="bge")

0.7340686274509803

In [20]:
# Score the fine-tuned model saved at "bge-small-finetuned" on the same validation set.
evaluate_st(val_dataset, "bge-small-finetuned", name="finetuned")

0.8277777777777778

# Advanced Retrieval Method: Sentence Window Retrieval

Fine-tuning our embeddings is a powerful way to ensure we're better at retrieving the correct context - but we can go a step further and improve the way we actually look at context as well.

In this demonstration, we'll be leveraging the idea of a SentenceWindowNodeParser and metadata replacement to take our retrieval to the next level.

At a high level, what we're doing is straightforward:

1. We parse our document into sentence-wise nodes.
2. We find the most relevant sentence-wise nodes to our query.
3. We add additional context based on a "window" around that base sentence-wise node.
4. We use that enhanced context as context for our LLM!


Let's look at this with a visual example:

In [22]:
# Sentence-level parser: the surrounding window is stored in node metadata under
# "window" and the original sentence under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=6,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Default parser, used to build the baseline index.
simple_node_parser = SimpleNodeParser.from_defaults()

# Azure OpenAI model; every connection setting is read from the credentials mapping.
llm = AzureOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model=credentials['AD_ENGINE'],
    api_key=credentials['AD_OPENAI_API_KEY'],
    api_version=credentials['AD_OPENAI_API_VERSION'],
    azure_endpoint=credentials['AD_OPENAI_API_BASE'],
)
chat_llm = LLMPredictor(llm)

# Base embeddings model. This must be the checkpoint the fine-tune started from
# ("BAAI/bge-small-en-v1.5", the model_id given to the fine-tune engine and to the
# baseline evaluate_st call); "BAAI/bge-small-en" is a different checkpoint and
# would make the baseline index incomparable with the fine-tuned one.
embed_model_base = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Fine-tuned embeddings model, loaded from the fine-tune output path.
embed_model = HuggingFaceEmbedding(model_name="bge-small-finetuned")

# Fine-tuned ServiceContext
ctx = ServiceContext.from_defaults(
    llm_predictor=chat_llm,
    embed_model=embed_model,
)

# Base ServiceContext
ctx_base = ServiceContext.from_defaults(
    llm_predictor=chat_llm,
    embed_model=embed_model_base,
)

config.json: 100%|██████████| 684/684 [00:00<00:00, 269kB/s]
model.safetensors: 100%|██████████| 133M/133M [00:15<00:00, 8.79MB/s] 
tokenizer_config.json: 100%|██████████| 366/366 [00:00<00:00, 148kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.85MB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 5.75MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 94.0kB/s]


Let's create nodes using our `node_parser` and `simple_node_parser` after loading the documents found in the `train_dir` directory.

In [23]:
# Load the training documents once and parse them two ways: with the sentence-window
# parser and with the simple parser (each set of nodes feeds its own index below).
documents = SimpleDirectoryReader(train_dir).load_data()
nodes = node_parser.get_nodes_from_documents(documents)
nodes_base = simple_node_parser.get_nodes_from_documents(documents)

In [24]:
# Sentence-window nodes indexed with the fine-tuned embedding context.
sentence_index = VectorStoreIndex(nodes, service_context=ctx)
# Simple nodes indexed with the base embedding context.
# NOTE(review): sentence_index_base is not queried in the cells shown here.
sentence_index_base = VectorStoreIndex(nodes_base, service_context=ctx_base)

In [26]:
# Post-processor keyed on the "window" metadata written by the sentence-window parser.
# NOTE(review): presumably swaps each retrieved sentence for its stored window — confirm.
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

# Query engine over the fine-tuned sentence index, retrieving the top 3 matches.
query_engine = sentence_index.as_query_engine(
    similarity_top_k=3,
    node_postprocessors=[window_postprocessor],
)

In [27]:
# Ask a sample question through the sentence-window query engine and display the answer text.
window_response = query_engine.query("How do camelid genetics influence wool quality?")
window_response.response

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'Genetics mechanisms controlling fiber traits in llamas and alpacas are not fully understood. However, a few genetic selection programs have been implemented in domestic camelids to improve fleece characteristics. The proteins that form the fiber are encoded by keratin genes (KRT) and keratin-associated proteins (KRTAP) which are expressed in a highly regulated manner during hair follicle growth. The presence of major genes affecting quantitative fiber traits such as fiber diameter, standard deviation of fiber diameter, variation coefficiency, and comfort factor in both Huacaya and Suri alpacas has been proposed by Perez-Cabal et al. based on segregation analysis.'