In [25]:
from pathlib import Path
import re, openai, yaml, os
import http.client as httplib
from llama_index.llms import AzureOpenAI
from llama_index.schema import MetadataMode
from llama_index.llm_predictor import LLMPredictor
from llama_index import set_global_service_context
from sentence_transformers import SentenceTransformer
from llama_index.embeddings import HuggingFaceEmbedding
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from llama_index.node_parser import SimpleNodeParser, SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.finetuning import (
                                    generate_qa_embedding_pairs,
                                    EmbeddingQAFinetuneDataset,
                                    SentenceTransformersFinetuneEngine
                                    )

In [2]:
# Location of the YAML credentials file. The original machine-specific path is
# kept as the default; set LLMPRO_CREDENTIALS_PATH to run the notebook elsewhere.
credentials_path = os.environ.get(
    'LLMPRO_CREDENTIALS_PATH',
    '/Users/1zuu/Desktop/LLM RESEARCH/LLMPro/cadentials.yaml',
)

with open(credentials_path, encoding='utf-8') as f:
    # safe_load only builds plain YAML types (mappings, strings, numbers), which is
    # all a key/value credentials file needs, and never instantiates Python objects.
    credentials = yaml.safe_load(f)

# Expose the keys through the environment for libraries that read them from there.
os.environ['AD_OPENAI_API_KEY'] = credentials['AD_OPENAI_API_KEY']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = credentials['HUGGINGFACEHUB_API_TOKEN']

In [3]:
# Folders holding the papers used for training and for validation.
train_dir, val_dir = (
    'data/Camel Papers Train/',
    'data/Camel Papers Test/',
)

In [4]:
def load_corpus(directory, verbose=False):
    """Load the files in ``directory`` and split the documents into nodes.

    Args:
        directory: Folder passed to ``SimpleDirectoryReader``.
        verbose: When true, print progress messages and ask the node parser
            to show its progress bar.

    Returns:
        The nodes returned by ``SimpleNodeParser.get_nodes_from_documents``.
    """
    def _log(message):
        # Progress messages are emitted only in verbose mode.
        if verbose:
            print(message)

    _log(f"Loading files in {directory}")
    documents = SimpleDirectoryReader(directory).load_data()
    _log(f"Loaded {len(documents)} docs")

    node_parser = SimpleNodeParser.from_defaults()
    parsed_nodes = node_parser.get_nodes_from_documents(documents, show_progress=verbose)
    _log(f"Parsed {len(parsed_nodes)} nodes")

    return parsed_nodes

# Configure LLMs

In [9]:
# HuggingFace embedding model (the same checkpoint is fine-tuned further down).
embedding_llm = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Azure OpenAI LLM; every connection setting is read from the credentials mapping.
llm = AzureOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model=credentials['AD_ENGINE'],
    api_key=credentials['AD_OPENAI_API_KEY'],
    api_version=credentials['AD_OPENAI_API_VERSION'],
    azure_endpoint=credentials['AD_OPENAI_API_BASE'],
)
chat_llm = LLMPredictor(llm)

# Bundle both models into a service context and install it as the global one.
service_context = ServiceContext.from_defaults(
    llm_predictor=chat_llm,
    embed_model=embedding_llm,
)
set_global_service_context(service_context)

# Creating Dataset

In [6]:
# Parse the training papers, then the held-out papers, with progress output enabled.
train_nodes = load_corpus(directory=train_dir, verbose=True)
val_nodes = load_corpus(directory=val_dir, verbose=True)

Loading files in data/Camel Papers Train/
Loaded 91 docs


Parsing nodes: 100%|██████████| 91/91 [00:00<00:00, 354.53it/s]


Parsed 156 nodes
Loading files in data/Camel Papers Test/
Loaded 9 docs


Parsing nodes: 100%|██████████| 9/9 [00:00<00:00, 245.19it/s]

Parsed 17 nodes





In [11]:
# Create the output folder up front so that saving cannot fail on a missing
# directory after the slow LLM-backed generation (about 3 minutes in the
# recorded run) has already completed.
Path("generated").mkdir(parents=True, exist_ok=True)

# Generate question/answer pairs for the training nodes and persist them as JSON.
train_dataset = generate_qa_embedding_pairs(train_nodes, llm=llm)
train_dataset.save_json("generated/train_dataset.json")

100%|██████████| 156/156 [02:59<00:00,  1.15s/it]


In [12]:
# Create the output folder up front so that saving cannot fail on a missing
# directory after the LLM-backed generation has already completed.
Path("generated").mkdir(parents=True, exist_ok=True)

# Generate question/answer pairs for the validation nodes and persist them as JSON.
val_dataset = generate_qa_embedding_pairs(val_nodes, llm=llm)
val_dataset.save_json("generated/val_dataset.json")

100%|██████████| 17/17 [00:20<00:00,  1.18s/it]


# Finetuning

In [13]:
# Reload both QA datasets from the JSON files written by the generation cells.
train_dataset, val_dataset = (
    EmbeddingQAFinetuneDataset.from_json("generated/train_dataset.json"),
    EmbeddingQAFinetuneDataset.from_json("generated/val_dataset.json"),
)

In [15]:
# Fine-tune the base BGE checkpoint on the training pairs for two epochs; the
# validation pairs are passed to the engine and the result is written to
# "bge-small-finetuned".
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en-v1.5",
    model_output_path="bge-small-finetuned",
    val_dataset=val_dataset,
    epochs=2,
)

In [16]:
# Run the fine-tuning job configured above (the recorded run took about
# 5.5 minutes for 2 epochs of 31 iterations each).
finetune_engine.finetune()

Iteration: 100%|██████████| 31/31 [02:40<00:00,  5.19s/it]
Iteration: 100%|██████████| 31/31 [02:39<00:00,  5.13s/it]
Epoch: 100%|██████████| 2/2 [05:24<00:00, 162.05s/it]


In [17]:
# Fetch the fine-tuned model object from the engine for later use.
finetuned_embedding_llm = finetune_engine.get_finetuned_model()

# Finetuned Embedding Evaluation

In [18]:
def evaluate_st(
                dataset,
                model_id,
                name,
                output_path="results/",
                ):
    """Score a SentenceTransformer model on a QA retrieval dataset.

    Args:
        dataset: Object exposing ``queries``, ``corpus`` and ``relevant_docs``
            attributes, which are handed to ``InformationRetrievalEvaluator``.
        model_id: Model name or local path passed to ``SentenceTransformer``.
        name: Label forwarded to the evaluator as its ``name``.
        output_path: Folder the evaluator is told to write to. It is created
            (including parents) if missing. Defaults to ``"results/"``, the
            location that was previously hard-coded.

    Returns:
        Whatever the evaluator call returns (a single float score in the
        recorded runs of this notebook).
    """
    evaluator = InformationRetrievalEvaluator(
        dataset.queries,
        dataset.corpus,
        dataset.relevant_docs,
        name=name,
    )
    model = SentenceTransformer(model_id)

    # The evaluator is given this folder, so make sure it exists before the call.
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [19]:
# Baseline: score the original (not fine-tuned) BGE checkpoint on the validation set.
evaluate_st(val_dataset, "BAAI/bge-small-en-v1.5", name="bge")

0.7340686274509803

In [20]:
# Score the fine-tuned model, loaded from the "bge-small-finetuned" output folder,
# on the same validation set for comparison with the baseline.
evaluate_st(val_dataset, "bge-small-finetuned", name="finetuned")

0.8277777777777778