In [None]:
!pip install langchain faiss-gpu transformers evaluate ragas datasets huggingface_hub mlflow tqdm
# !gym stable-baselines3 onnx onnxruntime
# !pip install onnx onnxruntime
!pip install optimum[onnxruntime] onnx
!pip install sentence-transformers
!pip install rouge_score

In [None]:
!pip install packaging==23.0 mlflow==2.17.2

IMPORTS AND SETUP

In [None]:
import json
import mlflow
from datasets import Dataset as HFDataset
from torch.utils.data import Dataset, DataLoader
import torch
# from optimum.intel import IncQuantizer
# from torch.ao.quantization import get_default_qconfig, quantize_dynamic
import torch.cuda
from tqdm import tqdm
import mlflow.pytorch
# import onnx
# from onnxruntime.quantization import quantize_static, QuantizationMode, CalibrationDataReader, QuantType, QuantFormat
# import onnxruntime as ort
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from transformers import pipeline, ElectraForQuestionAnswering, ElectraTokenizer, AlbertForQuestionAnswering, AlbertTokenizer, AutoModelForQuestionAnswering, AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

import os
# from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# from mlflow.llm.evaluate import evaluate_llm, evaluate_rag
from langchain.chains import RetrievalQA
# from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.llms import HuggingFaceLLM
from transformers import default_data_collator
from langchain_core.load import dumpd, dumps, load, loads
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers import ContextualCompressionRetriever
from __future__ import annotations
from typing import Dict, Optional, Sequence
from langchain.schema import Document
from accelerate import Accelerator
from torch.cuda.amp import autocast
# from langchain.pydantic_v1 import Extra, root_validator

from langchain.callbacks.manager import Callbacks
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor

from sentence_transformers import CrossEncoder

from langchain.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain.document_transformers.long_context_reorder import LongContextReorder
# from langchain.retrievers.multi_query import MultiQueryRetriever

# if torch.cuda.is_available():
#     torch.set_default_tensor_type('torch.cuda.FloatTensor')
# else:
#     torch.set_default_tensor_type('torch.FloatTensor')

# Set default device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(f"Using device: {device}")

# print('ort version: ', ort.__version__)

# Pipeline switches: quantize_llm selects the ONNX-exported model in the fine-tuning cell,
# eval_llm gates the ROUGE evaluation cell.
quantize_llm = True
eval_llm = False
rag_chain_path = 'rag/rag_chain.pkl'
quantized_model_path = 'quantized_model'
quantized_tokenizer_path = 'quantized_tokenizer'
directories = ['fine_tuned_model', quantized_model_path, quantized_tokenizer_path, 'fine_tuned_tokenizer', 'rag', 'Vectorstore/chromadb', 'fine-tune-llm-results', 'fine-tune-llm-logs']
for directory in directories:
    # exist_ok makes the cell safe to re-run without a separate existence check
    os.makedirs(directory, exist_ok=True)

# create_experiment raises when the experiment already exists (e.g. on a second run of this
# cell); set_experiment creates it when missing, activates it, and returns the Experiment.
experiment_id = mlflow.set_experiment('BioASQ RAG').experiment_id

CREATE DATASETS

In [None]:

def load_bioasq_data(train_file, test_file):
    """Read the BioASQ train/test JSON files and flatten each record into a QA example.

    Args:
        train_file: Path of the JSON file holding the training records.
        test_file: Path of the JSON file holding the test records.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple. Each element is a list of dicts with
        the keys ``'question'``, ``'context'`` and ``'ideal_answer'``; the context is the
        concatenation of article and snippet title/abstract pairs followed by the
        space-joined concepts.
    """

    def _title_and_abstract(entry):
        # Missing 'title' / 'abstract' fields contribute an empty string.
        return f"{entry.get('title', '')} {entry.get('abstract', '')} "

    def build_dataset(data):
        examples = []
        for record in data:
            question = record['question']
            ideal_answer = record['ideal_answer']

            # Articles first, then snippets, then the concepts, in that order.
            pieces = [_title_and_abstract(article) for article in record.get('articles', [])]
            pieces.extend(_title_and_abstract(snippet) for snippet in record.get('snippets', []))
            pieces.append(" ".join(record.get('concepts', [])))

            examples.append({
                'question': question,
                'context': "".join(pieces),
                'ideal_answer': ideal_answer
            })
        return examples

    def _read_json(path):
        with open(path, 'r') as handle:
            return json.load(handle)

    train_data = _read_json(train_file)
    test_data = _read_json(test_file)

    return build_dataset(train_data), build_dataset(test_data)


# MLflow run that builds the flattened train/test datasets and stores them as artifacts
mlflow.autolog()
with mlflow.start_run(run_name="Dataset Creation") as dataset_creation_run:

    print('Dataset Creation')
    # Raw BioASQ splits as mounted under the Kaggle input directory
    train_file = '/kaggle/input/bio-asq/training12b_train.json'
    test_file = '/kaggle/input/bio-asq/training12b_test.json'
    train_dataset, test_dataset = load_bioasq_data(train_file, test_file)

    # Record the size of each split on the run
    mlflow.log_param('bio_asq_train_dataset_size', len(train_dataset))
    mlflow.log_param('bio_asq_test_dataset_size', len(test_dataset))

    # Destination files for the flattened splits
    train_output_file = 'bio_asq_train_dataset.json'
    test_output_file = 'bio_asq_test_dataset.json'

    # Write both files first, then attach both to the run
    for output_path, split in ((train_output_file, train_dataset), (test_output_file, test_dataset)):
        with open(output_path, 'w') as f:
            json.dump(split, f, indent=4)

    for output_path in (train_output_file, test_output_file):
        mlflow.log_artifact(output_path)



FINE TUNE LLM

In [None]:
# Kaggle input locations of a previously fine-tuned T5 model and its tokenizer.
# fine_tune_llm loads from these instead of training when both paths exist.
finetuned_model_path = '/kaggle/input/bioasq-fine-tuned-t5-small/transformers/default/1'
finetuned_tokenizer_path = '/kaggle/input/bioasq-fine-tuned-t5-small-tokenizer/transformers/default/1'

# Kaggle input locations of the ONNX model and tokenizer; when both exist and
# quantize_llm is set, the fine-tuning cell loads them and skips training/export.
quantized_finetuned_model_path = '/kaggle/input/bioasq-quantized-fine-tuned-t5-small/onnx/default/1'
quantized_finetuned_tokenizer_path = '/kaggle/input/bioasq-quantized-fine-tuned-t5-small-tokenizer/transformers/default/1'

class BioASQDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized target answers.

    ``encodings`` maps field names to per-example sequences and must contain
    ``'input_ids'``; ``labels`` must expose the target ids under ``'input_ids'``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the input id sequences.
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the target ids under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample
    

def fine_tune_llm(train_dataset, test_dataset, model_name='t5-small'):
    """Fine-tune a pre-trained T5 model on BioASQ data, or load an existing checkpoint.

    When both ``finetuned_model_path`` and ``finetuned_tokenizer_path`` exist, the model
    and tokenizer are loaded from there and no training happens.

    Args:
        train_dataset: List of dicts with 'question', 'context' and 'ideal_answer'.
        test_dataset: Same structure; used as the evaluation set during training.
        model_name: Hugging Face model id to start from when no checkpoint exists.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """

    if os.path.exists(finetuned_model_path) and os.path.exists(finetuned_tokenizer_path):
        print('Loading fine-tuned model and tokenizer from checkpoint files')
        model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path)
        tokenizer = T5Tokenizer.from_pretrained(finetuned_tokenizer_path)
        return model, tokenizer

    # No checkpoint available: start from the Hugging Face weights
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    print(f'Loaded model {model_name} and tokenizer from Hugging Face')

    def _encode_split(examples):
        """Tokenize one split into a BioASQDataset (inputs carry question + context)."""
        encodings = tokenizer(
            [f"question: {item['question']} context: {item['context']}" for item in examples],
            truncation=True, padding=True, max_length=256)
        targets = tokenizer([item['ideal_answer'] for item in examples],
                            truncation=True, padding=True, max_length=256)
        # Padding positions in the labels are set to -100 so the loss ignores them;
        # otherwise the model is also trained to predict pad tokens.
        pad_id = tokenizer.pad_token_id
        masked_labels = {
            'input_ids': [
                [-100 if token_id == pad_id else token_id for token_id in sequence]
                for sequence in targets['input_ids']
            ]
        }
        return BioASQDataset(encodings, masked_labels)

    train_data = _encode_split(train_dataset)
    test_data = _encode_split(test_dataset)

    # Define training arguments
    training_args = TrainingArguments(
        # Relative paths: these match the directories created in the setup cell
        # (a leading '/' would write to the filesystem root instead).
        output_dir='fine-tune-llm-results',
        evaluation_strategy="steps",
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='fine-tune-llm-logs',
        logging_steps=10,
        save_steps=100,
        save_total_limit=3,
        load_best_model_at_end=True,
        seed=42,
        data_seed=42,
        report_to="none",
        # Mixed precision needs a CUDA device; fall back to fp32 on CPU
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    return model, tokenizer


# Fine-tune the model on BioASQ dataset (or reuse existing checkpoints) and expose
# the model/tokenizer used by the later cells as `llm` / `tokenizer`.
mlflow.autolog()
with mlflow.start_run(run_name="Fine Tuning") as fine_tuning_run:
    if os.path.exists(quantized_finetuned_model_path) and os.path.exists(quantized_finetuned_tokenizer_path) and quantize_llm:
        print('Loading quantized_fine-tuned model and tokenizer from checkpoint files')
        llm = ORTModelForSeq2SeqLM.from_pretrained(quantized_finetuned_model_path)
        tokenizer = T5Tokenizer.from_pretrained(quantized_finetuned_tokenizer_path)
    else:
        model_name = 't5-small'
        fine_tuned_model, fine_tuned_tokenizer = fine_tune_llm(train_dataset, test_dataset, model_name)

        # Save the fine-tuned model
        fine_tuned_model.save_pretrained('fine_tuned_model')
        fine_tuned_tokenizer.save_pretrained('fine_tuned_tokenizer')

        # Log model parameters to MLFlow
        mlflow.log_param('fine_tuned_model_name', model_name)
        mlflow.log_param('quantize_llm', quantize_llm)

        # Save model weights to MLFlow
        torch.save(fine_tuned_model.state_dict(), 'fine_tuned_model_weights.pth')
        mlflow.log_artifact('fine_tuned_model_weights.pth', artifact_path='fine_tuned_model')

        if quantize_llm:
            print('Quantizing llm model')
            # Export from the directory saved just above. The Kaggle input path only exists
            # when a checkpoint was attached, so exporting from it fails after a fresh
            # training run. `export=True` alone triggers the ONNX export; the deprecated
            # `from_transformers` flag is dropped.
            # NOTE: no quantization config is applied here, so this is an ONNX export only.
            llm = ORTModelForSeq2SeqLM.from_pretrained(
                'fine_tuned_model',
                export=True,  # Exports the model to ONNX
                # quantization_config=AutoQuantizationConfig.default()
            )
            tokenizer = fine_tuned_tokenizer

            # Save the exported model together with its tokenizer
            llm.save_pretrained(quantized_model_path)
            tokenizer.save_pretrained(quantized_tokenizer_path)
        else:
            llm = fine_tuned_model
            tokenizer = fine_tuned_tokenizer

EVALUATE LLM

In [None]:
from evaluate import load
# Load the ROUGE metric
import evaluate

# ROUGE metric object; evaluate_llm_with_huggingface calls rouge.compute on it
rouge = load("rouge")

# Device that generate_from_chunks moves the tokenized inputs to (GPU when available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def chunk_text(text, max_tokens, tokenizer):
    """Tokenize ``text`` and return it as strings of at most ``max_tokens`` tokens each.

    Consecutive windows of tokens are converted back to text with
    ``tokenizer.convert_tokens_to_string``; an empty token list yields ``[]``.
    """
    token_list = tokenizer.tokenize(text)
    pieces = []
    for start in range(0, len(token_list), max_tokens):
        window = token_list[start:start + max_tokens]
        pieces.append(tokenizer.convert_tokens_to_string(window))
    return pieces

def generate_from_chunks(question, context, tokenizer, model, max_tokens=512):
    """Generate an answer by aggregating outputs from context chunks.

    The context is split into chunks that fit next to the question within ``max_tokens``;
    one answer is generated per chunk and the answers are joined with spaces.

    Args:
        question: Question text placed in front of every chunk.
        context: Full context text to split.
        tokenizer: Tokenizer used both for chunking and for encoding the prompts.
        model: Model exposing ``generate``.
        max_tokens: Maximum number of input tokens per prompt.

    Returns:
        The space-joined answers, or an empty string when the context produces no chunks.
    """
    # Reserve room for the question plus a 10-token margin. Clamp to at least one token
    # so a very long question cannot yield a zero or negative chunk size (which would
    # raise inside range() or silently produce no chunks).
    chunk_budget = max(1, max_tokens - len(tokenizer.tokenize(question)) - 10)
    chunks = chunk_text(context, chunk_budget, tokenizer)
    answers = []
    for chunk in chunks:
        input_text = f"question: {question} context: {chunk}"
        inputs = tokenizer(input_text, return_tensors="pt", max_length=max_tokens, truncation=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        # Pass the full encoding so the attention mask reaches generate() as well
        outputs = model.generate(
            **inputs,
            max_length=150,
            num_beams=5,  # Beam width
            early_stopping=True,
            length_penalty=1.0  # Control length balance
        )
        answers.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return " ".join(answers)  # Combine chunk outputs

# Function to evaluate the fine-tuned model using Hugging Face's evaluator
def evaluate_llm_with_huggingface(model, tokenizer, test_dataset):
    """Generate an answer for every test item and score the answers with ROUGE.

    Args:
        model: Model passed on to ``generate_from_chunks``; moved to the GPU when available.
        tokenizer: Tokenizer matching ``model``.
        test_dataset: Iterable of dicts with 'question', 'context' and 'ideal_answer'.

    Returns:
        The dict returned by ``rouge.compute``; each entry is also logged as an MLflow metric.
    """
    predictions = []
    references = []

    # Moving the model is loop-invariant, so it is done once before iterating
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # The tqdm bar already reports progress, so no per-item counter is printed
    for item in tqdm(test_dataset, desc="Evaluating LLM"):
        question = item['question']
        context = item['context']
        ideal_answer = item['ideal_answer']

        # Release cached GPU memory between items
        torch.cuda.empty_cache()

        # Generate answer
        with autocast():
            pred = generate_from_chunks(question, context, tokenizer, model)

        # Store predictions and references
        predictions.append(pred)
        references.append(ideal_answer)

    # Evaluate using ROUGE
    result = rouge.compute(predictions=predictions, references=references)
    for key, value in result.items():
        mlflow.log_metric(key, value)
    print("ROUGE scores:", result)

    return result

# MLflow run for the LLM evaluation, attached to the fine-tuning run as its parent.
# The ROUGE evaluation itself only runs when eval_llm is set; the flag is logged either way.
mlflow.autolog()
with mlflow.start_run(run_name="LLM Evaluation", parent_run_id=fine_tuning_run.info.run_id) as llm_evaluation_run:
    mlflow.log_param('eval_llm', eval_llm)
    if eval_llm:
        llm_metrics = evaluate_llm_with_huggingface(llm, tokenizer, test_dataset)
        print(f"LLM Evaluation Metrics: {llm_metrics}")

CREATE RAG DATASET

In [None]:

def create_rag_dataset(train_data, test_data):
    """Split every example's context into chunks and emit one RAG record per chunk.

    Args:
        train_data: List of dicts with 'question', 'context' and 'ideal_answer'.
        test_data: Same structure as ``train_data``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of lists. Each record is a dict with the
        chunk under ``'context'`` and the originating question and ideal answer under
        ``'metadata'``.
    """

    def build_dataset(data):
        # Character-based splitter: 1250-character chunks with 100 characters of overlap
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1250,
            chunk_overlap=100,
            length_function=len,
            is_separator_regex=False,
        )

        # One record per chunk, each carrying the question/answer it came from
        return [
            {
                'context': piece,
                'metadata': {
                    'question': record['question'],
                    'ideal_answer': record['ideal_answer']
                }
            }
            for record in data
            for piece in splitter.split_text(record['context'])
        ]

    return build_dataset(train_data), build_dataset(test_data)


# MLflow run that chunks the flattened datasets for retrieval and stores them as artifacts
mlflow.autolog()
with mlflow.start_run(run_name="Dataset Creation for RAG", parent_run_id=llm_evaluation_run.info.run_id) as rag_dataset_creation_run:

    # Chunk the in-memory datasets. load_bioasq_data expects file paths and would fail
    # when handed the already-loaded lists.
    train_dataset_rag, test_dataset_rag = create_rag_dataset(train_dataset, test_dataset)

    # Save datasets as JSON files
    train_output_file = 'bio_asq_train_dataset_rag.json'
    test_output_file = 'bio_asq_test_dataset_rag.json'

    train_string = dumps(train_dataset_rag, pretty=True)
    test_string = dumps(test_dataset_rag, pretty=True)

    # dumps() already returns JSON text, so it is written as-is; passing it through
    # json.dump would store the whole document as one escaped JSON string.
    with open(train_output_file, 'w') as f:
        f.write(train_string)

    with open(test_output_file, 'w') as f:
        f.write(test_string)

    # Log the file paths to MLFlow for tracking
    mlflow.log_artifact(train_output_file)
    mlflow.log_artifact(test_output_file)

CREATE RAG PIPELINE

In [None]:

class BgeRerank(BaseDocumentCompressor):
    """Document compressor that reranks documents with a cross-encoder and keeps the top ``top_n``."""

    model_name:str = 'BAAI/bge-reranker-v2-m3'
    """Model name to use for reranking."""
    top_n: int = 3
    """Number of documents to return."""
    # The default CrossEncoder is built while the class body executes, i.e. when this
    # cell runs, not when an instance is created.
    model:CrossEncoder = CrossEncoder(model_name)
    """CrossEncoder instance to use for reranking."""

    def bge_rerank(self, query,docs):
        """Score every (query, doc) pair and return the ``top_n`` best as (index, score) tuples."""
        model_inputs =  [[query, doc] for doc in docs]
        scores = self.model.predict(model_inputs)
        # Highest score first; the index refers to the position in ``docs``
        results = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
        return results[:self.top_n]


    class Config:
        """Configuration for this pydantic object."""

        # The string form is used because `Extra` is not imported in this notebook
        # (its import is commented out), so `Extra.forbid` raised a NameError.
        extra = "forbid"
        arbitrary_types_allowed = True

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Optional[Callbacks] = None,
    ) -> Sequence[Document]:
        """
        Compress documents using BAAI/bge-reranker models.

        Args:
            documents: A sequence of documents to compress.
            query: The query to use for compressing the documents.
            callbacks: Callbacks to run during the compression process.

        Returns:
            A sequence of compressed documents.
        """
        if len(documents) == 0:  # to avoid empty api call
            return []
        doc_list = list(documents)
        _docs = [d.page_content for d in doc_list]
        results = self.bge_rerank(query, _docs)
        final_results = []
        for r in results:
            # r is (index into doc_list, score); the score is stored on the document's metadata
            doc = doc_list[r[0]]
            doc.metadata["relevance_score"] = r[1]
            final_results.append(doc)
        return final_results
    
def create_rag_pipeline(train_dataset_rag):
    """Create a RetrievalQA chain backed by ChromaDB, BM25 and a reranking compressor.

    Args:
        train_dataset_rag: Records to index. Each record is either a LangChain ``Document``
            or a dict with the chunk text under ``'context'`` and optional ``'metadata'``.

    Returns:
        A ``RetrievalQA`` chain that also returns its source documents. The chain answers
        with the notebook-level ``llm`` / ``tokenizer`` set by the fine-tuning cell.
    """
    # Local import: the notebook's import cell does not bring this wrapper into scope.
    from langchain.llms import HuggingFacePipeline

    # The vector store and BM25 retriever require Document objects, while the RAG dataset
    # is built as plain dicts; convert those and pass Documents through unchanged.
    documents = [
        entry if isinstance(entry, Document)
        else Document(page_content=entry['context'], metadata=dict(entry.get('metadata', {})))
        for entry in train_dataset_rag
    ]

    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Create Chroma index
    chroma_dir = 'Vectorstore/chromadb'
    vectorstore = Chroma(embedding_function = embedding_model,
                         persist_directory = chroma_dir,
                         collection_name = "bioasq_train_documents")

    vectorstore.add_documents(documents)
    vectorstore.persist()

    mlflow.log_param('rag_vectorstore', vectorstore)
    # chroma_dir is a directory, so it is logged with log_artifacts (log_artifact takes a file)
    mlflow.log_artifacts(chroma_dir, artifact_path='chromadb')

    bm25_retriever = BM25Retriever.from_documents(documents)
    bm25_retriever.k=10

    vs_retriever = vectorstore.as_retriever(search_kwargs = {"k":10})

    # The field is named `weights`; the misspelt `weight` keyword was not applied
    ensemble_retriever = EnsembleRetriever(retrievers = [bm25_retriever,vs_retriever], weights = [0.5,0.5])

    redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding_model)

    reordering = LongContextReorder()

    reranker = BgeRerank()

    pipeline_compressor = DocumentCompressorPipeline(transformers = [redundant_filter, reordering, reranker])

    compression_pipeline = ContextualCompressionRetriever(base_compressor = pipeline_compressor, base_retriever = ensemble_retriever)

    # RetrievalQA needs a LangChain LLM, so the seq2seq model from the fine-tuning cell is
    # wrapped in a text2text-generation pipeline (the previously referenced
    # `quantized_model` name is not defined anywhere in the notebook).
    generation_pipeline = pipeline("text2text-generation", model=llm, tokenizer=tokenizer, max_new_tokens=150)
    rag_llm = HuggingFacePipeline(pipeline=generation_pipeline)

    qa_advanced = RetrievalQA.from_chain_type(llm = rag_llm,
                                    chain_type = "stuff",
                                    retriever = compression_pipeline,
                                    return_source_documents = True)
    mlflow.log_param('rag_chain_created', True)
    return qa_advanced

def save_rag_pipeline(rag_chain, rag_chain_path):
    """Pickle ``rag_chain`` to ``rag_chain_path`` and log that file as an MLflow artifact."""
    sink = open(rag_chain_path, 'wb')
    try:
        pickle.dump(rag_chain, sink)
    finally:
        # Always release the file handle, even when pickling fails
        sink.close()

    mlflow.log_artifact(rag_chain_path)

# MLflow run that builds the RAG chain from the chunked training data and pickles it
# to rag_chain_path; attached to the RAG dataset-creation run as its parent.
mlflow.autolog()
with mlflow.start_run(run_name="RAG Pipeline Creation", parent_run_id=rag_dataset_creation_run.info.run_id) as rag_pipeline_creation_run:
    rag_chain = create_rag_pipeline(train_dataset_rag)
    save_rag_pipeline(rag_chain, rag_chain_path)



EVALUATE RAG PIPELINE

In [None]:

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

def evaluate_rag_pipeline(rag_chain, test_dataset_rag):
    """Evaluate the RAG chain on the test dataset with ragas.

    Args:
        rag_chain: Chain invoked with ``{"query": question}``; its output must provide
            ``'result'`` and ``'source_documents'``.
        test_dataset_rag: Iterable of test records. A record may be a dict with
            'question' / 'ideal_answer' at the top level, a dict carrying them under
            'metadata' (the chunked RAG records), or a ``Document`` with them in its metadata.

    Returns:
        The result object returned by ``ragas.evaluate``. The mean of each metric is also
        logged to MLflow under the metric's name.
    """
    questions = []
    answers = []
    contexts = []
    ground_truths = []
    seen_questions = set()

    for item in tqdm(test_dataset_rag, desc="Evaluating RAG Pipeline"):
        # Find the question/answer pair regardless of the record shape
        if isinstance(item, Document):
            fields = item.metadata
        else:
            fields = item.get('metadata', item)
        question = fields['question']
        ideal_answer = fields['ideal_answer']

        # Chunked records repeat the same question once per chunk; query each question once
        if question in seen_questions:
            continue
        seen_questions.add(question)

        result = rag_chain.invoke({"query": question})
        questions.append(question)
        answers.append(result['result'])
        contexts.append([context.page_content for context in result['source_documents']])
        ground_truths.append(ideal_answer)

    # HFDataset is the Hugging Face Dataset; the name `Dataset` in this notebook is the
    # torch class, which has no from_dict.
    response_dataset = HFDataset.from_dict({
        "question" : questions,
        "answer" : answers,
        "contexts" : contexts,
        "ground_truth" : ground_truths
    })

    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
        answer_correctness,
    ]
    #
    eval_results = evaluate(response_dataset, metrics,raise_exceptions=False)

    # log_metric accepts a single number, so the per-sample scores are averaged per metric.
    # Metrics that produced no numeric score (all NaN) are skipped.
    scores_frame = eval_results.to_pandas()
    for metric in metrics:
        if metric.name not in scores_frame.columns:
            continue
        mean_score = float(scores_frame[metric.name].mean())
        if not np.isnan(mean_score):
            mlflow.log_metric(metric.name, mean_score)
    return eval_results

# MLflow run that evaluates the RAG chain on the chunked test data and prints the
# result; attached to the RAG pipeline-creation run as its parent.
mlflow.autolog()
with mlflow.start_run(run_name="RAG Pipeline Evaluation", parent_run_id=rag_pipeline_creation_run.info.run_id) as rag_pipeline_evaluation_run:
    eval_results = evaluate_rag_pipeline(rag_chain, test_dataset_rag)
    print(f"RAG Pipeline Evaluation Metrics: {eval_results}")
