In [None]:
!lscpu

In [None]:
!pip install langchain faiss-gpu transformers evaluate datasets huggingface_hub mlflow tqdm chromadb
!pip install rank_bm25
!pip install onnxruntime-gpu
# !gym stable-baselines3 onnx onnxruntime
# !pip install onnx onnxruntime
!pip install optimum[onnxruntime] onnx
!pip install sentence-transformers
!pip install rouge_score
!pip install -U langsmith openai
!pip install langchain-openai ragas
!pip install -U langsmith openai

In [None]:
!pip install packaging==23.0 mlflow==2.17.2

IMPORTS AND SETUP

In [None]:
import json
import mlflow
from datasets import Dataset as HFDataset
from torch.utils.data import Dataset, DataLoader
import torch
# from optimum.intel import IncQuantizer
# from torch.ao.quantization import get_default_qconfig, quantize_dynamic
import torch.cuda
import random
from tqdm import tqdm
import mlflow.pytorch
# import onnx
# from onnxruntime.quantization import quantize_static, QuantizationMode, CalibrationDataReader, QuantType, QuantFormat
# import onnxruntime as ort
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from transformers import pipeline, ElectraForQuestionAnswering, ElectraTokenizer, AlbertForQuestionAnswering, AlbertTokenizer, AutoModelForQuestionAnswering, AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModel

import os
import time
# from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
# from mlflow.llm.evaluate import evaluate_llm, evaluate_rag
from langchain.chains import (
    RetrievalQA, StuffDocumentsChain, LLMChain, create_retrieval_chain
)
# from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings
# from langchain.llms import HuggingFaceLLM
from langchain.llms.base import LLM
from transformers import default_data_collator
from langchain_core.runnables.base import coerce_to_runnable
from langchain_core.load import dumpd, dumps, load, loads
import pickle
import uuid
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers import (
    BM25Retriever, EnsembleRetriever, ContextualCompressionRetriever
)
from __future__ import annotations
from typing import Dict, Optional, Sequence
from langchain.schema import Document
from accelerate import Accelerator
from torch.cuda.amp import autocast
# from langchain.pydantic_v1 import Extra, root_validator
from pydantic import Extra

from langchain.callbacks.manager import Callbacks
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor

from langchain.prompts import PromptTemplate, ChatPromptTemplate
from sentence_transformers import CrossEncoder
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain import hub
from langchain.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.document_transformers.long_context_reorder import LongContextReorder
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings



from kaggle_secrets import UserSecretsClient

# API credentials are read from Kaggle's secret store instead of being hard-coded.
user_secrets = UserSecretsClient()
os.environ["OPENAI_API_KEY"] = user_secrets.get_secret("OpenAI API Key")

# LangSmith tracing configuration, exposed to the libraries through the environment.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = user_secrets.get_secret("LangChain-API-key")
os.environ["LANGCHAIN_PROJECT"] = "Bio-ASQ-RAG-LangSmith-eval"

# Pipeline switches.
quantize_llm = True   # export the fine-tuned model to ONNX (see the "Fine Tuning" cell)
eval_llm = False      # run the ROUGE evaluation in the "LLM Evaluation" cell

# Output locations (relative to the working directory).
rag_chain_path = 'rag/rag_chain.pkl'
quantized_model_path = 'quantized_model'
quantized_tokenizer_path = 'quantized_tokenizer'
chroma_dir = 'Vectorstore/chromadb3'      # where a newly built vector store is written
chroma_dir_in = 'Vectorstore/chromadb3'   # where an existing vector store is looked up

# /kaggle/input/bioasq-rag-chromadb-vector-store/chromadb

directories = ['fine_tuned_model', quantized_model_path, quantized_tokenizer_path, 'fine_tuned_tokenizer', 'rag', chroma_dir, 'fine-tune-llm-results', 'fine-tune-llm-logs']
for directory in directories:
    # exist_ok makes the cell safe to re-run.
    os.makedirs(directory, exist_ok=True)

# set_experiment() creates the experiment when it is missing and activates it.
# Calling create_experiment() first raised an error on every re-run of this cell,
# because the experiment already existed.
experiment_id = mlflow.set_experiment('BioASQ RAG').experiment_id

CREATE DATASETS

In [None]:
fine_tune_llm_train_input_file = '/kaggle/input/bioasq-fine-tune-llm-data/bio_asq_train_dataset.json'
fine_tune_llm_test_input_file = '/kaggle/input/bioasq-fine-tune-llm-data/bio_asq_test_dataset.json'


def load_fine_tune_llm_datasets():
    """Load the prepared fine-tuning datasets from their JSON files.

    Returns:
        ``(train_data, test_data)`` parsed from the two JSON files, or
        ``(None, None)`` when either file is missing.
    """
    input_files = (fine_tune_llm_train_input_file, fine_tune_llm_test_input_file)
    if not all(os.path.exists(path) for path in input_files):
        return None, None

    parsed = []
    for path in input_files:
        with open(path, 'r') as handle:
            parsed.append(json.load(handle))
    train_data, test_data = parsed
    return train_data, test_data

def load_bioasq_data(train_file, test_file):
    """Read the BioASQ train/test JSON files and flatten each item into a QA record.

    Every record has the keys ``question``, ``context`` and ``ideal_answer``.
    The context is the concatenation of the ``title``/``abstract`` of each entry
    under ``articles`` and ``snippets`` (missing keys count as empty strings),
    followed by the space-joined ``concepts``.

    Args:
        train_file: Path to the training JSON file.
        test_file: Path to the test JSON file.

    Returns:
        ``(train_dataset, test_dataset)``, each a list of record dicts.
    """

    def _read_json(path):
        with open(path, 'r') as handle:
            return json.load(handle)

    def _title_abstract_text(entries):
        # Each entry contributes "<title> <abstract> " (note the trailing space).
        return "".join(
            f"{entry.get('title', '')} {entry.get('abstract', '')} " for entry in entries
        )

    def build_dataset(data):
        records = []
        for item in data:
            question = item['question']
            ideal_answer = item['ideal_answer']
            context = (
                _title_abstract_text(item.get('articles', []))
                + _title_abstract_text(item.get('snippets', []))
                + " ".join(item.get('concepts', []))
            )
            records.append({
                'question': question,
                'context': context,
                'ideal_answer': ideal_answer,
            })
        return records

    train_data = _read_json(train_file)
    test_data = _read_json(test_file)
    return build_dataset(train_data), build_dataset(test_data)


# Dataset creation stage: reuse the prepared fine-tuning datasets when they are
# available, otherwise build them from the raw BioASQ files and track them in MLflow.
mlflow.autolog()
with mlflow.start_run(run_name="Dataset Creation") as dataset_creation_run:
    train_dataset, test_dataset = load_fine_tune_llm_datasets()
    if not train_dataset:
        print('Fine tune LLM dataset Creation')
        mlflow.log_param('Fine tune LLM dataset loaded', False)

        # Build the datasets from the raw BioASQ training and testing files.
        train_file = '/kaggle/input/bio-asq/training12b_train.json'
        test_file = '/kaggle/input/bio-asq/training12b_test.json'
        train_dataset, test_dataset = load_bioasq_data(train_file, test_file)

        # Record the dataset sizes.
        mlflow.log_param('bio_asq_train_dataset_size', len(train_dataset))
        mlflow.log_param('bio_asq_test_dataset_size', len(test_dataset))

        # Write both datasets as JSON first, then attach the files to the run.
        train_output_file = 'bio_asq_train_dataset.json'
        test_output_file = 'bio_asq_test_dataset.json'
        for _path, _records in ((train_output_file, train_dataset), (test_output_file, test_dataset)):
            with open(_path, 'w') as f:
                json.dump(_records, f, indent=4)
        for _path in (train_output_file, test_output_file):
            mlflow.log_artifact(_path)
    else:
        print('Fine tune LLM dataset loaded')
        mlflow.log_param('Fine tune LLM dataset loaded', True)



FINE TUNE LLM

In [None]:
# Kaggle input locations of a previously fine-tuned T5 checkpoint and its tokenizer.
# fine_tune_llm() loads these instead of training when both paths exist.
finetuned_model_path = '/kaggle/input/bioasq-fine-tuned-t5-small/transformers/default/1'
finetuned_tokenizer_path = '/kaggle/input/bioasq-fine-tuned-t5-small-tokenizer/transformers/default/1'

# Kaggle input locations of the ONNX version of the fine-tuned model and its tokenizer.
# The "Fine Tuning" run loads these directly when both exist and quantize_llm is set.
quantized_finetuned_model_path = '/kaggle/input/bioasq-quantized-fine-tuned-t5-small/onnx/default/1'
quantized_finetuned_tokenizer_path = '/kaggle/input/bioasq-quantized-fine-tuned-t5-small-tokenizer/transformers/default/1'

class BioASQDataset(Dataset):
    """Torch dataset pairing tokenized inputs with tokenized target sequences.

    ``encodings`` maps feature names to per-example sequences and must contain
    ``input_ids``; ``labels`` must provide the target ids under ``input_ids``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the input ids.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Convert every feature of example `idx` to a tensor.
        example = {}
        for feature_name, feature_values in self.encodings.items():
            example[feature_name] = torch.tensor(feature_values[idx])
        # The target token ids are exposed under the "labels" key.
        example['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return example
    

def fine_tune_llm(train_dataset, test_dataset, model_name='t5-small'):
    """Fine-tune a pre-trained T5 model on BioASQ data, or reuse an existing checkpoint.

    When both ``finetuned_model_path`` and ``finetuned_tokenizer_path`` exist, the
    checkpoint stored there is loaded and returned without any training.

    Args:
        train_dataset: Records with ``question``, ``context`` and ``ideal_answer`` keys.
        test_dataset: Records of the same shape, used as the evaluation set.
        model_name: Hugging Face model id used when no checkpoint is available.

    Returns:
        ``(model, tokenizer)``.
    """
    # Reuse the stored fine-tuned checkpoint when both artifacts are present.
    if os.path.exists(finetuned_model_path) and os.path.exists(finetuned_tokenizer_path):
        print('Loading fine-tuned model and tokenizer from checkpoint files')
        model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path)
        tokenizer = T5Tokenizer.from_pretrained(finetuned_tokenizer_path)
        return model, tokenizer

    # Otherwise start from the pre-trained weights on Hugging Face.
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    print(f'Loaded model {model_name} and tokenizer from Hugging Face')

    def _encode_split(records):
        """Tokenize one split into a BioASQDataset (inputs contain question and context)."""
        prompts = [f"question: {item['question']} context: {item['context']}" for item in records]
        encodings = tokenizer(prompts, truncation=True, padding=True, max_length=256)
        targets = tokenizer([item['ideal_answer'] for item in records],
                            truncation=True, padding=True, max_length=256)
        # Padding positions in the labels are set to -100 so the loss ignores them;
        # otherwise the model is also trained to predict pad tokens.
        pad_id = tokenizer.pad_token_id
        masked_label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in targets['input_ids']
        ]
        return BioASQDataset(encodings, {'input_ids': masked_label_ids})

    train_data = _encode_split(train_dataset)
    test_data = _encode_split(test_dataset)

    # Define training arguments
    training_args = TrainingArguments(
        # Relative paths: these are the directories created by the setup cell.
        # The previous absolute '/fine-tune-llm-*' paths pointed at the filesystem root.
        output_dir='fine-tune-llm-results',
        evaluation_strategy="steps",
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='fine-tune-llm-logs',
        logging_steps=10,
        save_steps=100,
        save_total_limit=3,
        load_best_model_at_end=True,
        seed=42,
        data_seed=42,
        report_to="none",
        # Mixed precision needs a GPU; requesting it on a CPU-only machine fails.
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    return model, tokenizer


# Fine-tuning stage: produces the `llm` and `tokenizer` used by the later cells.
# A stored ONNX model is reused when available; otherwise the model is fine-tuned
# (or loaded from its checkpoint), saved, and optionally exported to ONNX.
mlflow.autolog()
with mlflow.start_run(run_name="Fine Tuning") as fine_tuning_run:
    if os.path.exists(quantized_finetuned_model_path) and os.path.exists(quantized_finetuned_tokenizer_path) and quantize_llm:
        print('Loading quantized_fine-tuned model and tokenizer from checkpoint files')
        llm = ORTModelForSeq2SeqLM.from_pretrained(quantized_finetuned_model_path)
        tokenizer = T5Tokenizer.from_pretrained(quantized_finetuned_tokenizer_path)
    else:
        model_name = 't5-small'
        fine_tuned_model, fine_tuned_tokenizer = fine_tune_llm(train_dataset, test_dataset, model_name)

        # Save the fine-tuned model
        fine_tuned_model.save_pretrained('fine_tuned_model')
        fine_tuned_tokenizer.save_pretrained('fine_tuned_tokenizer')

        # Log model parameters to MLFlow
        mlflow.log_param('fine_tuned_model_name', model_name)
        mlflow.log_param('quantize_llm', quantize_llm)

        # Save model weights to MLFlow
        torch.save(fine_tuned_model.state_dict(), 'fine_tuned_model_weights.pth')
        mlflow.log_artifact('fine_tuned_model_weights.pth', artifact_path='fine_tuned_model')

        if quantize_llm:
            print('Quantizing llm model')
            # Export from the directory written just above. The Kaggle input path
            # (finetuned_model_path) does not exist when the model was trained in
            # this session, so exporting from it failed in exactly that case.
            # NOTE: this step only exports to ONNX; no quantization config is applied.
            llm = ORTModelForSeq2SeqLM.from_pretrained(
                'fine_tuned_model',
                export=True,  # Exports the model to ONNX
            )
            tokenizer = fine_tuned_tokenizer

            # Save the exported model
            llm.save_pretrained(quantized_model_path)
            tokenizer.save_pretrained(quantized_tokenizer_path)
        else:
            llm = fine_tuned_model
            tokenizer = fine_tuned_tokenizer

EVALUATE LLM

In [None]:
# NOTE: this rebinds the name `load`, which the imports cell also imports from
# langchain_core.load; after this cell `load` refers to evaluate.load.
from evaluate import load
# Load the ROUGE metric
import evaluate

# ROUGE metric object used by evaluate_llm_with_huggingface() below.
rouge = load("rouge")

# Module-level device used by generate_from_chunks(): GPU when available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def chunk_text(text, max_tokens, tokenizer):
    """Split ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is tokenized with ``tokenizer.tokenize``; each window of tokens is
    turned back into a string with ``tokenizer.convert_tokens_to_string``.
    Returns the list of piece strings (empty when the text has no tokens).
    """
    tokens = tokenizer.tokenize(text)
    pieces = []
    for start in range(0, len(tokens), max_tokens):
        window = tokens[start:start + max_tokens]
        pieces.append(tokenizer.convert_tokens_to_string(window))
    return pieces

def generate_from_chunks(question, context, tokenizer, model, max_tokens=512):
    """Generate an answer by running the model on each context chunk and joining the outputs.

    Args:
        question: Question text placed in front of every chunk.
        context: Context text, split into chunks that fit the token budget.
        tokenizer: Tokenizer used for chunking, encoding and decoding.
        model: Model exposing ``generate``.
        max_tokens: Maximum number of input tokens per model call.

    Returns:
        The per-chunk answers joined with single spaces.
    """
    # Budget left for context once the question is accounted for; the extra 10
    # tokens are presumably headroom for the "question:"/"context:" scaffolding.
    chunk_budget = max_tokens - len(tokenizer.tokenize(question)) - 10

    # A non-positive budget used to crash (step 0) or silently produce no chunks
    # (negative step), and an empty context produced no chunks either, so the
    # model was never called. In those cases send the context once and rely on
    # the tokenizer's truncation below.
    chunks = chunk_text(context, chunk_budget, tokenizer) if chunk_budget > 0 else []
    if not chunks:
        chunks = [context]

    answers = []
    for chunk in chunks:
        input_text = f"question: {question} context: {chunk}"
        inputs = tokenizer(input_text, return_tensors="pt", max_length=max_tokens, truncation=True)
        # `device` is the module-level device defined alongside the ROUGE metric.
        inputs = {key: value.to(device) for key, value in inputs.items()}
        outputs = model.generate(
            inputs['input_ids'],
            max_length=150,
            num_beams=5,  # Beam width
            early_stopping=True,
            length_penalty=1.0  # Control length balance
        )
        answers.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return " ".join(answers)  # Combine chunk outputs

# Function to evaluate the fine-tuned model using Hugging Face's evaluator
def evaluate_llm_with_huggingface(model, tokenizer, test_dataset):
    """Score the model on the test set with ROUGE and log the scores to MLflow.

    Args:
        model: Model passed to ``generate_from_chunks``; it is moved to the GPU
            when one is available.
        tokenizer: Tokenizer matching the model.
        test_dataset: Records with ``question``, ``context`` and ``ideal_answer`` keys.

    Returns:
        The dict produced by ``rouge.compute``.
    """
    predictions = []
    references = []

    # The device move does not depend on the example, so it is done once here
    # instead of on every loop iteration.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for question_ans, item in enumerate(tqdm(test_dataset, desc="Evaluating LLM"), start=1):
        print(f'Answering question {question_ans}')
        question = item['question']
        context = item['context']
        ideal_answer = item['ideal_answer']

        # Release cached GPU memory between examples.
        torch.cuda.empty_cache()

        # Generate answer
        with autocast():
            pred = generate_from_chunks(question, context, tokenizer, model)

        # Store predictions and references
        predictions.append(pred)
        references.append(ideal_answer)

    # Evaluate using ROUGE
    result = rouge.compute(predictions=predictions, references=references)
    for key, value in result.items():
        mlflow.log_metric(key, value)
    print("ROUGE scores:", result)

    return result

# Evaluation stage: records the eval_llm flag and, only when it is enabled, scores
# the model on the test set. The run is created with the fine-tuning run as its parent.
mlflow.autolog()
with mlflow.start_run(run_name="LLM Evaluation", parent_run_id=fine_tuning_run.info.run_id) as llm_evaluation_run:
    mlflow.log_param('eval_llm', eval_llm)
    if eval_llm:
        llm_metrics = evaluate_llm_with_huggingface(llm, tokenizer, test_dataset)
        print(f"LLM Evaluation Metrics: {llm_metrics}")

CREATE RAG DATASET

In [None]:
# rag_train_input_file = '/kaggle/input/bioasq-rag-data/bio_asq_train_dataset_rag.json'
# rag_test_input_file = '/kaggle/input/bioasq-rag-data/bio_asq_test_dataset_rag.json'

# def load_rag_datasets():
#     if os.path.exists(rag_train_input_file) and os.path.exists(rag_test_input_file):
#         with open(rag_train_input_file, 'r') as f:
#             train_data = json.load(f)
#         with open(rag_test_input_file, 'r') as f:
#             test_data = json.load(f)
#         return train_data, test_data
#     return None, None

rag_train_input_file = '/kaggle/input/bioasq-rag-data/bio_asq_train_dataset_rag.pkl'
rag_test_input_file = '/kaggle/input/bioasq-rag-data/bio_asq_test_dataset_rag.pkl'


def load_rag_datasets():
    """Load the pickled RAG train/test datasets.

    Returns:
        ``(train_data, test_data)`` unpickled from the two files, or
        ``(None, None)`` when either file is missing.
    """
    input_files = (rag_train_input_file, rag_test_input_file)
    if not all(os.path.exists(path) for path in input_files):
        return None, None

    # NOTE(security): unpickling can execute arbitrary code; only point these
    # paths at files you trust.
    unpickled = []
    for path in input_files:
        with open(path, 'rb') as handle:
            unpickled.append(pickle.load(handle))
    train_data, test_data = unpickled
    return train_data, test_data



def create_rag_dataset(train_data, test_data):
    """Turn BioASQ QA records into chunked ``Document`` lists for retrieval.

    The ``context`` of every record is split into chunks of up to 450 characters
    with a 50-character overlap. Each chunk becomes one ``Document`` with a fresh
    UUID as id and metadata carrying the record's ``question`` and
    ``ideal_answer`` plus the chunk ``length``.

    Returns:
        ``(train_documents, test_documents)``.
    """

    def build_dataset(data):
        # Character-based splitter used for every record of this split.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=450,
            chunk_overlap=50,
            length_function=len,
            is_separator_regex=False,
        )
        return [
            Document(
                id=str(uuid.uuid4()),
                page_content=chunk,
                metadata={
                    'question': record['question'],
                    'ideal_answer': record['ideal_answer'],
                    'length': len(chunk),
                },
            )
            for record in data
            for chunk in splitter.split_text(record['context'])
        ]

    return build_dataset(train_data), build_dataset(test_data)


# RAG dataset stage: load the pickled chunked documents when they are available,
# otherwise build them from the fine-tuning datasets, pickle them and attach the
# files to the MLflow run. The run is created with the LLM evaluation run as its parent.
mlflow.autolog()
with mlflow.start_run(run_name="Dataset Creation for RAG", parent_run_id=llm_evaluation_run.info.run_id) as rag_dataset_creation_run:
    train_dataset_rag, test_dataset_rag = load_rag_datasets()
    if train_dataset_rag:
        print('RAG dataset loaded')
        mlflow.log_param('RAG dataset loaded', True)
        
    else:
        print('RAG dataset Creation')
        train_dataset_rag, test_dataset_rag = create_rag_dataset(train_dataset, test_dataset)

        # Output files for the pickled document lists.
        train_output_file = 'bio_asq_train_dataset_rag.pkl'
        test_output_file = 'bio_asq_test_dataset_rag.pkl'
        
        # Earlier JSON-based serialization, kept for reference (disabled):
        # # Save datasets as JSON files
        # train_output_file = 'bio_asq_train_dataset_rag.json'
        # test_output_file = 'bio_asq_test_dataset_rag.json'
    
        # train_string = dumps(train_dataset_rag, pretty=True)
        # test_string = dumps(test_dataset_rag, pretty=True)
    
        # with open(train_output_file, 'w') as f:
        #     json.dump(train_string, f, indent=4)
    
        # with open(test_output_file, 'w') as f:
        #     json.dump(test_string, f, indent=4)

        with open(train_output_file, 'wb') as f:
            pickle.dump(train_dataset_rag, f)
    
        with open(test_output_file, 'wb') as f:
            pickle.dump(test_dataset_rag, f)
    
        # Log the file paths to MLFlow for tracking
        mlflow.log_artifact(train_output_file)
        mlflow.log_artifact(test_output_file)

CREATE RAG PIPELINE

In [None]:
from langchain.llms.base import LLM
from typing import Optional
import torch

class ONNXLLM(LLM):
    """LangChain LLM wrapper around a seq2seq model and its tokenizer.

    The wrapped model only needs a ``generate`` method and the tokenizer needs to
    be callable and provide ``decode``; in this notebook they are the fine-tuned
    (optionally ONNX-exported) T5 model and its tokenizer.
    """

    model: Optional[object] = None  # Use Optional for model and tokenizer since they're not JSON serializable
    tokenizer: Optional[object] = None
    device: Optional[torch.device] = None

    def __init__(self, model, tokenizer, device):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    @property
    def _llm_type(self) -> str:
        return "onnx_seq2seq"

    def _call(self, prompt: str, stop: Optional[list] = None, run_manager=None, **kwargs) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Prompt text; it is truncated to 512 tokens.
            stop: Optional stop sequences; the output is cut at the first
                occurrence of any of them.
            run_manager: Callback manager supplied by LangChain; not used here.
            **kwargs: Extra call-time arguments forwarded by LangChain; accepted so
                that forwarding them does not raise, but otherwise ignored.

        Returns:
            The decoded text of the best beam, without special tokens.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: value.to(self.device) for key, value in inputs.items()}
        outputs = self.model.generate(
            inputs["input_ids"],
            max_length=150,
            num_beams=5,
            early_stopping=True,
            length_penalty=1.0
        )
        text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Honour stop sequences instead of silently ignoring them.
        for marker in stop or []:
            if marker:
                text = text.split(marker, 1)[0]
        return text


class BgeRerank(BaseDocumentCompressor):
    """Document compressor that keeps the ``top_n`` documents ranked by a cross-encoder.

    NOTE: the cross-encoder is instantiated once, when the class body is executed,
    from the default ``model_name``; changing ``model_name`` on an instance does
    not load a different model.
    """

    model_name: str = 'BAAI/bge-reranker-v2-m3'
    top_n: int = 5
    model: CrossEncoder = CrossEncoder(model_name)

    def _score_texts(self, query, texts):
        """Return one cross-encoder relevance score per text, in input order."""
        return self.model.predict([[query, text] for text in texts])

    def bge_rerank(self, query, docs):
        """Rank text passages against ``query``.

        Returns the ``top_n`` best ``(text, score)`` pairs, highest score first.
        """
        scores = self._score_texts(query, docs)
        results = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
        return results[:self.top_n]

    class Config:

        extra = Extra.allow
        arbitrary_types_allowed = True

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Optional[Callbacks] = None,
    ) -> Sequence[Document]:
        """
        Compress documents using BAAI/bge-reranker models.

        Args:
            documents: A sequence of documents to compress.
            query: The query to use for compressing the documents.
            callbacks: Callbacks to run during the compression process.

        Returns:
            The ``top_n`` highest-scoring documents, best first. Each returned
            document has its score stored in ``metadata["relevance_score"]``
            (the metadata of the input documents is updated in place).
        """
        if len(documents) == 0:
            return []
        doc_list = list(documents)

        # Score and rank the Document objects themselves. Mapping the ranked texts
        # back through a {page_content: document} dict collapsed documents that
        # share the same text, so the same object could be returned several times.
        scores = self._score_texts(query, [d.page_content for d in doc_list])
        ranked = sorted(zip(doc_list, scores), key=lambda pair: pair[1], reverse=True)

        final_results = []
        for doc, score in ranked[:self.top_n]:
            doc.metadata["relevance_score"] = score
            final_results.append(doc)
        return final_results

class BiomedRobertaEmbeddings(Embeddings):
    """LangChain embeddings backed by a transformer encoder.

    Each text is embedded as the hidden state of the first token of the model's
    last layer.
    """

    def __init__(self, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = device

    # Built-in generics are used in the annotations: `typing.List` is not imported
    # anywhere in this notebook, so the previous `List[...]` annotations only
    # worked while annotation evaluation was postponed.
    def embed_query(self, text: str) -> list[float]:
        """Embed a single text; the input is truncated/padded to 512 tokens."""
        tokens = self.tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
        tokens = {key: value.to(self.device) for key, value in tokens.items()}
        with torch.no_grad():
            outputs = self.model(**tokens)
            # Use CLS token representation as embedding
            return outputs.last_hidden_state[:, 0, :].cpu().numpy().tolist()[0]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed every text with ``embed_query``, one at a time, preserving order."""
        return [self.embed_query(text) for text in texts]

def vectorstore_add_documents_in_batches(vectorstore, documents, batch_size=40000):
    """Insert ``documents`` into ``vectorstore`` in slices of at most ``batch_size``.

    A progress line is printed before each call to ``vectorstore.add_documents``.
    """
    batch_starts = range(0, len(documents), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = documents[start:start + batch_size]
        print(f"Adding documents to vectorstore: batch {batch_number}: {len(batch)} documents")
        vectorstore.add_documents(batch)

# The parameter annotation is quoted: `typing.List` is not imported anywhere in
# this notebook, so the unquoted `List[Document]` only worked while annotation
# evaluation was postponed.
def format_documents_with_scores(documents: "list[Document]") -> str:
    """Render documents as text blocks separated by blank lines.

    Each block shows the document's ``page_content`` and its
    ``metadata["relevance_score"]`` (``N/A`` when no score is present).
    Returns an empty string for an empty list.
    """
    formatted_docs = [
        f"Content: {doc.page_content} \nRelevance Score: {doc.metadata.get('relevance_score', 'N/A')}" for doc in documents
    ]
    return "\n\n".join(formatted_docs)

def create_rag_pipeline(train_dataset_rag):
    """Create the retrieval-augmented QA chain from the fine-tuned model and ChromaDB.

    Retrieval combines BM25 with vector search; the retrieved documents go through
    a redundancy filter, a reordering step and the BgeRerank cross-encoder before
    being stuffed into the prompt for the LLM. The module-level ``llm`` and
    ``tokenizer`` are used as the generator.

    Args:
        train_dataset_rag: Chunked ``Document`` list used for the BM25 index and,
            when the vector store is still empty, to populate it.

    Returns:
        The chain built by ``create_retrieval_chain``.
    """
    use_cuda = torch.cuda.is_available()
    mlflow.log_param('device', 'cuda' if use_cuda else 'cpu')

    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cuda:0" if use_cuda else "cpu"},
    )

    # Always open the collection the documents are written to, and decide whether
    # to populate it by looking at its content. Previously an existing directory
    # was treated as a populated store and opened with the default collection
    # name; because the setup cell always creates the directory, the documents
    # were never added and vector search ran against an empty collection.
    persist_dir = chroma_dir_in if os.path.exists(chroma_dir_in) else chroma_dir
    vectorstore = Chroma(
        embedding_function=embedding_model,
        persist_directory=persist_dir,
        collection_name="bioasq_train_documents",
    )
    if vectorstore.get(limit=1)["ids"]:
        print('Vector store loaded from disk')
    else:
        print('Vector store initialized')

        vectorstore_add_documents_in_batches(vectorstore, train_dataset_rag, batch_size=40000)

        print('Added documents to Vector store')
        vectorstore.persist()
        print('Vector store persisted')

        mlflow.log_param('rag_vectorstore_description', 'Chroma Vectorstore for BioASQ')
        mlflow.log_artifact(persist_dir)

    device = torch.device('cuda' if use_cuda else 'cpu')
    llm.to(device)
    langchain_llm = ONNXLLM(model=llm, tokenizer=tokenizer, device=device)

    print('Creating bm25 retriever')
    bm25_retriever = BM25Retriever.from_documents(train_dataset_rag)
    bm25_retriever.k = 15
    print('bm25 retriever created')
    vs_retriever = vectorstore.as_retriever(search_kwargs={"k": 15})
    print('vs retriever created')

    # The field is called `weights`; the previous `weight=` keyword was a typo.
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, vs_retriever], weights=[0.5, 0.5]
    )
    print('ensemble retriever created')

    redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding_model)
    reordering = LongContextReorder()
    reranker = BgeRerank()

    # NOTE(review): the reranker runs last and returns documents sorted by score,
    # so the ordering produced by LongContextReorder is not what reaches the
    # prompt. The original step order is kept; confirm whether this is intended.
    pipeline_compressor = DocumentCompressorPipeline(
        transformers=[redundant_filter, reordering, reranker]
    )
    print('DocumentCompressorPipeline created')

    compression_pipeline = ContextualCompressionRetriever(
        base_compressor=pipeline_compressor, base_retriever=ensemble_retriever
    )
    print('ContextualCompressionRetriever created')

    runnable_llm = coerce_to_runnable(langchain_llm)

    qa_prompt_template = """Answer the following question based on the provided context
    <context>
    {context}
    </context>

    Question:
    {input}
    
    Answer:"""

    # Per-document template; relevance_score is the metadata entry set by BgeRerank.
    document_prompt = PromptTemplate(
        template="Context:\nrelevance score:{relevance_score}\ncontent:{page_content}",
        input_variables=["page_content", "relevance_score"],
    )
    qa_prompt = PromptTemplate.from_template(qa_prompt_template)

    combine_docs_chain = create_stuff_documents_chain(
        llm=runnable_llm,
        prompt=qa_prompt,
        document_prompt=document_prompt
    )
    qa_advanced = create_retrieval_chain(compression_pipeline, combine_docs_chain)
    print('RAG pipeline created')

    mlflow.log_param('rag_pipeline_created', True)
    return qa_advanced

def save_rag_pipeline(rag_chain, rag_chain_path):
    """Pickle the RAG chain to ``rag_chain_path`` and log the file as an MLflow artifact.

    Args:
        rag_chain: The chain object to serialize.
        rag_chain_path: Destination file path.
    """
    # The stray `rag_chain.llm` expression was removed: it had no effect and raises
    # AttributeError for chain objects without an `llm` attribute.
    target_dir = os.path.dirname(rag_chain_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(rag_chain_path, 'wb') as f:
        pickle.dump(rag_chain, f)
    print('RAG pipeline saved')
    mlflow.log_artifact(rag_chain_path)

# Pipeline stage: builds the module-level `rag_chain` used by the evaluation helpers.
# The run is created with the RAG dataset creation run as its parent.
mlflow.autolog()
with mlflow.start_run(run_name="RAG Pipeline Creation", parent_run_id=rag_dataset_creation_run.info.run_id) as rag_pipeline_creation_run:
    rag_chain = create_rag_pipeline(train_dataset_rag)
    # Saving the chain is currently disabled.
    # save_rag_pipeline(rag_chain, rag_chain_path)



EVALUATE RAG PIPELINE

In [None]:
from langsmith import Client
from langsmith.evaluation import LangChainStringEvaluator, evaluate
import textwrap

def get_unique_qa_pairs(test_dataset_rag):
    """Collapse chunk-level documents into distinct question/answer pairs.

    Each item's ``metadata`` supplies ``question`` and ``ideal_answer``. Pairs are
    returned in first-seen order as ``{"question": ..., "answer": ...}`` dicts,
    and a summary line is printed.
    """
    # dict.fromkeys keeps the first occurrence of every (question, answer) pair
    # while preserving insertion order.
    first_seen = dict.fromkeys(
        (item.metadata['question'], item.metadata['ideal_answer'])
        for item in test_dataset_rag
    )
    unique_qa_pairs = [
        {"question": question, "answer": answer}
        for question, answer in first_seen
    ]
    print(f"Filtered {len(test_dataset_rag)} items to {len(unique_qa_pairs)} unique QA pairs.")
    return unique_qa_pairs

def predict_rag_answer(example: dict):
    """Run the module-level RAG chain on ``example["input"]`` and return only its answer.

    Intended as the prediction target for answer evaluation.
    """
    chain_input = {"input": example["input"]}
    chain_output = rag_chain.invoke(chain_input)
    answer_text = chain_output['answer']
    return {"answer": answer_text}

def predict_rag_answer_with_context(example: dict):
    """Run the module-level RAG chain and return its answer plus the retrieved texts.

    Intended for evaluating the retrieved documents and hallucinations: ``contexts``
    holds the ``page_content`` of every document the chain returned under ``context``.
    """
    chain_output = rag_chain.invoke({"input": example["input"]})
    retrieved_texts = []
    for retrieved_doc in chain_output['context']:
        retrieved_texts.append(retrieved_doc.page_content)
    return {"answer": chain_output['answer'], "contexts": retrieved_texts}

# Grading prompt pulled from the LangChain hub; also bound to `prompt`.
grade_prompt_answer_accuracy = prompt = hub.pull("langchain-ai/rag-answer-vs-reference")

def answer_evaluator(run, example) -> dict:
    """
    A simple evaluator for RAG answer accuracy.

    Args:
        run: Evaluation run; ``run.outputs["answer"]`` is the RAG chain's answer.
        example: Dataset example holding the question and the reference answer.

    Returns:
        ``{"key": "answer_v_reference_score", "score": <grader score>}``.
    """
    # The evaluation dataset in this notebook is created with the keys "input"
    # and "expected". Reading only "input_question"/"output_answer" raised
    # KeyError, so those names are kept purely as a fallback.
    example_inputs = example.inputs
    example_outputs = example.outputs
    input_question = (
        example_inputs["input"] if "input" in example_inputs
        else example_inputs["input_question"]
    )
    reference = (
        example_outputs["expected"] if "expected" in example_outputs
        else example_outputs["output_answer"]
    )
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_answer_accuracy | llm

    # Run evaluator
    score = answer_grader.invoke({"question": input_question,
                                  "correct_answer": reference,
                                  "student_answer": prediction})
    # The grader result is expected to expose the grade under "Score".
    score = score["Score"]

    return {"key": "answer_v_reference_score", "score": score}

def evaluate_rag_pipeline(rag_chain, test_dataset_rag, progress_file, batch_size = 20, sample_size = 50):
    """Evaluate the RAG pipeline on a random sample of the test set with LangSmith.

    Steps:
      1. De-duplicate the test items into unique question/answer pairs.
      2. Randomly sample up to ``sample_size`` pairs (no seed is set here, so
         the sample differs between runs unless ``random`` is seeded by the caller).
      3. Upload the sample as a new LangSmith dataset with a random name suffix.
      4. Run two LangSmith experiments with ``predict_rag_answer_with_context``
         as the target: answer groundedness ("labeled_score_string") and
         retrieved-document relevance ("score_string").

    Args:
        rag_chain: Not referenced in this function; the prediction target reads
            the module-level ``rag_chain`` instead.
        test_dataset_rag: Items accepted by ``get_unique_qa_pairs``.
        progress_file: Not referenced in this function.
        batch_size: Not referenced in this function.
        sample_size: Maximum number of unique QA pairs to evaluate. Capped at
            the number of unique pairs available.

    Returns:
        Dict with the two experiment results under the keys
        ``"hallucination"`` and ``"document_relevance"``.

    Raises:
        ValueError: If ``sample_size`` is less than 1 or there are no QA pairs.
    """
    print('Evaluating RAG pipeline')
    client = Client()
    # client.init_project("Bio-ASQ-RAG-LangSmith-eval")

    unique_qa_pairs = get_unique_qa_pairs(test_dataset_rag)

    # Validate before touching LangSmith so a bad input does not leave an
    # empty dataset behind.
    if sample_size < 1:
        raise ValueError(f"sample_size must be at least 1, got {sample_size}")
    if not unique_qa_pairs:
        raise ValueError("No unique QA pairs available to evaluate.")
    # random.sample raises when asked for more items than exist, so cap it.
    n_samples = min(sample_size, len(unique_qa_pairs))
    sampled_qa_pairs = random.sample(unique_qa_pairs, n_samples)

    dataset_name = f"BioASQ RAG evaluation {uuid.uuid4().hex[:4]}"
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Test QA for BioASQ RAG",
    )
    inputs = [{"input": qa["question"]} for qa in sampled_qa_pairs]
    outputs = [{"expected": qa["answer"]} for qa in sampled_qa_pairs]
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

    # qa_evalulator = [
    #     LangChainStringEvaluator(
    #         "cot_qa",
    #         prepare_data=lambda run, example: (
    #             {
    #                 "prediction": run.outputs["answer"],
    #                 "reference": example.outputs["expected"],
    #                 "input": example.inputs["input"],
    #             }
    #         ),
    #     )
    # ]
    # qa_evalulator_experiment_results = evaluate(
    #     predict_rag_answer,
    #     data=dataset_name,
    #     evaluators=qa_evalulator,
    #     experiment_prefix=f"rag-bioasq-oai",
    #     metadata={"variant": "LCEL context, gpt-4o-mini"},
    # )

    # Groundedness: the retrieved contexts act as the "reference" the answer
    # is scored against.
    answer_hallucination_evaluator = LangChainStringEvaluator(
        "labeled_score_string",
        config={
            "criteria": {
                "accuracy": """Is the Assistant's Answer grounded in the Ground Truth documentation? A score of [[1]] means that the
                Assistant answer contains is not at all based upon / grounded in the Groun Truth documentation. A score of [[5]] means 
                that the Assistant answer contains some information (e.g., a hallucination) that is not captured in the Ground Truth 
                documentation. A score of [[10]] means that the Assistant answer is fully based upon the in the Ground Truth documentation."""
            },
            # If you want the score to be saved on a scale from 0 to 1
            "normalize_by": 10,
        },
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": run.outputs["contexts"],
            "input": example.inputs["input"],
        },
    )
    answer_hallucination_evaluator_experiment_results = evaluate(
        predict_rag_answer_with_context,
        data=dataset_name,
        evaluators=[answer_hallucination_evaluator],
        experiment_prefix="rag-bioasq-oai-hallucination",
        # Any experiment metadata can be specified here
        metadata={
            "variant": "LCEL context, gpt-4o-mini",
        },
    )

    # Retrieval relevance: the retrieved contexts are the "prediction" scored
    # against the input question.
    docs_relevance_evaluator = LangChainStringEvaluator(
        "score_string",
        config={
            "criteria": {
                "document_relevance": textwrap.dedent(
                    """The response is a set of documents retrieved from a vectorstore. The input is a question
                used for retrieval. You will score whether the Assistant's response (retrieved docs) is relevant to the Ground Truth 
                question. A score of [[1]] means that none of the  Assistant's response documents contain information useful in answering or addressing the user's input.
                A score of [[5]] means that the Assistant answer contains some relevant documents that can at least partially answer the user's question or input. 
                A score of [[10]] means that the user input can be fully answered using the content in the first retrieved doc(s)."""
                )
            },
            # If you want the score to be saved on a scale from 0 to 1
            "normalize_by": 10,
        },
        prepare_data=lambda run, example: {
            "prediction": run.outputs["contexts"],
            "input": example.inputs["input"],
        },
    )
    docs_relevance_evaluator_experiment_results = evaluate(
        predict_rag_answer_with_context,
        data=dataset_name,
        evaluators=[docs_relevance_evaluator],
        experiment_prefix="rag-bioasq-oai-doc-relevance",
        # Any experiment metadata can be specified here
        metadata={
            "variant": "LCEL context, gpt-4o-mini",
        },
    )

    return {
        "hallucination": answer_hallucination_evaluator_experiment_results,
        "document_relevance": docs_relevance_evaluator_experiment_results,
    }

        

# Turn on MLflow autologging, then run the RAG evaluation inside an MLflow run
# named "RAG Pipeline Evaluation" whose parent is rag_pipeline_creation_run
# (defined outside this cell).
mlflow.autolog()
with mlflow.start_run(run_name="RAG Pipeline Evaluation", parent_run_id=rag_pipeline_creation_run.info.run_id) as rag_pipeline_evaluation_run:
    # Hard-coded Kaggle input path, passed as evaluate_rag_pipeline's progress_file argument.
    progress_file = '/kaggle/input/bioasq-rag-evaluation-progress/eval_progress_2.json'
    # eval_results holds whatever evaluate_rag_pipeline returns; it is not logged
    # to MLflow while the line below stays commented out.
    eval_results = evaluate_rag_pipeline(rag_chain, test_dataset_rag, progress_file)
    # mlflow.log_metric('rag_pipeline_evaluation', eval_results)
