In [None]:
from datasets import load_dataset


In [None]:
import os
from getpass import getpass

# Never hardcode API keys in a notebook: they leak through version control,
# shared copies and rendered exports. Reuse a key that is already present in
# the environment; otherwise prompt for one without echoing it to the screen.
# Any key that was previously committed in this cell should be treated as
# compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings
from tqdm.auto import tqdm
import time

# Embedding wrappers: each class exposes encode(text) and returns a NumPy array
class OpenAIEmbedding:
    """Thin wrapper around LangChain's ``OpenAIEmbeddings`` exposing ``encode(text)``.

    Args:
        dimensions: Requested size of the returned vectors. It is forwarded
            to the ``text-embedding-3-large`` model (defaults to 3072).
    """

    def __init__(self, dimensions=3072):
        # Forward the caller's value: it used to be ignored in favour of a
        # hard-coded 1024, so the argument had no effect at all.
        self.embedding_model = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=dimensions,
        )

    def encode(self, text):
        """Embed a single string and return its vector as a NumPy array."""
        # embed_documents takes a list of texts; wrap the single text and
        # unwrap the single result.
        vectors = self.embedding_model.embed_documents([text])
        return np.array(vectors[0])

class HuggingFaceEmbedding:
    """Embed text with a Hugging Face ``AutoModel`` and its matching tokenizer.

    The model is placed on CUDA when it is available, otherwise on CPU.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Resolve the device once so the model and its inputs always agree.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def encode(self, text):
        """Return the pooled embedding of ``text`` as a NumPy array.

        The model's ``pooler_output`` is used when the output object provides
        one; otherwise the embedding is the mean of ``last_hidden_state`` over
        dim 1. Inputs are truncated to at most 512 tokens.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = inputs.to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Some output objects have no ``pooler_output`` field at all; plain
        # attribute access would raise AttributeError before the fallback
        # could run, so look it up defensively.
        pooled_output = getattr(outputs, "pooler_output", None)
        if pooled_output is None:
            # NOTE(review): unmasked mean. Equivalent to a masked mean for a
            # single text (no padding); revisit if batched inputs are added.
            pooled_output = outputs.last_hidden_state.mean(dim=1)
        # Move back to CPU for numpy compatibility and drop the batch axis.
        return pooled_output.cpu().numpy().squeeze()
        

def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Returns 0.0 when either vector has zero norm. Dividing by zero there
    would yield NaN, which would silently corrupt any ranking built from
    the scores.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

# Evaluation: per-model tqdm progress bar plus total wall-clock timing

def evaluate_embeddings(dataset, embeddings, k=1):
    """Score each embedding model on how highly it ranks an example's first context.

    For every example the question and each of its contexts are embedded, the
    contexts are ordered by descending cosine similarity to the question, and
    the 1-based rank of context 0 is recorded.

    Args:
        dataset: Iterable of examples providing ``example['question']`` and
            ``example['context']['contexts']``.
        embeddings: Mapping of display name -> object with ``encode(text)``.
        k: Rank cut-off; an example is a hit when context 0 ranks within ``k``.

    Returns:
        A DataFrame with one row per model and the columns ``Embedding``,
        ``Hit Rate`` (mean of the hit indicators) and ``MRR`` (mean of the
        reciprocal ranks). The total elapsed time is printed before returning.
    """
    started_at = time.time()
    rows = []

    for model_name, model in embeddings.items():
        hits = []
        reciprocal_ranks = []

        for sample in tqdm(dataset, desc=f"Evaluating {model_name}"):
            query_vec = model.encode(sample['question'])
            passage_vecs = [model.encode(passage) for passage in sample['context']['contexts']]
            similarities = [cosine_similarity(query_vec, vec) for vec in passage_vecs]

            # Context indices ordered from most to least similar.
            ranking = np.argsort(similarities)[::-1].tolist()
            # Context 0 is treated as the target passage.
            rank = ranking.index(0) + 1

            hits.append(1 if rank <= k else 0)
            reciprocal_ranks.append(1 / rank)

        rows.append({
            "Embedding": model_name,
            "Hit Rate": np.mean(hits),
            "MRR": np.mean(reciprocal_ranks),
        })

    print(f"Total evaluation time: {time.time() - started_at:.2f} seconds")
    return pd.DataFrame(rows)

# Load the first 1000 examples of the labeled PubMedQA split.
dataset = load_dataset("pubmed_qa", "pqa_labeled", split='train[:1000]')

# Models under comparison; each key is the label shown in the results table.
embeddings = {
    "OpenAI": OpenAIEmbedding(dimensions=1024),
    "bge-large": HuggingFaceEmbedding(model_name='BAAI/bge-large-en'),
    # This entry loads generic BERT, so it is labeled as such; the previous
    # "pubmedbert" label misattributed the results.
    # NOTE(review): if a PubMedBERT checkpoint was intended, change model_name
    # (and this label) accordingly.
    "bert-base-uncased": HuggingFaceEmbedding(model_name='bert-base-uncased'),
}

# Run the benchmark with a hit-rate cut-off of k=5 and show the results table.
eval_results = evaluate_embeddings(dataset, embeddings, k=5)
print(eval_results)

Evaluating OpenAI: 100%
 1000/1000 [18:59<00:00,  1.15it/s]
Evaluating bge-large: 100%
 1000/1000 [01:56<00:00,  9.20it/s]
Evaluating pubmedbert: 100%
 1000/1000 [00:52<00:00, 20.04it/s]
Total evaluation time: 1308.37 seconds

Embedding	Hit Rate	MRR
0	OpenAI	1.000	0.849433
1	bge-large	0.986	0.588398
2	pubmedbert	0.985	0.550596