In [None]:
!pip install evaluate

In [None]:
from datasets import load_dataset
# Load the four medical QA benchmarks from the Hugging Face Hub.
# NOTE(review): among the cells shown in this notebook only `mmlu_medical` is
# consumed later; the other three are loaded but not referenced — confirm they are needed.
pubmedqa = load_dataset("nnilayy/pubmedqa-qoccl")
medmcqa = load_dataset("nnilayy/medmcqa-qoccs")
medqa = load_dataset("nnilayy/medqa-qoc")
mmlu_medical = load_dataset("nnilayy/mmlu-medical-qoc")

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face access token from the Kaggle secret store instead of
# hard-coding it; the secret must be registered under the name "HUGGING_FACE_API_KEY".
user_secrets = UserSecretsClient()
hugging_face_api_key = user_secrets.get_secret("HUGGING_FACE_API_KEY")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Pad on the left: the generation cells strip the prompt by slicing off the first
# `input_ids.shape[1]` positions of every output row, which requires the prompt
# (including its padding) to sit entirely before the generated tokens.
tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                          padding_side="left",
                                          token=hugging_face_api_key
                                         )
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# The evaluation cells below call `model.eval()` / `model.generate(...)` and register
# `model` in `models_dict`, so the model has to be loaded here for a fresh
# "Restart & Run All" to succeed.
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             device_map="auto",
                                             torch_dtype=torch.float16,
                                             token=hugging_face_api_key,
                                            )

In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Torch dataset view over a tokenized table.

    Each item is a dict with:
      * ``input_ids``      - ``torch.Tensor`` built from the row's ``input_ids``
      * ``attention_mask`` - ``torch.Tensor`` built from the row's ``attention_mask``
      * ``labels``         - the row's ``Correct Answer`` value, passed through unchanged

    Args:
        dataset: Either a row-indexable table (``dataset[idx]`` returns a mapping of
            column name to value, e.g. a Hugging Face ``Dataset``) or a plain ``dict``
            mapping column names to sequences.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        if isinstance(self.dataset, dict):
            # A plain dict of columns can only be addressed column-first.
            row = {
                column: self.dataset[column][idx]
                for column in ('input_ids', 'attention_mask', 'Correct Answer')
            }
        else:
            # Fetch the row once. Column-first access (`dataset[col][idx]`) reads a
            # whole column for every field of every sample, which is needlessly slow.
            row = self.dataset[idx]

        # Convert lists or numpy arrays to tensors; the label is left as stored.
        return {
            'input_ids': torch.tensor(row['input_ids']),
            'attention_mask': torch.tensor(row['attention_mask']),
            'labels': row['Correct Answer'],
        }

In [None]:
from tqdm import tqdm 
from evaluate import load
from torch.utils.data import DataLoader


class DataHelper:
    """Holds datasets, a tokenizer and tokenization options, and derives new dataset dicts.

    Derived dicts are keyed by the source key plus a suffix: ``"_tokenized"`` for
    tokenized datasets and ``"_dataloader"`` for dataloaders.
    """

    def __init__(self):
        # Dataset registries: raw input, the dict currently being worked on, and derived stages.
        self.datasets_dict = None
        (self.current_datasets_dict,
         self.formatted_datasets_dict,
         self.tokenized_datasets_dict) = {}, {}, {}

        self.dataloaders_dict = {}

        self.tokenizer = None

        # Prompt pieces.
        self.system_instruction = "You are a Helpful AI Assistant."
        self.user_instruction = "Please answer the following Question: "
        self.user_query = None

        # Batching / tokenization options, filled in by `set_dataset_config`.
        self.batch_size = self.shuffle = self.max_length = None
        self.return_tensors = self.padding = self.truncation = None

        # Column configuration.
        self.user_query_column = self.column_to_tokenize = None

    # ---- datasets -------------------------------------------------------
    def load_datasets_dict(self, datasets_dict):
        """Register `datasets_dict` as both the raw and the current dict; return it."""
        self.datasets_dict = self.current_datasets_dict = datasets_dict
        return datasets_dict

    # ---- configuration --------------------------------------------------
    def set_dataset_config(self, dataset_configuration):
        """Copy the expected options from `dataset_configuration` onto the instance.

        Raises:
            KeyError: if any expected key is missing (keys before it are already applied).
        """
        expected_keys = (
            'batch_size',
            'shuffle',
            'max_length',
            'return_tensors',
            'padding',
            'truncation',
            'column_to_tokenize',
        )
        for option in expected_keys:
            setattr(self, option, dataset_configuration[option])

    # ---- tokenizer ------------------------------------------------------
    def load_tokenizer(self, tokenizer):
        """Store `tokenizer` and return it."""
        self.tokenizer = tokenizer
        return tokenizer

    # ---- prompts --------------------------------------------------------
    def set_system_instruction(self, system_instruction):
        """Store the whitespace-stripped system instruction and return it."""
        cleaned = system_instruction.strip()
        self.system_instruction = cleaned
        return cleaned

    def set_user_instruction(self, user_instruction):
        """Store the whitespace-stripped user instruction and return it."""
        cleaned = user_instruction.strip()
        self.user_instruction = cleaned
        return cleaned

    # ---- dataloaders ----------------------------------------------------
    def datasets_to_dataloader(self):
        """Wrap each dataset of the most-derived non-empty dict in a `DataLoader`.

        Preference order: tokenized, then formatted, then raw datasets. The chosen
        dict also becomes `current_datasets_dict`. Returns `dataloaders_dict`.
        """
        self.current_datasets_dict = (
            self.tokenized_datasets_dict
            or self.formatted_datasets_dict
            or self.datasets_dict
        )

        for name, split in self.current_datasets_dict.items():
            loader = DataLoader(CustomDataset(split),
                                batch_size=self.batch_size,
                                shuffle=self.shuffle)
            self.dataloaders_dict[name + "_dataloader"] = loader
        return self.dataloaders_dict

    # ---- tokenization ---------------------------------------------------
    def tokenization_function(self, example):
        """Tokenize the configured column of `example` with the stored options."""
        tokenizer_kwargs = {
            'text': example[self.column_to_tokenize],
            'padding': self.padding,
            'truncation': self.truncation,
            'max_length': self.max_length,
            'return_tensors': self.return_tensors,
        }
        return self.tokenizer(**tokenizer_kwargs)

    def tokenize_datasets(self):
        """Map `tokenization_function` (batched) over every current dataset.

        Results are stored under ``<name>_tokenized``; returns `tokenized_datasets_dict`.
        """
        for name, split in self.current_datasets_dict.items():
            tokenized_key = name + "_tokenized"
            self.tokenized_datasets_dict[tokenized_key] = split.map(self.tokenization_function,
                                                                    batched=True)
        return self.tokenized_datasets_dict

    def remove_columns(self):
        """Work out which columns the current dataset has that the raw one lacks.

        NOTE(review): unfinished - the difference is computed and then discarded;
        nothing is removed and the method returns None.
        """
        first_name = list(self.datasets_dict.keys())[0]
        raw_columns = set(self.datasets_dict[first_name].features.keys())
        current_columns = set(self.current_datasets_dict[first_name].features.keys())
        added_columns = list(current_columns - raw_columns)
        return None

In [None]:
# from helper_classes.datahelper import DataHelper
class CustomDataHelper(DataHelper):
    """`DataHelper` variant that renders each question/options row into a chat prompt string."""

    def __init__(self):
        super().__init__()
        # Overwritten with the question / options of whichever row was formatted last.
        self.current_question = self.current_options = None

    def processing_fn_format(self, example):
        """Add a ``'message'`` entry to `example` holding the rendered chat prompt.

        The prompt is a system turn (`system_instruction`) followed by a user turn made
        of `user_instruction`, the row's ``'Question'`` and its ``'Options'``, rendered
        through the tokenizer's chat template as text (``tokenize=False``).

        Raises:
            Exception: if no tokenizer has been loaded.
        """
        if not self.tokenizer:
            raise Exception("Tokenizer has not been provided. Please load a tokenizer")

        self.current_question = example['Question']
        self.current_options = example['Options']

        user_content = (
            self.user_instruction
            + "\nQuestion: " + str(self.current_question)
            + "\nOptions:\n" + str(self.current_options)
        )
        chat_message = [
            {"role": "system", "content": self.system_instruction},
            {"role": "user", "content": user_content},
        ]
        example['message'] = self.tokenizer.apply_chat_template(
            chat_message,
            tokenize=False,
            add_generation_prompt=True,
            return_tensors=self.return_tensors,
        )
        return example

    def format_datasets(self):
        """Map `processing_fn_format` over every raw dataset.

        Results are stored under ``<name>_formatted`` in `formatted_datasets_dict`,
        which becomes `current_datasets_dict` as soon as one dataset has been
        formatted. Returns `current_datasets_dict`.
        """
        raw_datasets = self.datasets_dict
        self.current_datasets_dict = raw_datasets
        for name, split in raw_datasets.items():
            self.formatted_datasets_dict[name + "_formatted"] = split.map(self.processing_fn_format)
            self.current_datasets_dict = self.formatted_datasets_dict
        return self.current_datasets_dict

In [None]:
# Subsets of `mmlu_medical` to prepare; the subset name doubles as the registry key.
datasets_dict = {
    subset: mmlu_medical[subset]
    for subset in ("anatomy", "clinical_kg", "college_biology")
}

# Batching / tokenization options read by `set_dataset_config`.
dataset_configuration = dict(
    batch_size=1,
    shuffle=True,
    return_tensors="pt",
    max_length=300,
    padding="max_length",
    truncation=True,
    column_to_tokenize="message",
)

data_helper = CustomDataHelper()

# NOTE(review): the prompt strings below contain typos ("Assitant", "and depth",
# "that is seems"). They are left untouched here because the prompt text is part of
# what the model sees; fix them deliberately and re-run the evaluation if desired.
data_helper.set_system_instruction("""
You are a USMLE Passed Medical Doctor and a Helpful AI-Assitant.
""")

data_helper.set_user_instruction("""
You will be presented queries in the format of Multiple Choice Question and Answer. 
Understand the question and depth and from the mentioned "Option" select the one that is 
seems to be the correct Answer. Do not provide any explanation and just return 
the answer as "Final Answer: (A/B/C/D)".
""")

# Wire up inputs: datasets, options, tokenizer.
data_helper.load_datasets_dict(datasets_dict)
data_helper.set_dataset_config(dataset_configuration)
data_helper.load_tokenizer(tokenizer)

# Stage 1 renders the chat prompt into 'message'; stage 2 tokenizes that column.
data_helper.format_datasets()
data_helper.tokenize_datasets()

In [None]:
from tqdm import tqdm 
from evaluate import load
from torch.utils.data import DataLoader

# Smoke test: wrap one tokenized split and run generation on its first batch only.
# The key carries both suffixes added by the helper: "_formatted" then "_tokenized".
demo_dataset = CustomDataset(data_helper.tokenized_datasets_dict['anatomy_formatted_tokenized'])
dataloader = DataLoader(demo_dataset, batch_size=10, shuffle=False)

# `model` is not created in this cell; it must already exist in the kernel.
model.eval()
# NOTE(review): `labels` and `all_logits` are initialised but never filled by the loop below.
labels, all_logits = [], []
for batch in tqdm(dataloader, desc="Evaluating"):
    # Move every field except the answer labels to the GPU; only these reach `generate`.
    inputs = {k:v.to("cuda") for k,v in batch.items() if k!="labels"}
    with torch.no_grad():
        outputs = model.generate(**inputs,
                                 max_new_tokens=300,
                                 pad_token_id=tokenizer.eos_token_id
                                )
#     sent = tokenizer.batch_decode(sequences=outputs, skip_special_tokens=True)
    # Skip the first `input_ids.shape[1]` positions of each output row before decoding.
    sent = tokenizer.batch_decode(sequences=outputs[:, inputs['input_ids'].shape[1]:], 
                                  skip_special_tokens=True)
    # NOTE(review): unconditional break - only the first batch is ever processed,
    # and `sent` is not displayed anywhere in this cell.
    break
    
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     all_logits.append(predictions)
#     labels.append(inputs['labels'])

# labels = torch.cat(labels, dim=0)
# all_logits = torch.cat(all_logits, dim=0)

# accuracy.compute(predictions = all_logits, references = labels)

## Evaluator

In [None]:
from tqdm import tqdm 
class Evaluator:
    """Evaluates every registered model on every registered dataset with forward passes.

    NOTE(review): a second ``class Evaluator`` is defined in a later cell of this
    notebook; once that cell runs, the name ``Evaluator`` refers to that class and
    this definition is no longer reachable by name.
    """

    def __init__(self):
        # Registries supplied through the load_* methods.
        self.models_dict = None
        self.metrics_dict = None
        self.datasets_dict = None

        # Device plus the model/dataset pair currently being evaluated.
        self.device = None
        self.current_model = None
        self.current_model_name = None
        self.current_dataset = None
        self.current_dataset_name = None
        
        # Results of the most recent evaluation run, read by `compute_metrics`.
        self.labels = None
        self.all_logits = None

# LOADING FUNCTIONS
    def load_models_dict(self, models_dict):
        """Store the name -> model mapping and return it."""
        self.models_dict = models_dict
        return self.models_dict
           
    def set_device(self, device):
        """Store the device inputs are moved to and return it."""
        self.device = device
        return self.device

    def load_datasets_dict(self, datasets_dict):
        """Store the name -> dataset mapping and return it."""
        self.datasets_dict = datasets_dict
        return self.datasets_dict
    
    def load_metrics_dict(self, metrics_dict):
        """Store the name -> metric mapping and return it."""
        self.metrics_dict = metrics_dict
        return self.metrics_dict
    
    
# METRICS COMPUTATION
    def compute_metrics(self):
        """Call ``compute`` on every metric and merge the returned dicts into one.

        Each metric receives ``self.all_logits`` as predictions and ``self.labels`` as
        references. Results are merged with ``dict.update``, so a later metric
        overwrites any key an earlier one already produced.
        """
        computed_metrics = {}
        for _, metric in self.metrics_dict.items():
            result = metric.compute(predictions = self.all_logits, references = self.labels)
            computed_metrics.update(result)
        return computed_metrics


    def evaluate_qbq(self):
        """Evaluate the current model on the current dataset one example at a time.

        Reads the 'input_ids', 'attention_mask', 'token_type_ids' and 'labels' columns
        by index, runs a forward pass per example, and stores the per-example argmax
        and label in ``self.all_logits`` / ``self.labels`` before computing metrics.
        Assumes the indexed column values are tensors (``.unsqueeze`` / ``.to`` are
        called on them) - TODO confirm the dataset's format.
        """
        self.current_model.eval()
        all_logits = []
        labels = []
        for index in tqdm(range(len(self.current_dataset)), desc=f"Evaluating {self.current_model_name} on {self.current_dataset_name}"):
            # unsqueeze(0) adds a leading dimension of size one to each example.
            input_ids = self.current_dataset['input_ids'][index].unsqueeze(0).to(self.device)
            attention_mask = self.current_dataset['attention_mask'][index].unsqueeze(0).to(self.device)  
            token_type_ids = self.current_dataset['token_type_ids'][index].unsqueeze(0).to(self.device)
            with torch.no_grad():
                outputs = self.current_model(input_ids = input_ids,
                                             attention_mask = attention_mask,
                                             token_type_ids = token_type_ids
                                            )
            logits = outputs.logits
            # Argmax over the last dimension, then take the first entry of the result.
            predictions = torch.argmax(logits, dim=-1)[0]
            label = self.current_dataset['labels'][index].to(self.device)
            
            all_logits.append(predictions)
            labels.append(label)
        self.all_logits = all_logits
        self.labels = labels
        
        evaluated_metrics = self.compute_metrics()
        return evaluated_metrics


    def evaluate_batch(self):
        """Evaluate the current model by iterating the current dataset batch by batch.

        Every value of each batch is moved to ``self.device`` and passed to the model;
        the raw ``outputs.logits`` of each batch are collected in ``self.all_logits``.
        NOTE(review): this method never assigns ``self.labels``, so `compute_metrics`
        uses whatever value it held before the call. It is also not called by
        `evaluate_models`, which uses `evaluate_qbq`.
        """
        self.current_model.eval()
        all_logits = []
        for batch in tqdm(self.current_dataset, desc="Evaluating"):
            inputs = {k:v.to(self.device) for k,v in batch.items()}
            with torch.no_grad():
                outputs = self.current_model(**inputs)
            logits = outputs.logits
            all_logits.append(logits)
        self.all_logits = all_logits        
        evaluated_metrics = self.compute_metrics()
        return evaluated_metrics


    def evaluate_models(self):
        """Run `evaluate_qbq` for every registered model; return ``{model_name: metrics}``."""
        evaluation_results = {}
        for model_name, model in self.models_dict.items():
            self.current_model = model
            self.current_model_name = model_name
            result = self.evaluate_qbq()
            evaluation_results.update({model_name: result})
        return evaluation_results


    def evaluate_datasets(self):
        """Run `evaluate_models` for every registered dataset.

        Returns ``{dataset_name: {model_name: metrics}}``.
        """
        evaluation_results = {}
        for dataset_name, dataset in self.datasets_dict.items():
            self.current_dataset = dataset
            self.current_dataset_name = dataset_name
            result = self.evaluate_models()
            evaluation_results.update({dataset_name: result})
        return evaluation_results
    

In [None]:
from tqdm import tqdm 
from itertools import chain
class Evaluator:
    """Runs every registered model over every registered dataloader.

    Two modes are supported, selected with `set_model_generation_type`:

    * ``"default"``  - a plain forward pass; ``outputs.logits`` is collected per batch.
    * ``"generate"`` - ``model.generate``; the newly generated tokens are decoded to
      text with the configured tokenizer and collected per batch.

    After each run the flattened outputs and labels are kept in
    ``logits_and_labels`` under ``"<model>-<dataset>-logits-labels"``.
    """

    def __init__(self):
        # Registries supplied through the load_* methods.
        self.models_dict = None
        self.metrics_dict = None
        self.datasets_dict = None

        # Device plus the model/dataset pair currently being evaluated.
        self.device = None
        self.current_model = None
        self.current_model_name = None
        self.current_dataset = None
        self.current_dataset_name = None

        # Outputs / references of the most recent run, and the archive of all runs.
        self.all_labels = None
        self.all_logits = None
        self.logits_and_labels = {}

        self.model_generation_type = None

        # Generation configuration (only used in "generate" mode).
        self.tokenizer = None
        self.max_new_tokens = None
        self.pad_token_id = None

    # ---- loading --------------------------------------------------------
    def load_models_dict(self, models_dict):
        """Store the name -> model mapping and return it."""
        self.models_dict = models_dict
        return self.models_dict

    def set_device(self, device):
        """Store the device inputs are moved to and return it."""
        self.device = device
        return self.device

    def load_datasets_dict(self, datasets_dict):
        """Store the name -> dataloader mapping and return it."""
        self.datasets_dict = datasets_dict
        return self.datasets_dict

    def load_metrics_dict(self, metrics_dict):
        """Store the name -> metric mapping (may be None/empty) and return it."""
        self.metrics_dict = metrics_dict
        return self.metrics_dict

    def set_model_generation_type(self, model_generation_type):
        """Select ``"default"`` or ``"generate"`` mode and return the stored value."""
        self.model_generation_type = model_generation_type
        return self.model_generation_type

    def set_generation_config(self, generation_config):
        """Read 'tokenizer', 'max_new_tokens' and 'pad_token_id' from `generation_config`.

        Raises:
            KeyError: if any of the three keys is missing.
        """
        self.tokenizer = generation_config['tokenizer']
        self.max_new_tokens = generation_config['max_new_tokens']
        self.pad_token_id = generation_config['pad_token_id']

    # ---- metrics --------------------------------------------------------
    def compute_metrics(self):
        """Call ``compute`` on every registered metric and merge the returned dicts.

        Predictions are ``self.all_logits`` and references are ``self.all_labels``
        (the attributes `evaluate` fills in). Returns None when no metrics are
        registered.
        """
        if not self.metrics_dict:
            return None

        computed_metrics = {}
        for metric in self.metrics_dict.values():
            # References live in `all_labels`; this class never defines `self.labels`.
            result = metric.compute(predictions=self.all_logits, references=self.all_labels)
            computed_metrics.update(result)
        return computed_metrics

    # ---- post-processing ------------------------------------------------
    def process_logits_labels(self):
        """Flatten the per-batch outputs/labels one level and archive them.

        The flattened lists replace ``self.all_logits`` / ``self.all_labels`` and are
        also stored in ``self.logits_and_labels`` under
        ``"<current_model_name>-<current_dataset_name>-logits-labels"``.
        """
        self.all_logits = list(chain.from_iterable(self.all_logits))
        self.all_labels = list(chain.from_iterable(self.all_labels))

        logits_labels_dict_name = self.current_model_name+"-"+self.current_dataset_name+"-logits-labels"
        self.logits_and_labels[logits_labels_dict_name] = {
            "logits": self.all_logits,
            "labels": self.all_labels,
        }

    # ---- evaluation -----------------------------------------------------
    def evaluate(self):
        """Evaluate the current model on the current dataloader.

        Every batch field except 'labels' is moved to ``self.device`` and given to the
        model; 'labels' is kept as the reference. Returns the result of
        `compute_metrics` (None when no metrics are registered).

        Raises:
            Exception: if the generation type is neither "default" nor "generate".
        """
        all_labels, all_logits = [], []

        self.current_model.eval()
        for batch in tqdm(self.current_dataset, desc=f"Evaluating {self.current_model_name} on {self.current_dataset_name}"):
            inputs = {k: v.to(self.device) for k, v in batch.items() if k != 'labels'}

            with torch.no_grad():
                if self.model_generation_type == "default":
                    outputs = self.current_model(**inputs)
                    logits = outputs.logits
                elif self.model_generation_type == "generate":
                    outputs = self.current_model.generate(**inputs,
                                                          max_new_tokens=self.max_new_tokens,
                                                          pad_token_id=self.pad_token_id)
                    # Skip the first `input_ids.shape[1]` positions of each output row
                    # so only the continuation is decoded.
                    prompt_length = inputs['input_ids'].shape[1]
                    logits = self.tokenizer.batch_decode(sequences=outputs[:, prompt_length:],
                                                         skip_special_tokens=True)
                else:
                    raise Exception("Not a Valid Model Generation Type. Please set a valid generation type.")

            all_logits.append(logits)
            all_labels.append(batch['labels'])

        self.all_logits = all_logits
        self.all_labels = all_labels

        # Flatten and archive before scoring.
        self.process_logits_labels()

        return self.compute_metrics()

    def evaluate_models(self):
        """Run `evaluate` for every registered model; return ``{model_name: metrics}``.

        Raises:
            Exception: if no models were registered.
        """
        if not self.models_dict:
            raise Exception("No Models were provided. Please Provide a Model")
        evaluation_results = {}
        for model_name, model in self.models_dict.items():
            self.current_model = model
            self.current_model_name = model_name
            evaluation_results[model_name] = self.evaluate()
        return evaluation_results

    def evaluate_datasets(self):
        """Run `evaluate_models` for every registered dataloader.

        Returns ``{dataset_name: {model_name: metrics}}``.

        Raises:
            Exception: if no datasets were registered.
        """
        if not self.datasets_dict:
            raise Exception("No Datasets were provided. Please Provide a Dataset")
        evaluation_results = {}
        for dataset_name, dataset in self.datasets_dict.items():
            self.current_dataset = dataset
            self.current_dataset_name = dataset_name
            evaluation_results[dataset_name] = self.evaluate_models()
        return evaluation_results
    

In [None]:
# LOADING MODELS
# AutoModelForSequenceClassification is imported here because this cell uses it
# and no earlier import brings it into scope.
from transformers import AutoModelForSequenceClassification, set_seed
from peft import PeftConfig, PeftModelForSequenceClassification

# NOTE(review): the two BERT classifiers and the three metrics loaded in this cell
# are not referenced by the following cells shown in this notebook (which register
# only `model` and set `metrics_dict = None`) - confirm whether this cell is still needed.
base_model_id = "bert-base-uncased"
fine_tuned_model_id = "/kaggle/working/test-model-5/"
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_id).to("cuda")
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_id).to("cuda")

# LOADING METRICS
from evaluate import load
accuracy = load("accuracy")
f1 = load("f1")
recall = load("recall")


In [None]:
# Models to evaluate, keyed by display name (`model` comes from an earlier cell).
models_dict = dict(llama=model)

# No metrics are registered for this run.
metrics_dict = None

# Two run names that both point at the same `dataloader` object.
dataloader_dict = {run_name: dataloader for run_name in ("demo-1", "demo-2")}

In [None]:
# Register the models, metrics and dataloaders prepared in the previous cell.
evaluator = Evaluator()
evaluator.load_models_dict(models_dict)
evaluator.load_metrics_dict(metrics_dict)
evaluator.load_datasets_dict(dataloader_dict)

# Run on the GPU in text-generation mode.
evaluator.set_device("cuda")
evaluator.set_model_generation_type("generate")
# The generation config must provide exactly these three keys.
evaluator.set_generation_config({
    'tokenizer':tokenizer,
    'max_new_tokens':300,
    'pad_token_id': tokenizer.eos_token_id
})

# Evaluates every model on every dataloader; the return value is shown as the cell output.
evaluator.evaluate_datasets()

In [None]:
# The Evaluator stores its per-run outputs in `logits_and_labels`.
evaluator.logits_and_labels