## API KEYS

In [6]:
import os
import wandb
from huggingface_hub import HfApi
from kaggle_secrets import UserSecretsClient

# LOADING API-KEYS
# Tokens are read from the Kaggle secrets client rather than written into the notebook.
user_secrets = UserSecretsClient()

# HUGGINGFACE LOGIN
# `hugging_face_token` is reused by later cells (model downloads, Hub pushes).
hugging_face_token = user_secrets.get_secret("HUGGING_FACE_API_KEY")
api = HfApi(token=hugging_face_token)

# WANDB LOGIN
wandb_api_token = user_secrets.get_secret("WANDB_API_KEY")
wandb.login(key = wandb_api_token)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## LOADING MODEL AND TOKENIZERS

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the Llama 3.1 8B Instruct tokenizer and causal-LM weights (float16),
# authenticating with the Hugging Face token read in the API-keys cell.
# Disabled here: tokenizer.pad_token = tokenizer.eos_token
checkpoint = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=hugging_face_token)
model = AutoModelForCausalLM.from_pretrained(checkpoint, 
                                             device_map="auto",
                                             torch_dtype=torch.float16,
                                             token=hugging_face_token,
                                            )

In [None]:
from transformers import TextStreamer
# Text streamer bound to the notebook tokenizer; configured to leave out the
# prompt and special tokens from what it emits.
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

In [None]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from tqdm import tqdm

# Zero-shot PubMedQA evaluation: prompt the model with each question and check
# whether the gold decision appears in the text the model generates.

# Load dataset
dataset = load_dataset("pubmed_qa", "pqa_labeled")
# The "train" split of the labeled configuration is what gets evaluated here.
test_set = dataset["train"]

# Load model and tokenizer
# NOTE(review): this reloads the same checkpoint an earlier cell already loaded.
checkpoint = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=hugging_face_token)
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             device_map="auto",
                                             torch_dtype=torch.float16,
                                             token=hugging_face_token)
tokenizer.pad_token = tokenizer.eos_token
# Batched generation with a decoder-only model needs LEFT padding: with right
# padding the model would continue from pad tokens instead of from the prompt.
tokenizer.padding_side = "left"
device = next(model.parameters()).device


def collate_fn(batch):
    """Build the tokenized prompts and collect the gold answers for one batch.

    Args:
        batch: list of dataset rows, each with 'question' and 'final_decision'.

    Returns:
        dict with 'inputs' (padded tokenizer output as PyTorch tensors) and
        'true_answers' (list of gold decision strings, in batch order).
    """
    input_texts = [f"Question: {item['question']}\nAnswer:" for item in batch]
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    return {
        'inputs': inputs,
        'true_answers': [item['final_decision'] for item in batch]
    }


batch_size = 8  # Adjust based on your GPU memory
dataloader = DataLoader(test_set, batch_size=batch_size, collate_fn=collate_fn)

# Evaluation loop
results = []
total_batches = len(dataloader)

model.eval()
for batch in tqdm(dataloader, total=total_batches, desc="Evaluating"):
    inputs = {k: v.to(device) for k, v in batch['inputs'].items()}
    # All prompts in the batch are padded to this length, so everything after
    # it in the generated sequences is newly generated text.
    prompt_length = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs,
                                 max_length=1024,
                                 pad_token_id=tokenizer.pad_token_id)

    # Decode only the continuation. Decoding the full sequence would include
    # the question, and the gold label could then match text in the prompt.
    generated_answers = tokenizer.batch_decode(outputs[:, prompt_length:],
                                               skip_special_tokens=True)

    results.extend(
        {"true_answer": true_answer, "generated_answer": generated_answer}
        for true_answer, generated_answer in zip(batch['true_answers'], generated_answers)
    )

# Calculate metrics
# Case-insensitive substring match, so an answer such as "Yes." still counts.
correct = sum(
    1 for r in results
    if r["true_answer"].lower() in r["generated_answer"].lower()
)
accuracy = correct / len(results) if results else 0.0
print(f"Accuracy: {accuracy:.2f}")

## DATASET HANDLING

In [None]:
# from datasets import load_dataset
class DataHelper:
    """Format a dataset into chat-template prompts and tokenize it.

    Typical flow: construct with a tokenizer and the column names, call
    `format_dataset` to add a 'training_input' prompt column, then
    `tokenize_dataset` to tokenize the configured column.

    Args:
        tokenizer: tokenizer providing `apply_chat_template`, `eos_token` and
            `pad_token`. When omitted, a module-level `tokenizer` is used if
            one exists (the behaviour of the zero-argument constructor).
        user_query_column: name of the column holding the user's question.
        columns_to_tokenize: name of the column passed to the tokenizer.
    """

    def __init__(self, tokenizer=None, user_query_column=None, columns_to_tokenize=None):
        if tokenizer is None:
            # Fall back to the notebook-level tokenizer for `DataHelper()` calls.
            tokenizer = globals().get("tokenizer")

        self.dataset = None
        self.tokenizer = tokenizer
        if self.tokenizer is not None:
            # Reuse EOS as the padding token so padded batches can be built.
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.system_instruction = "You are a Helpful AI Assistant."
        self.user_instruction = "Please answer the following Question: "
        self.user_query = None

        # Config Columns
        self.user_query_column = user_query_column
        self.columns_to_tokenize = columns_to_tokenize

    # DATASETS CLASS
    def load_dataset(self, dataset):
        """Store `dataset` on the helper and return it."""
        self.dataset = dataset
        return self.dataset

    def __len__(self):
        """Return `len()` of the stored dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors with a leading batch axis.

        'token_type_ids' is None when the stored dataset has no such column.
        """
        # A `datasets.Dataset` lists its columns in `column_names`; a plain
        # mapping supports `in` directly.
        columns = getattr(self.dataset, "column_names", None)
        if columns is None:
            columns = self.dataset

        token_type_ids = None
        if 'token_type_ids' in columns:
            token_type_ids = self.dataset['token_type_ids'][idx].unsqueeze(0)

        return {
            'input_ids': self.dataset['input_ids'][idx].unsqueeze(0),
            'attention_mask': self.dataset['attention_mask'][idx].unsqueeze(0),
            'token_type_ids': token_type_ids,
        }

    # LOADING IMPORTANT COLUMNS
    def load_config_columns(self, columns_dictionary):
        """Read 'user_query_column' and 'columns_to_tokenize' from a dict."""
        self.user_query_column = columns_dictionary["user_query_column"]
        self.columns_to_tokenize = columns_dictionary["columns_to_tokenize"]

    # SYSTEM & USER PROMPT
    def set_system_instruction(self, system_instruction):
        """Set and return the system message used in the chat template."""
        self.system_instruction = system_instruction
        return self.system_instruction

    def set_user_instruction(self, user_instruction):
        """Set and return the text placed before every user query."""
        self.user_instruction = user_instruction
        return self.user_instruction

    def set_user_query(self, user_query):
        """Set and return the query used by `convert_input_to_chat_template`."""
        self.user_query = user_query
        return self.user_query

    # HANDLING INPUT COLUMN
    def handle_input_columns(self):
        """Placeholder; not implemented."""
        pass

    # LOADING TOKENIZER
    def load_tokenizer(self, tokenizer):
        """Replace the tokenizer and return it (the pad token is left as is)."""
        self.tokenizer = tokenizer
        return self.tokenizer

    # FORMATTING DATASET CODE
    def convert_input_to_chat_template(self):
        """Render the system message plus the current user query as one prompt string."""
        message = [
            {"role": "system", "content": self.system_instruction},
            {"role": "user", "content": self.user_instruction + self.user_query}
        ]
        # tokenize=False returns text, so no tensor type is requested.
        return self.tokenizer.apply_chat_template(message,
                                                  tokenize=False,
                                                  add_generation_prompt=True)

    def create_chat_template_dataset(self, example):
        """Add a 'training_input' prompt built from the example's query column."""
        self.user_query = example[self.user_query_column]
        example['training_input'] = self.convert_input_to_chat_template()
        return example

    def format_dataset(self, dataset):
        """Map `create_chat_template_dataset` over `dataset` and return the result."""
        return dataset.map(self.create_chat_template_dataset)

    # TOKENIZATION CODE
    def tokenization_function(self, example):
        """Tokenize the configured column, padded and truncated to at most 1024 tokens."""
        return self.tokenizer(example[self.columns_to_tokenize],
                              padding=True,
                              max_length=1024,
                              truncation=True,
                              return_tensors="pt"
                              )

    def tokenize_dataset(self, dataset, batch_size=128, num_proc=8):
        """Tokenize `dataset` in batches.

        Args:
            dataset: dataset exposing `.map`.
            batch_size: rows per tokenization call (default 128).
            num_proc: worker processes used by `.map` (default 8).
        """
        return dataset.map(self.tokenization_function,
                           batched=True,
                           batch_size=batch_size,
                           num_proc=num_proc)

    def clean_up_dataset(self, base_dataset, tokenized_dataset):
        """Drop the original columns and expose the tokenizer outputs as tensors.

        Args:
            base_dataset: dataset before tokenization; its columns are removed.
            tokenized_dataset: dataset after tokenization.
        """
        base_dataset_columns = list(base_dataset.features.keys())
        # Columns added by tokenization, kept in their original order.
        final_columns = [
            column for column in tokenized_dataset.features.keys()
            if column not in base_dataset_columns
        ]

        final_dataset = tokenized_dataset.remove_columns(base_dataset_columns)
        final_dataset.set_format(type='pt', columns=final_columns, output_all_columns=True)
        return final_dataset

In [None]:
# LOADING DATASET
from datasets import load_dataset
# Download the MedQA split of Medical Meadow; the bare `dataset` on the last
# line shows the returned object as the cell output.
dataset = load_dataset("medalpaca/medical_meadow_medqa")
dataset

In [None]:
# Extracting Datasets
# NOTE(review): assumes the loaded dataset exposes 'train', 'test' and
# 'validation' splits — confirm against the cell output above.
train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']
# Kept in train/test/validation order; later cells unpack in the same order.
datasets = [train_dataset, test_dataset, validation_dataset]

In [None]:
# Build the helper: prompts come from the "input" column and the formatted
# "training_input" column is what gets tokenized.
# NOTE(review): requires a DataHelper constructor that accepts these keyword arguments.
datahelper = DataHelper(tokenizer=tokenizer, user_query_column = "input", columns_to_tokenize="training_input")

# Apply the chat template to every split, then tokenize every formatted split.
formatted_datasets = [datahelper.format_dataset(dataset) for dataset in datasets]
tokenized_datasets = [datahelper.tokenize_dataset(dataset) for dataset in formatted_datasets]

# Rebind the split names to their tokenized versions (same order as `datasets`).
train_dataset, test_dataset, validation_dataset = tokenized_datasets

In [None]:
# tokenized_datasets
# datahelper.clean_up_dataset(base_dataset=dataset['train'], tokenized_dataset=tokenized_dataset)
# train_dataset = dataset['train'].map(preprocessing, batched=True, batch_size=32)
# train_dataset = train_dataset.remove_columns(['input', 'instruction', 'output', 'final_text'])
# train_dataset.set_format(type='pt', columns=['input_ids', 'attention_mask'], output_all_columns=True)

In [None]:
# Generate model outputs for the tokenized training split, eight rows per batch.
# The batched view is built once and reused for both the progress-bar length
# and the iteration (previously `train_dataset.batch(...)` was computed twice).
batched_train_dataset = train_dataset.batch(batch_size=8)

for batch in tqdm(batched_train_dataset, desc="Generating responses", unit="batch"):
    # Only tensor-valued columns are forwarded to the model.
    # NOTE(review): this yields an empty dict unless the dataset format returns
    # torch tensors — confirm `set_format` was applied before running.
    batch = {k: v.to("cuda:0") for k, v in batch.items() if isinstance(v, torch.Tensor)}

    with torch.no_grad():
        output = model.generate(**batch, max_length=1024)
    # Raw token ids are stored; decode with tokenizer.decode(...) when text is needed.
    results.append(output)

## Trainer Code

In [None]:
import torch
from transformers import Trainer, TrainingArguments

# Scratch Trainer setup: one epoch, checkpoints under ./results, at most one
# checkpoint kept on disk.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=28,
    save_total_limit=1,
    eval_strategy="steps",
    save_strategy="epoch",
    # NOTE(review): save_steps presumably has no effect while save_strategy is "epoch" — confirm.
    save_steps = 100,
    # label_names = ['not_equivalent', 'equivalent'],
    fp16=torch.cuda.is_available()  # Use mixed precision if GPUs support it
)

# Initialize Trainer
# NOTE(review): `data_collator` and `compute_metrics` are not defined at notebook
# level in the cells above, and `dataset` here is the raw (untokenized) dataset —
# confirm before running this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

In [3]:
!pip install evaluate peft transformers==4.42.0

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers==4.42.0
  Downloading transformers-4.42.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m690.4 kB/s[0m eta [36m0:00:00[0m:--:--[0m
Downloading transformers-4.42.0-py3-none-any.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, peft, evalua

In [7]:
import os
import torch
import warnings
import numpy as np
from time import time
from evaluate import load
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from peft.utils import get_peft_model_state_dict
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification

    
# Silence RuntimeWarnings and turn off tokenizer parallelism via the
# TOKENIZERS_PARALLELISM environment variable (DataLoader workers are used below).
warnings.filterwarnings("ignore", category=RuntimeWarning)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Fix the random seed so runs are repeatable.
from transformers import set_seed
set_seed(42)

def main():
    """Fine-tune `bert-base-uncased` on GLUE MRPC and save the final weights.

    Intended to be started through `accelerate.notebook_launcher`, so each
    launched process runs this function and builds its own model, tokenizer,
    dataset and `Trainer`. Checkpoints are written to
    /kaggle/working/glue_model_checkpointing_test-8 and pushed to the Hub on
    every save; the final model is written to /kaggle/working/test-model-5.

    Relies on the notebook-level `hugging_face_token` for Hub access.
    """

    def compute_metrics(eval_pred):
        """Convert (logits, labels) into accuracy with the metric loaded below."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return accuracy.compute(predictions=predictions, references=labels)

    def encode(examples):
        """Tokenize a batch of sentence pairs to 128 tokens and attach the labels."""
        output = tokenizer(examples['sentence1'],
                           examples['sentence2'],
                           truncation=True,
                           padding='max_length',
                           max_length=128,
                           )
        output['labels'] = examples['label']
        return output

    # MODEL
    checkpoint = "bert-base-uncased"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                               # torch_dtype=torch.float16,
                                                               )

    # Optional LoRA setup, currently disabled:
    #     peft_config = LoraConfig(inference_mode=False,
    #                              r=32,
    #                              lora_alpha = 512,
    #                              lora_dropout = 0.1,
    #                              bias="none",
    #                              peft_type = TaskType.SEQ_CLS, #" CAUSAL_LM"
    #                              )
    #     model = prepare_model_for_kbit_training(model)
    #     model = get_peft_model(model, peft_config)
    #     model.print_trainable_parameters()

    # TYPICAL TRAINING CODE
    accuracy = load("accuracy")
    # No pad-token override: the BERT tokenizer already defines [PAD] and has no
    # EOS token, so assigning `eos_token` to `pad_token` would only unset it.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="right")

    # DATASET PREP
    dataset = load_dataset("glue", "mrpc")
    dataset = dataset.map(encode, batched=True)
    dataset = dataset.remove_columns(['sentence1', 'sentence2', 'label', 'idx'])
    dataset.set_format(type='pt', columns=['input_ids', 'attention_mask', 'labels'], output_all_columns=True)
    data_collator = DataCollatorWithPadding(tokenizer)

    # TRAINING ARGUMENTS
    training_args = TrainingArguments(
        # DIRECTORIES FOR SAVING AND LOGGING
        output_dir="/kaggle/working/glue_model_checkpointing_test-8",
        logging_dir="/kaggle/working/logs",

        # BASIC PARAMS
        num_train_epochs=5,
        fp16=True,
        seed=42,
        data_seed=42,

        # OPTIMIZER SETUP
        optim="rmsprop",
        learning_rate=1e-4,
        lr_scheduler_type="cosine",
        # lr_scheduler_kwargs={"power": 2.0},
        warmup_ratio=0.2,
        # warmup_steps=200,

        # DATA RELATED ARGUMENTS
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        dataloader_num_workers=4,  # Author's note: reduces training time noticeably
        dataloader_pin_memory=True,
        dataloader_persistent_workers=True,
        ddp_find_unused_parameters=False,

        # LOGGING
        logging_strategy="epoch",  # Logs the training loss once per epoch
        # Author's note: with PEFT enabled this is what makes validation loss and
        # accuracy get logged; without PEFT it changes nothing.
        label_names=['labels'],
        # report_to = tensorboard

        # EVALUATION
        eval_strategy="epoch",  # Runs evaluation at the end of every epoch
        # eval_steps

        # SAVING TO HUB
        save_strategy="epoch",
        save_total_limit=1,
        push_to_hub=True,
        hub_token=hugging_face_token,
        hub_strategy="every_save",
        hub_model_id="nnilayy/glue_model_checkpointing_test-8",

        # SAVING VRAM (disabled)
        #     torch_empty_cache_steps=40, # Clears vram cache during training after a few steps
        #     gradient_checkpointing=True,
        #     gradient_accumulation_steps=4,
    )

    # TRAINER CONSTRUCTOR
    # NOTE(review): evaluation during training uses the GLUE "test" split, not "validation".
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Pushing Tokenizer, Model Card, Label Mapping to Hub (disabled)
    #     tokenizer.push_to_hub("nnilayy/glue_model_checkpointing_test-8")
    #     model.config.label2id = {'equivalent': 0, 'not_equivalent': 1}
    #     model.config.id2label = {0: 'equivalent', 1: 'not_equivalent'}
    #     model.config.push_to_hub("nnilayy/glue_model_checkpointing_test-8")

    trainer.train()
    # Every launched process reaches this line; only the main one writes, so two
    # processes never write the same files at the same time.
    if trainer.is_world_process_zero():
        model.save_pretrained("/kaggle/working/test-model-5")

# Run `main` in two processes from inside the notebook; the captured output
# below reports "Launching training on 2 GPUs".
if __name__ == "__main__":
    from accelerate import notebook_launcher
    notebook_launcher(main, num_processes=2)

Launching training on 2 GPUs.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mnnilayy[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5674,0.484929,0.78087
2,0.3649,0.445038,0.826667
3,0.1491,0.551907,0.830725
4,0.057,0.727788,0.838261
5,0.0242,0.696037,0.845797


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5674,0.484929,0.78087
2,0.3649,0.445038,0.826667
3,0.1491,0.551907,0.830725
4,0.057,0.727788,0.838261
5,0.0242,0.696037,0.845797


No files have been modified since last commit. Skipping to prevent empty commit.


In [11]:
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification
from datasets import load_dataset
# Notebook-level BERT tokenizer, configured to pad on the right; `encode` below reads it.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="right")

def encode(examples):
    """Tokenize a batch of MRPC sentence pairs and carry the labels along.

    Args:
        examples: batch with 'sentence1', 'sentence2' and 'label' columns.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens, PyTorch tensors)
        with the batch labels stored under 'labels'.
    """
    sentence_pair = (examples['sentence1'], examples['sentence2'])
    tokenizer_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": 128,
        "return_tensors": "pt",
    }
    encoded = tokenizer(*sentence_pair, **tokenizer_options)
    encoded['labels'] = examples['label']
    return encoded

# Encode the GLUE MRPC "test" split (bound to the name `val_dataset`), drop the
# raw text/label columns and return the model inputs as PyTorch tensors.
dataset = load_dataset("glue", "mrpc")
val_dataset = dataset['test']
val_dataset = val_dataset.map(encode, batched=True) 
val_dataset = val_dataset.remove_columns(['sentence1', 'sentence2', 'label', 'idx'])
val_dataset.set_format(type='pt', columns=['input_ids', 'attention_mask', "token_type_ids",'labels',], output_all_columns=True)
# Shown as the cell output.
val_dataset

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1725
})

In [12]:
# Alias the encoded split under the name the evaluation cells use.
# Alternative binding, currently disabled: validation_dataset = val_dataset
test_dataset = val_dataset

In [13]:
from transformers import set_seed
from peft import PeftConfig, PeftModelForSequenceClassification

# set_seed(42)

# Load the untouched base checkpoint and the locally saved fine-tuned model onto the GPU.
# The base model's classifier head is newly initialised (see the warning in the
# cell output), so its scores are only a reference point.
base_model_id = "bert-base-uncased"
fine_tuned_model_id = "/kaggle/working/test-model-5/"
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_id).to("cuda")
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_id).to("cuda")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from evaluate.visualization import radar_plot


In [None]:
class Evaluator:
    """Score several classification models on one dataset with one metric.

    Load the models with `load_models_dict`, the data with
    `load_dataset_to_evaluate`, the metric with `load_metric`, optionally pick
    a device with `set_device`, then call `evaluate_models`.
    """

    def __init__(self):
        self.dataset = None
        self.models_dict = None
        self.device = None
        self.metric = None
        self.current_model = None

    def load_models_dict(self, models_dict):
        """Store and return the {name: model} mapping to evaluate."""
        self.models_dict = models_dict
        return self.models_dict

    def set_device(self, device):
        """Store and return the device inputs are moved to."""
        self.device = device
        return self.device

    def load_dataset_to_evaluate(self, dataset):
        """Store and return the dataset (or iterable of batches) to evaluate on."""
        self.dataset = dataset
        return self.dataset

    def load_metric(self, metric):
        """Store and return the metric object (needs `add` and `compute`)."""
        self.metric = metric
        return self.metric

    def compute_metrics(self):
        """Placeholder; not implemented."""
        pass

    def _target_device(self):
        """Return the configured device, defaulting to "cuda" when none was set."""
        return self.device if self.device is not None else "cuda"

    def evaluate_qbq(self):
        """Evaluate `self.current_model` one example at a time.

        Expects `self.dataset` to support `len()` and column access for
        'input_ids', 'attention_mask' and 'labels'.

        Returns:
            Whatever `self.metric.compute()` returns.
        """
        device = self._target_device()
        self.current_model.eval()
        for index in tqdm(range(len(self.dataset)), desc="Evaluating"):
            input_ids = self.dataset['input_ids'][index].unsqueeze(0).to(device)
            attention_mask = self.dataset['attention_mask'][index].unsqueeze(0).to(device)
            with torch.no_grad():
                outputs = self.current_model(input_ids=input_ids,
                                             attention_mask=attention_mask)
            prediction = torch.argmax(outputs.logits, dim=-1)[0]
            # Plain ints keep the metric input independent of tensor shape/device.
            self.metric.add(predictions=int(prediction),
                            references=int(self.dataset['labels'][index]))

        return self.metric.compute()

    def evaluate_batch(self):
        """Evaluate `self.current_model` over batches.

        Expects `self.dataset` to yield dict batches of tensors that include a
        'labels' entry (for example a DataLoader).

        Returns:
            Whatever `self.metric.compute()` returns.
        """
        device = self._target_device()
        self.current_model.eval()
        for batch in tqdm(self.dataset, desc="Evaluating"):
            inputs = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = self.current_model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)
            # `add` takes one example at a time, so the batch is fed pairwise.
            for prediction, reference in zip(predictions.tolist(), batch['labels'].tolist()):
                self.metric.add(predictions=prediction, references=reference)
        return self.metric.compute()

    def evaluate_models(self):
        """Run `evaluate_batch` for every loaded model.

        Returns:
            dict mapping each model name to its metric result.
        """
        evaluation_results = {}
        for model_name, model in self.models_dict.items():
            self.current_model = model
            evaluation_results[model_name] = self.evaluate_batch()
        return evaluation_results
    
    



In [14]:
from tqdm import tqdm 
class Evaluator:
    """Score every loaded model on every loaded dataset with every loaded metric.

    Load the three dictionaries (`load_models_dict`, `load_datasets_dict`,
    `load_metrics_dict`), set the device with `set_device`, then call
    `evaluate_datasets`. The result is nested as
    {dataset_name: {model_name: {metric_name: value}}}.
    """

    # Model-input columns forwarded to the model when the dataset has them.
    _MODEL_INPUT_COLUMNS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self):
        self.models_dict = None
        self.metrics_dict = None
        self.datasets_dict = None

        self.device = None
        self.current_model = None
        self.current_model_name = None
        self.current_dataset = None
        self.current_dataset_name = None

        # Filled by the evaluate_* methods. Despite its name, `all_logits`
        # holds predicted class ids (argmax of the logits), not raw logits.
        self.labels = None
        self.all_logits = None

    # LOADING FUNCTIONS
    def load_models_dict(self, models_dict):
        """Store and return the {name: model} mapping."""
        self.models_dict = models_dict
        return self.models_dict

    def set_device(self, device):
        """Store and return the device inputs are moved to."""
        self.device = device
        return self.device

    def load_datasets_dict(self, datasets_dict):
        """Store and return the {name: dataset} mapping."""
        self.datasets_dict = datasets_dict
        return self.datasets_dict

    def load_metrics_dict(self, metrics_dict):
        """Store and return the {name: metric} mapping."""
        self.metrics_dict = metrics_dict
        return self.metrics_dict

    # METRICS COMPUTATION
    def compute_metrics(self):
        """Run every metric on the stored predictions and labels.

        Returns:
            One dict merging the result dicts of all metrics.
        """
        computed_metrics = {}
        for metric in self.metrics_dict.values():
            result = metric.compute(predictions=self.all_logits, references=self.labels)
            computed_metrics.update(result)
        return computed_metrics

    def evaluate_qbq(self):
        """Evaluate the current model on the current dataset, one example at a time.

        Each row is fetched once by index; asking for a whole column on every
        iteration would re-read the full column each time.

        Returns:
            The merged metric results from `compute_metrics`.
        """
        self.current_model.eval()
        predictions = []
        labels = []
        for index in tqdm(range(len(self.current_dataset)), desc=f"Evaluating {self.current_model_name} on {self.current_dataset_name}"):
            row = self.current_dataset[index]
            inputs = {
                name: row[name].unsqueeze(0).to(self.device)
                for name in self._MODEL_INPUT_COLUMNS
                if name in row
            }
            with torch.no_grad():
                outputs = self.current_model(**inputs)
            predictions.append(torch.argmax(outputs.logits, dim=-1)[0])
            labels.append(row['labels'].to(self.device))
        self.all_logits = predictions
        self.labels = labels

        return self.compute_metrics()

    def evaluate_batch(self, batch_size=32):
        """Evaluate the current model on the current dataset in batches.

        Args:
            batch_size: batch size used when the current dataset is not
                already a DataLoader.

        Returns:
            The merged metric results from `compute_metrics`.
        """
        from torch.utils.data import DataLoader

        loader = self.current_dataset
        if not isinstance(loader, DataLoader):
            loader = DataLoader(loader, batch_size=batch_size)

        self.current_model.eval()
        predictions = []
        labels = []
        for batch in tqdm(loader, desc="Evaluating"):
            batch = {k: v.to(self.device) for k, v in batch.items()}
            batch_labels = batch.pop('labels')
            with torch.no_grad():
                outputs = self.current_model(**batch)
            predictions.append(torch.argmax(outputs.logits, dim=-1))
            labels.append(batch_labels)
        # One flat tensor each, so the metrics receive per-example values.
        self.all_logits = torch.cat(predictions, dim=0)
        self.labels = torch.cat(labels, dim=0)
        return self.compute_metrics()

    def evaluate_models(self):
        """Evaluate every loaded model on the current dataset.

        Returns:
            dict mapping model name to its metric results.
        """
        evaluation_results = {}
        for model_name, model in self.models_dict.items():
            self.current_model = model
            self.current_model_name = model_name
            evaluation_results[model_name] = self.evaluate_qbq()
        return evaluation_results

    def evaluate_datasets(self):
        """Evaluate every loaded model on every loaded dataset.

        Returns:
            dict mapping dataset name to the result of `evaluate_models`.
        """
        evaluation_results = {}
        for dataset_name, dataset in self.datasets_dict.items():
            self.current_dataset = dataset
            self.current_dataset_name = dataset_name
            evaluation_results[dataset_name] = self.evaluate_models()
        return evaluation_results
    

## DataLoader

In [62]:
from tqdm import tqdm 
from evaluate import load
from torch.utils.data import DataLoader

# Batched accuracy check of the fine-tuned model on `test_dataset`.
# Disabled alternative: test_dataset = DataHelper(dataset)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

accuracy = load("accuracy")

# Despite its name, `all_logits` collects predicted class ids, not raw logits.
labels, all_logits = [], []
for batch in tqdm(test_dataloader, desc="Evaluating"):
    # Every column of the batch, labels included, is moved to the GPU and
    # passed to the model.
    inputs = {k:v.to("cuda") for k,v in batch.items()}
    with torch.no_grad():
        outputs = fine_tuned_model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_logits.append(predictions)
    labels.append(inputs['labels'])

# Join the per-batch tensors into one tensor each before scoring.
labels = torch.cat(labels, dim=0)
all_logits = torch.cat(all_logits, dim=0)

accuracy.compute(predictions = all_logits, references = labels)

Evaluating: 100%|██████████| 54/54 [00:12<00:00,  4.34it/s]


{'accuracy': 0.8457971014492753}

In [47]:
import torch

# Scratch check of torch.cat on one-dimensional tensors.
# Create two one-dimensional tensors
tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([4, 5, 6])

# Concatenate the tensors along dim 0; the cell output shows the six values in order.
result_tensor = torch.cat([tensor1, tensor2], dim=0)
result_tensor

tensor([1, 2, 3, 4, 5, 6])

In [15]:
# LOADING MODELS
from transformers import set_seed
from peft import PeftConfig, PeftModelForSequenceClassification

# Base checkpoint vs. locally saved fine-tuned model, both placed on the GPU.
# The base model's classifier head is newly initialised (see the warning in the
# cell output), so its scores are only a reference point.
base_model_id = "bert-base-uncased"
fine_tuned_model_id = "/kaggle/working/test-model-5/"
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_id).to("cuda")
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_id).to("cuda")

# LOADING METRICS
# These three names are collected into `metrics_dict` in the next cell.
from evaluate import load
accuracy = load("accuracy")
f1 = load("f1")
recall = load("recall")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [17]:

# Models to compare; the keys become the model names in the result.
models_dict = {
    "fine_tuned_model": fine_tuned_model,
    "base_model": base_model,
}

# Metrics computed for every (dataset, model) pair.
metrics_dict = {
    "accuracy": accuracy,
    "f1": f1,
    "recall": recall,   
}

# Datasets to evaluate on; the training split is left out.
# NOTE(review): `validation_dataset` must already hold an encoded split with the
# same columns as `test_dataset` — confirm which cell last assigned it.
datasets_dict = {
#     "train_dataset": train_dataset,
    "test_dataset": test_dataset,
    "validation_dataset": validation_dataset,
}


# Wire everything into the evaluator and run the full dataset x model sweep.
evaluator = Evaluator()
evaluator.set_device("cuda")
evaluator.load_models_dict(models_dict)
evaluator.load_metrics_dict(metrics_dict)
evaluator.load_datasets_dict(datasets_dict)

result = evaluator.evaluate_datasets()
result

Evaluating fine_tuned_model on test_dataset: 100%|██████████| 1725/1725 [00:36<00:00, 47.75it/s]
Evaluating base_model on test_dataset: 100%|██████████| 1725/1725 [00:37<00:00, 45.86it/s]
Evaluating fine_tuned_model on validation_dataset: 100%|██████████| 408/408 [00:05<00:00, 81.19it/s]
Evaluating base_model on validation_dataset: 100%|██████████| 408/408 [00:04<00:00, 83.06it/s]


{'test_dataset': {'fine_tuned_model': {'accuracy': 0.8457971014492753,
   'f1': 0.888235294117647,
   'recall': 0.9215344376634699},
  'base_model': {'accuracy': 0.664927536231884,
   'f1': 0.7987465181058496,
   'recall': 1.0}},
 'validation_dataset': {'fine_tuned_model': {'accuracy': 0.8529411764705882,
   'f1': 0.894736842105263,
   'recall': 0.9139784946236559},
  'base_model': {'accuracy': 0.6838235294117647,
   'f1': 0.8122270742358079,
   'recall': 1.0}}}

In [57]:
from evaluate import load

# Recompute accuracy from what the evaluator stored during its last run.
accuracy = load("accuracy")

# NOTE(review): this treats each entry of `evaluator.all_logits` as a logits
# tensor and takes its argmax — confirm that matches what the Evaluator stores.
logits = []
for logit in evaluator.all_logits:
    prediction = torch.argmax(logit, dim=-1)[0]
    logits.append(prediction)
# Debug views, currently disabled:
# # logits
# evaluator.labels

accuracy.compute(predictions=logits, references=evaluator.labels)

{'accuracy': 0.8529411764705882}

In [None]:
# test_dict = {
#     "param-1":1,
#     "param-2":2,
#     "param-3":3,
#     "param-4":4,
#     }
# for k, v in test_dict.items():
#     print(k,v)

def evaluate(num):
    """Return `num` plus one (stand-in for a real evaluation in this scratch cell)."""
    incremented = num + 1
    return incremented

# Scratch check of the "collect one result per model name" pattern, using ints
# in place of real models.
models = {"model-1":1, 
          "model-2":2, 
          "model-3":3,
          }
evaluation_results={}
    
# NOTE: the loop variable `model` rebinds the notebook-level name `model`.
for models_name, model in models.items():
    result = evaluate(model)  
    evaluation_results.update({models_name:result})
evaluation_results

In [None]:
# Scratch stand-in for a metrics mapping, with ints in place of metric objects.
# NOTE: this rebinds the notebook-level name `metrics_dict`.
metrics_dict = {
    "accuracy":1,
    "f-1 score":2,
    "recall":3,
}

def evaluate(num):
    """Return `num` plus two (stand-in for a real metric computation in this scratch cell)."""
    shifted = num + 2
    return shifted

# Scratch check of the "collect one result per metric name" pattern.
computed_metrics={}
for metric_name, metric in metrics_dict.items():
    result = evaluate(metric)
    computed_metrics.update({metric_name: result})
computed_metrics

In [None]:
from tqdm import tqdm
import evaluate
import torch
# Per-example accuracy of the fine-tuned model on `val_dataset`.
metric = evaluate.load("accuracy")
model = fine_tuned_model

model.eval()
for index in tqdm(range(len(val_dataset)), desc="Evaluating"):
    # Fetch the row once. Indexing `val_dataset['column'][index]` would read the
    # whole column on every iteration.
    row = val_dataset[index]
    model_inputs = {
        name: row[name].unsqueeze(0).to("cuda")
        for name in ("input_ids", "attention_mask", "token_type_ids")
    }
    with torch.no_grad():
        outputs = model(**model_inputs)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # One prediction and one label per call, both as plain ints.
    metric.add(predictions=predictions.item(), references=row['labels'].item())

metric.compute()

## Evaluate Code

In [None]:
from tqdm import tqdm
import torch
import logging

class Evaluate:
    """Generate text responses from a causal language model.

    Args:
        tokenizer: tokenizer matching `model`.
        model: model exposing `generate` and `generation_config`.
    """

    def __init__(self, tokenizer, model):
        # Only errors from transformers are logged from here on.
        logging.getLogger("transformers").setLevel(logging.ERROR)

        self.tokenizer = tokenizer
        self.model = model
        self.streamer = None
        self.model.generation_config.pad_token_id = self.tokenizer.pad_token_id
        self.model.generation_config.eos_token_id = self.tokenizer.eos_token_id

    def _generate_response(self, question, return_prompt=False, streamer=None):
        """Sample one response to `question` and decode it to text.

        Shared by the single-question and per-question dataset paths so both
        use the same generation settings.
        """
        # Put the inputs on the model's own device; fall back to "cuda" for
        # models that do not expose `.device`.
        device = getattr(self.model, "device", "cuda")
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            # Tokenizers without a pad token pad with EOS instead.
            pad_token_id = self.tokenizer.eos_token_id

        inputs = self.tokenizer(question, return_tensors="pt").to(device)
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=1000,
                return_dict_in_generate=True,
                temperature=0.5,
                do_sample=True,
                top_k=50,
                num_beams=1,
                early_stopping=False,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=pad_token_id,
                streamer=streamer,
            )

        sequence = output.sequences[0]
        if not return_prompt:
            # Keep only the tokens generated after the prompt.
            sequence = sequence[inputs.input_ids.shape[1]:]
        return self.tokenizer.decode(sequence, skip_special_tokens=True)

    def single_question_evaluate(self, question, return_prompt=False, stream_response=False):
        """Generate a response for one prompt.

        Args:
            question: prompt text.
            return_prompt: include the prompt text in the returned string.
            stream_response: pass the loaded streamer to generation.

        Returns:
            The decoded response string.
        """
        self.model.eval()
        streamer = self.streamer if stream_response else None
        return self._generate_response(question, return_prompt=return_prompt, streamer=streamer)

    def load_streamer(self, streamer):
        """Store and return the streamer used when `stream_response=True`."""
        self.streamer = streamer
        return self.streamer

    def batch_evaluate(self, dataset, batch_size, return_prompt=False):
        """Placeholder; not implemented."""
        pass

    def qbq_evaluate(self, dataset, return_prompt=False):
        """Generate a response for every prompt in `dataset['training_input']`.

        Args:
            dataset: object whose 'training_input' column holds the prompts.
            return_prompt: include each prompt in its returned string.

        Returns:
            list of decoded responses, in dataset order.
        """
        self.model.eval()
        # Read the prompt column once instead of on every iteration.
        questions = dataset['training_input']
        return [
            self._generate_response(question, return_prompt=return_prompt)
            for question in tqdm(questions, desc="Generating responses", unit="question")
        ]


In [None]:
# Placeholder for model configuration entries.
# An empty dict is used: the previous literal `{""}` built a set holding one
# empty string, not a dict.
# TODO: add the configuration keys once they are decided.
model_config_dict = {}