In [3]:
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from peft import AutoPeftModelForCausalLM, AutoPeftModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import os
from datetime import datetime
import re
import numpy as np
# import dataloader from torch
from torch.utils.data import DataLoader
import evaluate

In [4]:


# find the model with the latest date in this folder

def extract_datetime(x):
    '''
    Extract the datetime encoded in the final component of a path

    Args:
    x (str): path (or bare name) whose last component is formatted as
        '%d-%m-%Y--%H-%M', e.g. '/ckpts/LORA/01-12-2023--13-10'.
        Trailing path separators are ignored.

    Returns:
    datetime: datetime object parsed from the last path component

    Raises:
    ValueError: if the last path component does not match '%d-%m-%Y--%H-%M'

    '''

    # normpath strips trailing separators, so '.../01-12-2023--13-10/' still
    # resolves to the timestamp folder name instead of an empty string
    dt = os.path.basename(os.path.normpath(x))
    return datetime.strptime(dt, '%d-%m-%Y--%H-%M')

def get_latest_model(model_dir):

    '''
    Get the latest model checkpoint in a directory

    The expected layout is model_dir/<run>/<checkpoint>/ where <run> is named
    with a '%d-%m-%Y--%H-%M' timestamp and <checkpoint> ends in '-<integer>'
    (e.g. 'checkpoint-5310'). The most recent run is selected first, then the
    checkpoint with the highest trailing number inside that run.

    Args:
    model_dir (str): directory to search for the latest model

    Returns:
    str: path to the latest model

    Raises:
    FileNotFoundError: if model_dir contains no timestamp-named run directory,
        or the latest run contains no directory ending in '-<integer>'

    '''

    def _list_subdirs(parent):
        # full paths of the immediate sub-directories of `parent`
        candidates = (os.path.join(parent, entry) for entry in os.listdir(parent))
        return [path for path in candidates if os.path.isdir(path)]

    # keep only the run directories whose name parses as a timestamp;
    # unrelated folders are skipped instead of aborting the search
    dated_runs = []
    for run_dir in _list_subdirs(model_dir):
        try:
            dated_runs.append((extract_datetime(run_dir), run_dir))
        except ValueError:
            continue

    if not dated_runs:
        raise FileNotFoundError(
            f"No run directory named like '%d-%m-%Y--%H-%M' found in {model_dir}"
        )

    # latest run is the one with the greatest timestamp
    latest_subdir = max(dated_runs, key=lambda pair: pair[0])[1]

    # inside the latest run, keep the directories whose *name* ends in
    # '-<integer>' and remember that integer for ranking
    step_pattern = re.compile(r'-(\d+)$')
    checkpoints = []
    for ckpt_dir in _list_subdirs(latest_subdir):
        match = step_pattern.search(os.path.basename(ckpt_dir))
        if match:
            checkpoints.append((int(match.group(1)), ckpt_dir))

    if not checkpoints:
        raise FileNotFoundError(
            f"No checkpoint directory ending in '-<number>' found in {latest_subdir}"
        )

    # latest model is the checkpoint with the highest trailing number
    latest_model = max(checkpoints, key=lambda pair: pair[0])[1]
    return latest_model


In [5]:
# set directory for a task and model

task = "mimic-mp"
peft_type = "LORA" # | Full
model_name = "bio-mobilebert"

# Checkpoint root for this task / model / PEFT type. The task segment is taken
# from `task` so the checkpoint path always matches the task selected above.
model_dir = f"/mnt/sdh/effecient_ml/ckpts/{task}/full/{model_name}/{peft_type}/"

In [6]:
# Resolve the most recent checkpoint directory found under `model_dir`.
full_model_dir = get_latest_model(model_dir)

In [47]:
full_model_dir

'/mnt/sdh/effecient_ml/ckpts/mimic-mp/full/bio-mobilebert/LORA/01-12-2023--13-10/checkpoint-5310'

In [7]:
# Reload the sequence-classification PEFT model directly from the checkpoint directory.
reloaded_model = AutoPeftModelForSequenceClassification.from_pretrained(full_model_dir)

  return self.fget.__get__(instance, owner)()
Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at nlpie/bio-mobilebert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from huggingface_hub import notebook_login, create_repo

In [14]:
# Authenticate with the Hugging Face Hub; this renders an interactive login widget.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# reloaded_model

In [15]:
# lets test pushing to hub
# `token=True` reuses the credential stored by the Hub login; it replaces the
# deprecated `use_auth_token` argument.
reloaded_model.push_to_hub("NTaylor/bio-mobilebert-mimic-mp-lora", token=True)



adapter_model.safetensors:   0%|          | 0.00/910k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NTaylor/bio-mobilebert-mimic-mp-lora/commit/0d5e0fcb1b33fdfb94f005a5affbcfab47f49a96', commit_message='Upload model', commit_description='', oid='0d5e0fcb1b33fdfb94f005a5affbcfab47f49a96', pr_url=None, pr_revision=None, pr_num=None)

**warning** 

`AutoPeftModelForSequenceClassification` does appear to load the model correctly, judging by the evaluation performance.
The warning above is emitted while the base model is being loaded; presumably the library does not account for the classifier head being stored alongside the adapter weights.

In [75]:
reloaded_model.classifier.modules_to_save.default.weight

Parameter containing:
tensor([[ 0.0136,  0.0475,  0.0532,  ...,  0.0615,  0.0177,  0.0178],
        [ 0.0028,  0.0364, -0.0520,  ..., -0.0609, -0.0284, -0.0447]],
       requires_grad=True)

In [89]:
# Load the tokenizer from the checkpoint directory.
# NOTE(review): `tokenizer` is reassigned further down from the base model name — confirm which one is intended.
tokenizer = AutoTokenizer.from_pretrained(full_model_dir)

It is not clear that the AutoPeft model loader works well with all models.

## Try more manual reload

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftConfig, PeftModel

In [6]:
# Load the PEFT adapter configuration saved in the checkpoint directory.
config = PeftConfig.from_pretrained(full_model_dir)
# Instantiate the base model named in that configuration with a 2-label classification head.
# NOTE(review): num_labels is hard-coded to 2 here — confirm it matches the task's label count.
model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels = 2)

  return self.fget.__get__(instance, owner)()
Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at nlpie/bio-mobilebert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Load the tokenizer of the base model referenced by the adapter configuration.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

In [52]:
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='nlpie/bio-mobilebert', revision=None, task_type='SEQ_CLS', inference_mode=True, r=8, target_modules={'value', 'query', 'key'}, lora_alpha=8, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [8]:
# Wrap the base model with the adapter stored in the checkpoint directory.
reloaded_model = PeftModel.from_pretrained(model, full_model_dir)

In [56]:
reloaded_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): MobileBertForSequenceClassification(
      (mobilebert): MobileBertModel(
        (embeddings): MobileBertEmbeddings(
          (word_embeddings): Embedding(30522, 128, padding_idx=0)
          (position_embeddings): Embedding(512, 512)
          (token_type_embeddings): Embedding(2, 512)
          (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
          (LayerNorm): NoNorm()
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): MobileBertEncoder(
          (layer): ModuleList(
            (0-23): 24 x MobileBertLayer(
              (attention): MobileBertAttention(
                (self): MobileBertSelfAttention(
                  (query): Linear(
                    in_features=128, out_features=128, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
         

In [9]:
# now merge and unload
# reloaded_model.merge_and_unload()

# Test evaluation performance

In [10]:
import yaml

# Read the per-task dataset configuration. `safe_load` is sufficient for a
# plain-data config file and never constructs arbitrary Python objects.
# An empty file yields an empty mapping rather than None.
# NOTE: the name `datasets` is reassigned by the dataset-loading cell further
# down, so this cell has to run before that one.
with open('../datasets.yaml', 'r') as f:
    datasets = yaml.safe_load(f) or {}

# Fail loudly on an unknown task: raising stops the notebook at this cell with
# a clear message, instead of printing and exiting with a success status.
try:
    dataset_info = datasets[task]
except KeyError:
    raise KeyError(
        f"Task name {task} not in datasets.yaml. Available tasks are: {list(datasets.keys())}"
    ) from None


In [79]:
dataset_info

{'training_data_dir': '/mnt/sdd/efficient_ml_data/datasets/mimic3-clinical-outcomes/mp',
 'eval_data_dir': '/mnt/sdd/efficient_ml_data/datasets/mimic3-clinical-outcomes/mp',
 'data_dir': '',
 'training_file': 'train.csv',
 'validation_file': 'valid.csv',
 'test_file': 'test.csv',
 'task_type': 'SEQ_CLS',
 'label_name': 'hospital_expire_flag',
 'text_column': 'text',
 'remove_columns': ['text']}

In [11]:
from datasets import load_dataset

# Load the three CSV splits described by `dataset_info`.
# The "test" split reads `test_file`, not `validation_file`, so that it is a
# distinct held-out split rather than a second copy of the validation data.
datasets = load_dataset(
    "csv",
    data_files={
        "train": f"{dataset_info['training_data_dir']}/{dataset_info['training_file']}",
        "validation": f"{dataset_info['eval_data_dir']}/{dataset_info['validation_file']}",
        "test": f"{dataset_info['eval_data_dir']}/{dataset_info['test_file']}",
    },
    cache_dir=None,
)

In [12]:
# Map each task name to the pair of text-column names used when tokenizing it.
# The second entry is None for tasks with a single text input.
# The "mimic-mp" entry takes its column name from dataset_info["text_column"].
task_to_keys ={
                "cola": ("sentence", None),
                "mnli": ("premise", "hypothesis"),
                "mnli-mm": ("premise", "hypothesis"),
                "mrpc": ("sentence1", "sentence2"),
                "qnli": ("question", "sentence"),
                "qqp": ("question1", "question2"),
                "rte": ("sentence1", "sentence2"),
                "sst2": ("sentence", None),
                "stsb": ("sentence1", "sentence2"),
                "wnli": ("sentence1", "sentence2"),
                "mimic-note-category": ("TEXT", None),
                "icd9-triage":("text", None),
                "icd9-triage-no-category-in-text":("text", None),
                "ICD9-Triage":("text", None),
                "mednli":("sentence1", "sentence2"),
                "mimic-mp":(dataset_info["text_column"], None),
                }

In [13]:
# get number of labels
# Counted as the number of distinct values in the training split's label column.
num_labels = len(np.unique(datasets["train"][dataset_info["label_name"]]))


# Text-column name(s) for the selected task; the second is None for single-text tasks.
sentence1_key, sentence2_key = task_to_keys[task]

In [14]:
model_name_or_path = full_model_dir
batch_size = 16

# GPT / OPT / BLOOM style models are padded on the left, everything else on the
# right. The check runs on the model name rather than the full checkpoint path,
# so an unrelated path segment (e.g. an "/opt/..." mount) cannot trigger it.
if any(k in model_name.lower() for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

# Apply the choice; without this assignment the value computed above would
# have no effect on how batches are padded.
tokenizer.padding_side = padding_side

# Tokenizers without a pad token fall back to the end-of-sequence token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id



# own
def tokenize_function(examples):
    '''
    Tokenize a batch of examples for the selected task

    Args:
    examples: batch indexed by column name; the columns named by
        sentence1_key (and sentence2_key, when it is not None) are tokenized

    Returns:
    the tokenizer output, truncated to at most 480 tokens

    '''
    # max_length matters for prompt / prefix / p-tuning: the virtual tokens they
    # add can overshoot the model's maximum length, so 480 leaves some headroom
    text_columns = (examples[sentence1_key],)
    if sentence2_key is not None:
        text_columns = text_columns + (examples[sentence2_key],)
    return tokenizer(*text_columns, truncation=True, max_length=480)

# own
# Tokenize every split in batches and drop the columns listed under
# dataset_info["remove_columns"].
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset_info["remove_columns"]
)


def collate_fn(examples):
    '''
    Collate tokenized examples into one padded batch

    Args:
    examples: tokenized examples to combine

    Returns:
    the batch padded to its longest example, as PyTorch tensors

    '''
    padded_batch = tokenizer.pad(examples, return_tensors="pt", padding="longest")
    return padded_batch

# Rename the task's label column to "labels" unless a column of that name already exists.
if "labels" not in tokenized_datasets["train"].features:
    tokenized_datasets = tokenized_datasets.rename_column(dataset_info["label_name"], "labels")

# Instantiate dataloaders: training batches are shuffled, validation batches keep dataset order.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=True,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

Map:   0%|          | 0/33954 [00:00<?, ? examples/s]

Map:   0%|          | 0/4908 [00:00<?, ? examples/s]

Map:   0%|          | 0/4908 [00:00<?, ? examples/s]

In [15]:
reloaded_model.device

device(type='cpu')

In [137]:
# check eval dataloader
# Sanity check: print only the first collated batch, then stop iterating.
for batch in eval_dataloader:
    print(batch)
    break

{'id': tensor([176763, 173211, 116333, 161102, 116799, 162982, 114396, 143396, 108327,
        143022, 130766, 140160, 140485, 166483, 184549, 104154]), 'labels': tensor([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]), 'input_ids': tensor([[  101,  2708, 12087,  ...,  2575,  2232,   102],
        [  101,  2708, 12087,  ...,  1008,  2902,   102],
        [  101,  2708, 12087,  ...,  1011, 16021,   102],
        ...,
        [  101,  2708, 12087,  ...,  1007,  1012,   102],
        [  101,  2708, 12087,  ...,     0,     0,     0],
        [  101,  2708, 12087,  ...,  1999,  1031,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],

In [16]:
from tqdm import tqdm

In [17]:
from sklearn.metrics import roc_curve, auc, f1_score, precision_score, recall_score, classification_report, accuracy_score

In [18]:
def compute_metrics(predictions, pred_scores, labels):
    '''
    Compute classification metrics with the `evaluate` library

    Args:
    predictions: predicted class labels, one per example
    pred_scores: per-class scores; in the two-class case only column 1
        (pred_scores[:, 1]) is used
    labels: reference class labels, one per example

    Returns:
    dict: "precision", "recall", "accuracy", "f1_macro", "f1_weighted"
        and "roc_auc_macro"

    '''

    # metric objects get their own names so they do not shadow the
    # identically named sklearn functions imported at notebook level
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    print(f"Labels are: {labels}\n")
    print(f"Preds are: {predictions}")

    # every label-based metric receives the same predictions / references pair
    hard_label_args = {"predictions": predictions, "references": labels}
    results = {
        "precision": precision_metric.compute(average="macro", **hard_label_args)["precision"],
        "recall": recall_metric.compute(average="macro", **hard_label_args)["recall"],
        "accuracy": accuracy_metric.compute(**hard_label_args)["accuracy"],
        "f1_macro": f1_metric.compute(average="macro", **hard_label_args)["f1"],
        "f1_weighted": f1_metric.compute(average="weighted", **hard_label_args)["f1"],
    }

    # ROC AUC works on scores rather than predicted labels, and its
    # configuration depends on how many distinct labels are present
    if len(np.unique(labels)) != 2:
        roc_metric = evaluate.load("roc_auc", "multiclass")
        roc_kwargs = {
            "prediction_scores": pred_scores,
            "multi_class": 'ovr',
            "average": "macro",
        }
    else:
        roc_metric = evaluate.load("roc_auc", "binary")
        # just take the scores of the positive class
        roc_kwargs = {"prediction_scores": pred_scores[:, 1]}

    results["roc_auc_macro"] = roc_metric.compute(references=labels, **roc_kwargs)['roc_auc']
    return results

In [20]:
# Run on the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move and switch to eval mode the model that is actually called in the loop
# below (`reloaded_model`), rather than the separate `model` variable.
reloaded_model.to(device)
reloaded_model.eval()

# Only these batch entries are forwarded to the model; any other entries in
# the batch (e.g. "labels") are not passed to the forward call.
model_input_keys = ("input_ids", "attention_mask", "token_type_ids")

all_preds = []
all_preds_raw = []
all_labels = []
for batch in tqdm(eval_dataloader):
    # Forward an input only when the batch contains it, so tokenizers that do
    # not emit token_type_ids still work.
    model_inputs = {k: batch[k].to(device) for k in model_input_keys if k in batch}

    with torch.no_grad():
        outputs = reloaded_model(**model_inputs)

    # class probabilities: softmax over the logits, brought back to the CPU as NumPy
    preds_raw = outputs.logits.softmax(dim=-1).cpu().numpy()

    # predicted class = index of the highest probability
    preds = preds_raw.argmax(axis=-1)

    all_preds_raw.append(preds_raw)
    all_preds.append(preds)
    all_labels.append(batch["labels"].cpu().numpy())

# Join the per-batch arrays along the example axis.
all_preds_raw = np.concatenate(all_preds_raw)
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)

print(f"all_preds_raw shape is: {all_preds_raw.shape}")
print(f"all_preds shape is: {all_preds.shape} \n\n {all_preds}")
print(f"all_labels shape is: {all_labels.shape} \n\n {all_labels}")


100%|██████████| 307/307 [00:26<00:00, 11.53it/s]


all_preds_raw shape is: (4908, 2)
all_preds shape is: (4908,) 

 [0 0 0 ... 0 0 0]
all_labels shape is: (4908,) 

 [0 0 1 ... 0 0 0]


In [22]:
# use compute metrics with args: predictions, pred_scores, labels
# i.e. the predicted labels, the per-class probabilities and the reference labels collected above
metrics = compute_metrics(all_preds, all_preds_raw, all_labels)

Labels are: [0 0 1 ... 0 0 0]

Preds are: [0 0 0 ... 0 0 0]


In [149]:
metrics

{'precision': 0.7036768485908739,
 'recall': 0.5350194809192531,
 'accuracy': 0.8946617766911166,
 'f1_macro': 0.5404014104010101,
 'f1_weighted': 0.8587342343219413,
 'roc_auc_macro': 0.7897045760371499}