In [54]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorWithPadding

from datasets import DatasetDict, Features, Sequence, Value, load_dataset

import torch
from torch.utils.data import DataLoader
from torch.profiler import profile, record_function, ProfilerActivity
import gc

import os
import sys
sys.path.append(os.getcwd()+"/../..")
from src import paths

import tqdm

In [2]:
# Download model
# checkpoint = "meta-llama/Llama-2-7b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(model)

# # Save model and tokenizer
# model.save_pretrained(paths.MODEL_PATH/'llama2-chat')
# tokenizer.save_pretrained(paths.MODEL_PATH/'llama2-chat')


In [2]:
# Low precision config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(paths.MODEL_PATH/'llama2-chat', device_map="auto", quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [38]:
model = torch.compile(model)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(paths.MODEL_PATH/'llama2', padding_side='left')
print("Vocabulary Size: ", len(tokenizer))

Vocabulary Size:  32000


In [7]:
# Check if the pad token is already in the tokenizer vocabulary
if '<pad>' not in tokenizer.get_vocab():
    # Add the pad token
    tokenizer.add_special_tokens({"pad_token":"<pad>"})

#Resize the embeddings
model.resize_token_embeddings(len(tokenizer))

#Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id

# Check if they are equal
assert model.config.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"

# Print the pad token ids
print('Tokenizer pad token ID:', tokenizer.pad_token_id)
print('Model pad token ID:', model.config.pad_token_id)
print('Model config pad token ID:', model.config.pad_token_id)

Pad was not in Vocab
Tokenizer pad token ID: 32000
Model pad token ID: 32000
Model config pad token ID: 32000


In [39]:
# Check device allocation
for name, param in model.named_parameters():
    print(f"Device of {name}: ", param.device)

Device of _orig_mod.model.embed_tokens.weight:  cuda:0
Device of _orig_mod.model.layers.0.self_attn.q_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.self_attn.k_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.self_attn.v_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.self_attn.o_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.mlp.gate_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.mlp.up_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.mlp.down_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.input_layernorm.weight:  cuda:0
Device of _orig_mod.model.layers.0.post_attention_layernorm.weight:  cuda:0
Device of _orig_mod.model.layers.1.self_attn.q_proj.weight:  cuda:0
Device of _orig_mod.model.layers.1.self_attn.k_proj.weight:  cuda:0
Device of _orig_mod.model.layers.1.self_attn.v_proj.weight:  cuda:0
Device of _orig_mod.model.layers.1.self_attn.o_proj.weight:  cuda:0
Device of _orig_mod.model.layers.1.mlp.gate_proj.weight:  cuda:0


In [12]:
base_prompt = "<s>[INST]\n<<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt}[/INST]\nBased on the information provided in the text, the most likely diagnosis for the patient is: "
def get_classification_llama(text):
    input = base_prompt.format(system_prompt = "Is the MS diagnosis in the text of type \"Sekundär progrediente Multiple Sklerose (SPMS)\", \"primäre progrediente Multiple Sklerose (PPMS)\" or \"schubförmig remittierende Multiple Sklerose (RRMS)\"?",
                               user_prompt = text)
    return input

def preprocess(example):
    example = tokenizer(get_classification_llama(example["text"]), return_tensors ="pt")
    return example

In [42]:
for batch in dataloader:
    torch.cuda.empty_cache()
    gc.collect()
    #model_inputs = {k: v.to(torch.int64).to("cuda") for k, v in model_inputs.items()}
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    with torch.inference_mode():
        generated_ids = model.generate(input_ids = input_ids, attention_mask = attention_mask, max_new_tokens=20, num_beams=4, do_sample=True, temperature = 1, num_return_sequences = 1).to("cpu")
    print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

['[INST]\n<<SYS>>\nIs the MS diagnosis in the text of type "Sekundär progrediente Multiple Sklerose (SPMS)", "primäre progrediente Multiple Sklerose (PPMS)" or "schubförmig remittierende Multiple Sklerose (RRMS)"?\n<</SYS>>\n\ntext[/INST]\nBased on the information provided in the text, the most likely diagnosis for the patient is: \n\n"Sekundär progrediente Multiple Sklerose (SPMS)"\n', '[INST]\n<<SYS>>\nIs the MS diagnosis in the text of type "Sekundär progrediente Multiple Sklerose (SPMS)", "primäre progrediente Multiple Sklerose (PPMS)" or "schubförmig remittierende Multiple Sklerose (RRMS)"?\n<</SYS>>\n\ntexttext[/INST]\nBased on the information provided in the text, the most likely diagnosis for the patient is: \n\n"Sekundär progrediente Multiple Sklerose (SPMS)"\n']
['[INST]\n<<SYS>>\nIs the MS diagnosis in the text of type "Sekundär progrediente Multiple Sklerose (SPMS)", "primäre progrediente Multiple Sklerose (PPMS)" or "schubförmig remittierende Multiple Sklerose (RRMS)"?\n<<

In [47]:
# Load data
data_files = {"train": "ms-diag_clean_train.csv", "validation": "ms-diag_clean_val.csv", "test": "ms-diag_clean_test.csv"}
df = load_dataset(os.path.join(paths.DATA_PATH_PREPROCESSED,'ms-diag'), data_files = data_files)
#df = df.map(preprocess, remove_columns=["rid", "date", "text"])

In [48]:
tokens = [tokenizer(get_classification_llama(t)) for t in df["train"]["text"]]

# Default collate function 
collate_fn = DataCollatorWithPadding(tokenizer, padding=True) #padding=True, 'max_length'

dataloader = torch.utils.data.DataLoader(dataset=tokens, collate_fn=collate_fn, batch_size=2, shuffle = False) 

for batch in dataloader:
    print(batch["input_ids"].shape)
    break

torch.Size([2, 873])


In [59]:
outputs = []
for batch in dataloader:
    torch.cuda.empty_cache()
    gc.collect()
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    with torch.inference_mode():
        generated_ids = model.generate(input_ids = input_ids, attention_mask = attention_mask, max_new_tokens=10, num_beams=1, do_sample=True, temperature = 1, num_return_sequences = 1).to("cpu")
    outputs.append(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 1 has a total capacty of 10.75 GiB of which 71.62 MiB is free. Including non-PyTorch memory, this process has 10.68 GiB memory in use. Of the allocated memory 9.20 GiB is allocated by PyTorch, and 671.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [58]:
len(outputs)

2

In [None]:
set(df["train"]["labels"])

In [None]:
for idx, entry in enumerate(df["train"]):
    if entry["labels"] == list(set(df["train"]["labels"]))[1]:
        print(entry)
        print(idx)

In [None]:
print("Length of class labels: ")
[len(label) for label in tokenizer(list(set(df["train"]["labels"])))["input_ids"]]

In [51]:
def check_gpu_memory():
    if torch.cuda.is_available():
        num_gpus = torch.cuda.device_count()
        for gpu_id in range(num_gpus):
            gpu_properties = torch.cuda.get_device_properties(gpu_id)
            print(f"GPU {gpu_id}: {gpu_properties.name}")
            print(f"   Total Memory: {gpu_properties.total_memory / (1024 ** 3):.2f} GB")
            print(f"   Free Memory : {torch.cuda.memory_allocated(gpu_id) / (1024 ** 3):.2f} GB")
            print(f"   Used Memory : {torch.cuda.memory_reserved(gpu_id) / (1024 ** 3):.2f} GB")
    else:
        print("No GPU available.")

# Call the function to check GPU memory
check_gpu_memory()

GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory : 5.71 GB
   Used Memory : 7.33 GB
GPU 1: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory : 8.32 GB
   Used Memory : 9.91 GB
