In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorWithPadding, AutoModelForSequenceClassification

from datasets import DatasetDict, Features, Sequence, Value, load_dataset

import torch
from torch.utils.data import DataLoader
from torch.profiler import profile, record_function, ProfilerActivity
import gc

import os
import sys
sys.path.append(os.getcwd()+"/../..")
from src import paths

import tqdm

In [2]:
# Download model
# checkpoint = "meta-llama/Llama-2-7b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# # Save model and tokenizer
# model.save_pretrained(paths.MODEL_PATH/'llama2-chat')
# tokenizer.save_pretrained(paths.MODEL_PATH/'llama2-chat')


In [2]:
def check_gpu_memory():
    """Print a memory report for every visible CUDA device.

    For each device the name and the total, free, allocated and reserved
    memory are printed. When CUDA is not available, a single
    "No GPU available." line is printed instead. Returns None.
    """
    if not torch.cuda.is_available():
        print("No GPU available.")
        return

    gib = 1024 ** 3  # bytes per GiB; the report labels these values "GB"
    for device_index in range(torch.cuda.device_count()):
        free_bytes, total_bytes = torch.cuda.mem_get_info(device_index)
        device_props = torch.cuda.get_device_properties(device_index)
        print(f"GPU {device_index}: {device_props.name}")
        print(f"   Total Memory: {total_bytes / gib:.2f} GB")
        print(f"   Free Memory: {free_bytes / gib:.2f} GB")
        print(f"   Allocated Memory : {torch.cuda.memory_allocated(device_index) / gib:.2f} GB")
        print(f"   Reserved Memory : {torch.cuda.memory_reserved(device_index) / gib:.2f} GB")
# Report the current GPU memory state right away
check_gpu_memory()

GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 10.20 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB


In [3]:
# Report memory, then load the model in low precision and report memory again
print("Memory before Model is loaded:\n")
check_gpu_memory()

# 4-bit NF4 weights with double quantisation; computation runs in bfloat16.
# NOTE(review): the GPUs in the recorded outputs (RTX 2080 Ti, V100) are older
# architectures — confirm that bfloat16 is the intended compute dtype there.
quant_settings = dict(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quant_settings)

# Load the locally stored checkpoint with the quantisation config applied
model_dir = paths.MODEL_PATH/'leo-hessianai-7b'
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=False,
)
print("Memory after Model is loaded:\n")
check_gpu_memory()

Memory before Model is loaded:

GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 10.20 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Memory after Model is loaded:

GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 6.42 GB
   Allocated Memory : 3.77 GB
   Reserved Memory : 3.78 GB


In [61]:
# TODO: compile the model for faster inference, see https://pytorch.org/blog/pytorch-compile-to-speed-up-inference/
# model = torch.compile(model)

In [5]:
# Load the tokenizer with left padding, so that in a padded batch the prompt of
# every sequence ends at the last position before generation starts.
tokenizer = AutoTokenizer.from_pretrained(paths.MODEL_PATH/'leo-hessianai-7b', padding_side='left')
print("Vocabulary Size without Pad Token: ", len(tokenizer))

# Register a pad token whenever the tokenizer has none. Checking `pad_token`
# (instead of whether the string '<pad>' exists in the vocabulary) guarantees
# that `tokenizer.pad_token_id` is set afterwards, even if '<pad>' is already a
# vocabulary entry that was never declared as the pad token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token":"<pad>"})

# Resize the embeddings so that the (possibly new) pad token has a row
model.resize_token_embeddings(len(tokenizer))

# Configure the pad token in the model config AND in the generation config;
# the generation config is what `model.generate` reads its pad token from.
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

# Sanity checks: a pad token exists and all three places agree on its id
assert tokenizer.pad_token_id is not None, "The tokenizer has no pad token ID!"
assert model.config.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"
assert model.generation_config.pad_token_id == tokenizer.pad_token_id, "The generation config's pad token ID does not match the tokenizer's pad token ID!"

# Print the pad token ids ('Model pad token ID' is the id used during generation)
print('Tokenizer pad token ID:', tokenizer.pad_token_id)
print('Model pad token ID:', model.generation_config.pad_token_id)
print('Model config pad token ID:', model.config.pad_token_id)
print("Vocabulary Size with Pad Token: ", len(tokenizer))

Vocabulary Size without Pad Token:  32000
Tokenizer pad token ID: 32000
Model pad token ID: 32000
Model config pad token ID: 32000
Vocabulary Size with Pad Token:  32001


In [8]:
# Llama-2 chat layout with a system and a user slot. The BOS token "<s>" is
# deliberately not spelled out in the template: Llama tokenizers prepend BOS by
# default (`add_bos_token`), so a literal "<s>" would produce two BOS tokens.
base_prompt = "[INST]\n<<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt}[/INST]\n Translation: \n"
def get_classification_llama(text):
    """Wrap ``text`` in the chat prompt that asks for an English translation.

    Args:
        text: The raw input text; it is inserted unchanged (no truncation is
            applied here) into the user slot of ``base_prompt``.

    Returns:
        The fully formatted prompt string.
    """
    return base_prompt.format(system_prompt = "Translate the Text to English.",
                              user_prompt = text)

def preprocess(example):
    """Build the prompt for ``example["text"]`` and tokenize it.

    Returns the tokenizer output with PyTorch tensors (``return_tensors="pt"``).
    """
    prompt = get_classification_llama(example["text"])
    return tokenizer(prompt, return_tensors ="pt")

In [9]:
# Load the train / validation / test CSV splits of the preprocessed ms-diag data
data_files = {
    "train": "ms-diag_clean_train.csv",
    "validation": "ms-diag_clean_val.csv",
    "test": "ms-diag_clean_test.csv",
}
ms_diag_dir = os.path.join(paths.DATA_PATH_PREPROCESSED,'ms-diag')
df = load_dataset(ms_diag_dir, data_files = data_files)
#df = df.map(preprocess, remove_columns=["rid", "date", "text"])

In [10]:
# Wrap every training text in the prompt and tokenize it (one unpadded encoding per text)
prompts = map(get_classification_llama, df["train"]["text"])
tokens = [tokenizer(prompt) for prompt in prompts]

# Collate function that pads the encodings of a batch to a common length
collate_fn = DataCollatorWithPadding(tokenizer, padding=True) #padding=True, 'max_length'

# Batches of two prompts, served in dataset order
dataloader = DataLoader(tokens, batch_size=2, shuffle=False, collate_fn=collate_fn)

Reserved memory appears to be extremely high when using beam search. With longer input sequences this will lead to out-of-memory errors. I will try limiting the number of tokens to a lower value and check whether beam search works then. I truncate the raw text directly (before inserting it into the prompt), because truncating after the prompt insertion would cut off the end of the prompt.

In [None]:
# Settings for this generation run
SEED = 42        # seeds the sampling RNG so that repeated runs are comparable
MAX_BATCHES = 1  # smoke-test limit (only the first batch); set to None to process every batch

torch.manual_seed(SEED)

outputs = []
print("Memory Consumption before loop\n")
check_gpu_memory()
for idx, batch in enumerate(dataloader):

    # Collect garbage first, then release the now-unused cached blocks;
    # in the opposite order the memory freed by the collector stays reserved.
    gc.collect()
    torch.cuda.empty_cache()

    print("Memory Consumption before Batch: ", idx)
    check_gpu_memory()

    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    with torch.inference_mode():
        generated_ids = model.generate(input_ids = input_ids, attention_mask = attention_mask, max_new_tokens=20, num_beams=1, do_sample=True, temperature = 0.9, num_return_sequences = 1, top_p = 0.6).to("cpu")
    # One list of decoded texts per batch (the decoded text includes the prompt)
    outputs.append(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

    # Explicit, configurable stop instead of an unconditional debug `break`
    if MAX_BATCHES is not None and idx + 1 >= MAX_BATCHES:
        break

# Show the collected generations as the cell result
outputs

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Memory Consumption before loop

GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 5.92 GB
   Allocated Memory : 4.26 GB
   Reserved Memory : 4.27 GB
Memory Consumption before Batch:  0
GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 5.92 GB
   Allocated Memory : 3.77 GB
   Reserved Memory : 4.27 GB


In [76]:
# Compare the GPU memory report before and after releasing unused memory
check_gpu_memory()
# Run the garbage collector first so that tensors kept alive only by unreachable
# Python objects are freed, then return the unused cached blocks to the driver.
gc.collect()
torch.cuda.empty_cache()
check_gpu_memory()

GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 22.73 GB
   Allocated Memory : 3.70 GB
   Reserved Memory : 8.04 GB
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 26.85 GB
   Allocated Memory : 3.70 GB
   Reserved Memory : 3.91 GB


In [77]:
import pandas as pd  # `pd` is used below but is not imported by the import cell at the top
from itertools import chain

# Flatten the per-batch lists of decoded texts into one flat list. Entries that
# are already plain strings are kept whole, so running this cell a second time
# does not break the texts up into single characters.
outputs = list(chain.from_iterable([item] if isinstance(item, str) else item for item in outputs))
# NOTE(review): the file name mentions llama2-chat / shortened300 / beam2 — confirm
# it still describes the model and generation settings used above.
pd.Series(outputs).to_csv(paths.RESULTS_PATH/'ms_diag-llama2-chat_zero_shot-shortened300_beam2.csv')

In [79]:
# Keep only the text that follows the answer phrase in each output. An output
# that does not contain the phrase yields "" (which holds no diagnosis keyword)
# instead of raising IndexError for the whole list.
ANSWER_MARKER = "\nBased on the information provided in the text, the most likely diagnosis for the patient is:"
results = [parts[1] if len(parts) > 1 else "" for parts in (out.split(ANSWER_MARKER) for out in outputs)]

In [86]:
# Distinct labels occurring in the training split
{label for label in df["train"]["labels"]}

{'primary_progressive_multiple_sclerosis',
 'relapsing_remitting_multiple_sclerosis',
 'secondary_progressive_multiple_sclerosis'}

In [87]:
# Abbreviation expected in the model answer -> label used in the dataset
keyword_label_mapping = {
    "RRMS": 'relapsing_remitting_multiple_sclerosis',
    "SPMS": 'secondary_progressive_multiple_sclerosis',
    "PPMS": 'primary_progressive_multiple_sclerosis',
}

def assign_label(text):
    """Return the label of the first mapping keyword contained in ``text``.

    Keywords are tried in the order of ``keyword_label_mapping`` using a
    case-sensitive substring test. "unknown" is returned when none occurs.
    """
    matching_labels = (
        label
        for keyword, label in keyword_label_mapping.items()
        if keyword in text
    )
    return next(matching_labels, "unknown")

# Map every extracted answer text to its label
labels = list(map(assign_label, results))

In [99]:
# Accuracy: share of predicted labels equal to the gold label at the same position.
# The gold column is looked up once instead of on every loop iteration.
gold_labels = df["train"]["labels"]
assert len(labels) <= len(gold_labels), "More predictions than gold labels!"
correct = sum(predicted == gold for predicted, gold in zip(labels, gold_labels))
# Without any prediction the accuracy is undefined: show NaN instead of raising ZeroDivisionError
correct/len(labels) if labels else float("nan")

0.6016260162601627