In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorWithPadding

from datasets import DatasetDict, Features, Sequence, Value, load_dataset

import torch
from torch.utils.data import DataLoader
from torch.profiler import profile, record_function, ProfilerActivity
import gc

import os
import sys
sys.path.append(os.getcwd()+"/../..")
from src import paths

import tqdm

In [2]:
# Download model
# checkpoint = "meta-llama/Llama-2-7b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(model)

# # Save model and tokenizer
# model.save_pretrained(paths.MODEL_PATH/'llama2-chat')
# tokenizer.save_pretrained(paths.MODEL_PATH/'llama2-chat')


In [2]:
def check_gpu_memory():
    if torch.cuda.is_available():
        num_gpus = torch.cuda.device_count()
        for gpu_id in range(num_gpus):
            free_mem, total_mem = torch.cuda.mem_get_info(gpu_id)
            gpu_properties = torch.cuda.get_device_properties(gpu_id)
            print(f"GPU {gpu_id}: {gpu_properties.name}")
            print(f"   Total Memory: {total_mem / (1024 ** 3):.2f} GB")
            print(f"   Free Memory: {free_mem / (1024 ** 3):.2f} GB")
            print(f"   Allocated Memory : {torch.cuda.memory_allocated(gpu_id) / (1024 ** 3):.2f} GB")
            print(f"   Reserved Memory : {torch.cuda.memory_reserved(gpu_id) / (1024 ** 3):.2f} GB")
    else:
        print("No GPU available.")
# Call the function to check GPU memory
check_gpu_memory()

GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 30.93 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB


In [3]:
# Low precision config
print("Memory before Model is loaded:\n")
check_gpu_memory()
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(paths.MODEL_PATH/'llama2-chat', device_map="auto", quantization_config=bnb_config)
print("Memory after Model is loaded:\n")
check_gpu_memory()

Memory before Model is loaded:

GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 30.93 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Memory after Model is loaded:

GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 27.05 GB
   Allocated Memory : 3.69 GB
   Reserved Memory : 3.88 GB


In [61]:
# Compile Model for faster inference. # To-Do https://pytorch.org/blog/pytorch-compile-to-speed-up-inference/
model = torch.compile(model)

In [62]:
tokenizer = AutoTokenizer.from_pretrained(paths.MODEL_PATH/'llama2', padding_side='left')
print("Vocabulary Size without Pad Token: ", len(tokenizer))

# Check if the pad token is already in the tokenizer vocabulary
if '<pad>' not in tokenizer.get_vocab():
    # Add the pad token
    tokenizer.add_special_tokens({"pad_token":"<pad>"})

#Resize the embeddings
model.resize_token_embeddings(len(tokenizer))

#Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id

# Check if they are equal
assert model.config.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"

# Print the pad token ids
print('Tokenizer pad token ID:', tokenizer.pad_token_id)
print('Model pad token ID:', model.config.pad_token_id)
print('Model config pad token ID:', model.config.pad_token_id)
print("Vocabulary Size with Pad Token: ", len(tokenizer))

Vocabulary Size without Pad Token:  32000
Tokenizer pad token ID: 32000
Model pad token ID: 32000
Model config pad token ID: 32000
Vocabulary Size with Pad Token:  32001


In [63]:
# Check device allocation
for name, param in model.named_parameters():
    print(f"Device of {name}: ", param.device)

Device of _orig_mod.model.embed_tokens.weight:  cuda:0
Device of _orig_mod.model.layers.0.self_attn.q_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.self_attn.k_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.self_attn.v_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.self_attn.o_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.mlp.gate_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.mlp.up_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.mlp.down_proj.weight:  cuda:0
Device of _orig_mod.model.layers.0.input_layernorm.weight:  cuda:0
Device of _orig_mod.model.layers.0.post_attention_layernorm.weight:  cuda:0
Device of _orig_mod.model.layers.1.self_attn.q_proj.weight:  cuda:0
Device of _orig_mod.model.layers.1.self_attn.k_proj.weight:  cuda:0
Device of _orig_mod.model.layers.1.self_attn.v_proj.weight:  cuda:0
Device of _orig_mod.model.layers.1.self_attn.o_proj.weight:  cuda:0
Device of _orig_mod.model.layers.1.mlp.gate_proj.weight:  cuda:0


In [69]:
base_prompt = "<s>[INST]\n<<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt}[/INST]\nBased on the information provided in the text, the most likely diagnosis for the patient is: "
def get_classification_llama(text):
    # Shorten Text so that beam-search can be performed
    text = text[:500]
    input = base_prompt.format(system_prompt = "Is the MS diagnosis in the text of type \"Sekundär progrediente Multiple Sklerose (SPMS)\", \"primäre progrediente Multiple Sklerose (PPMS)\" or \"schubförmig remittierende Multiple Sklerose (RRMS)\"?",
                               user_prompt = text)
    return input

def preprocess(example):
    example = tokenizer(get_classification_llama(example["text"]), return_tensors ="pt")
    return example

In [70]:
# Load data
data_files = {"train": "ms-diag_clean_train.csv", "validation": "ms-diag_clean_val.csv", "test": "ms-diag_clean_test.csv"}
df = load_dataset(os.path.join(paths.DATA_PATH_PREPROCESSED,'ms-diag'), data_files = data_files)
#df = df.map(preprocess, remove_columns=["rid", "date", "text"])

In [71]:
max_len = 0
index = 0
for i, text in enumerate(df["train"]["text"]):
    if max_len < len(text):
        max_len = len(text)
        index = i
print(max_len)
print(index)

5504
48


In [72]:
get_classification_llama(df["train"]["text"][48])

'<s>[INST]\n<<SYS>>\nIs the MS diagnosis in the text of type "Sekundär progrediente Multiple Sklerose (SPMS)", "primäre progrediente Multiple Sklerose (PPMS)" or "schubförmig remittierende Multiple Sklerose (RRMS)"?\n<</SYS>>\n\n1. Primär progrediente Multiple Sklerose, EM 1992, ED 1996, aktuell EDSS 7.0 INDENT Verlauf: Erstmanifestation mit Sensibilitätsstörungen am rechten Bein. Seitdem progrediente Beinschwäche sowie Extremitätenataxie, im gesamten Krankheitsverlauf kein Hinweis auf schubverdächtige Episoden INDENT Klinisch aktuell: Sakkadierte Blickfolge, bitemporal eingeschränkte Gesichtsfelder, Dysarthrie, links-/bein- und proximal betonte spastische Tetraparese, Babinski bds. positiv. Rechts- und beinbetonter Hol[/INST]\nBased on the information provided in the text, the most likely diagnosis for the patient is: '

In [73]:
tokens = [tokenizer(get_classification_llama(t)) for t in df["train"]["text"]]

# Default collate function 
collate_fn = DataCollatorWithPadding(tokenizer, padding=True) #padding=True, 'max_length'

dataloader = torch.utils.data.DataLoader(dataset=tokens, collate_fn=collate_fn, batch_size=8, shuffle = False) 

for batch in dataloader:
    print(batch["input_ids"].shape)
    break

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([8, 352])


In [19]:
test = next(iter(dataloader))
test = {k:v.to("cuda") for k,v in test.items()}
for i in range(6):
    check_gpu_memory()
    generated_ids = model.generate(**test, max_new_tokens=20, num_beams=2, do_sample=True, num_return_sequences=1, temperature = 0.9, top_p = 0.6)

GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 22.10 GB
   Allocated Memory : 6.31 GB
   Reserved Memory : 8.66 GB
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 13.00 GB
   Allocated Memory : 6.31 GB
   Reserved Memory : 17.76 GB
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 13.00 GB
   Allocated Memory : 6.31 GB
   Reserved Memory : 17.76 GB


KeyboardInterrupt: 

It seems as though reserved memory is extremely high when using beam search. If I have longer input sequences this will lead to out of memory issues. I will try to set number of tokens to a lower number and check if beam search works then. I truncate the text directly because if I truncate after the prompt insertion I will loose the end of the prompt.

In [20]:
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

'[INST]\n<<SYS>>\nIs the MS diagnosis in the text of type "Sekundär progrediente Multiple Sklerose (SPMS)", "primäre progrediente Multiple Sklerose (PPMS)" or "schubförmig remittierende Multiple Sklerose (RRMS)"?\n<</SYS>>\n\nMultiple Sklerose mit sekundär progredientem Verlauf seit ca. 2004 (EM 1983, ED 1996) klinisch: aktuell spastische Hemiparese links, stark eingeschränkte Gehstrecke (Rollator, Rollauto für längere Strecken ), Miktionsstörung, Fatigue-Syndrom, Abduzensparese rechts letzter eindeutiger Schub 2001, 1983 Neuritis optici bds., 1996 sensibler Querschnitt Th5, seit 2004 zunehmende Gehbehinderung (04/2007 EDSS 5.5), deutliche Verschlechterung nach Patellafraktur 2014, seitdem Ausgangniveua nicht erreicht bildgebend: 2005 MRI cerebral sowie HWS und BWS: Cerebral alte Läsion ohne KM-Anreicherung, spinal werden aktive Herde auf Höhe C4 und Th1/2 beschrieben. 01/2006 Kontroll-MRI spinal ohne aktive entzündliche Läsionen. Alte Läsionen sowie eine Verschmächtigung des Rückenmar

In [25]:
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               [memory]         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           0 b           0 b      19.00 Kb      19.00 K

In [74]:
torch.cuda.empty_cache()
check_gpu_memory()

GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 24.12 GB
   Allocated Memory : 6.05 GB
   Reserved Memory : 6.65 GB


In [75]:
outputs = []
print("Memory Consumption before loop\n")
check_gpu_memory()
for idx, batch in enumerate(dataloader):
    
    torch.cuda.empty_cache()
    gc.collect()

    print("Memory Consumption before Batch: ", idx)
    check_gpu_memory()
    
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    with torch.inference_mode():
        generated_ids = model.generate(input_ids = input_ids, attention_mask = attention_mask, max_new_tokens=20, num_beams=2, do_sample=True, temperature = 0.9, num_return_sequences = 1, top_p = 0.6).to("cpu")
    outputs.append(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

Memory Consumption before loop

GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 24.12 GB
   Allocated Memory : 6.05 GB
   Reserved Memory : 6.65 GB
Memory Consumption before Batch:  0
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 24.12 GB
   Allocated Memory : 3.70 GB
   Reserved Memory : 6.65 GB
Memory Consumption before Batch:  1
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 26.85 GB
   Allocated Memory : 3.70 GB
   Reserved Memory : 3.91 GB
Memory Consumption before Batch:  2
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 26.85 GB
   Allocated Memory : 3.70 GB
   Reserved Memory : 3.91 GB
Memory Consumption before Batch:  3
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 26.85 GB
   Allocated Memory : 3.70 GB
   Reserved Memory : 3.91 GB
Memory Consumption before Batch:  4
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 26.85 GB
   Allocated Memory

In [76]:
check_gpu_memory()
torch.cuda.empty_cache()
gc.collect()
check_gpu_memory()

GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 22.73 GB
   Allocated Memory : 3.70 GB
   Reserved Memory : 8.04 GB
GPU 0: Tesla V100-SXM2-32GB
   Total Memory: 31.74 GB
   Free Memory: 26.85 GB
   Allocated Memory : 3.70 GB
   Reserved Memory : 3.91 GB


In [77]:
from itertools import chain
outputs = list(chain.from_iterable(outputs))
pd.Series(outputs).to_csv(paths.RESULTS_PATH/'ms_diag-llama2-chat_zero_shot-shortened300_beam2.csv')

In [79]:
results = [out.split("\nBased on the information provided in the text, the most likely diagnosis for the patient is:")[1] for out in outputs]

In [86]:
set(df["train"]["labels"])

{'primary_progressive_multiple_sclerosis',
 'relapsing_remitting_multiple_sclerosis',
 'secondary_progressive_multiple_sclerosis'}

In [87]:
# Dictionary to map keywords to labels
keyword_label_mapping = {
    "RRMS": 'relapsing_remitting_multiple_sclerosis',
    "SPMS": 'secondary_progressive_multiple_sclerosis',
    "PPMS": 'primary_progressive_multiple_sclerosis',
}

# Function to assign labels based on text content
def assign_label(text):
    for keyword, label in keyword_label_mapping.items():
        if keyword in text:
            return label
    return "unknown"  # Default label if no keyword is found

# Assign labels to each text in the list
labels = [assign_label(text) for text in results]

In [99]:
correct = 0
for i in range(len(labels)):
    if labels[i] == df["train"]["labels"][i]:
        correct += 1
correct/len(labels)

0.6016260162601627