In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorWithPadding

from datasets import DatasetDict, Features, Sequence, Value, load_dataset

import torch
from torch.utils.data import DataLoader
import gc

import os
import sys
sys.path.append(os.getcwd()+"/../..")
from src import paths

from tqdm import tqdm

In [2]:
def check_gpu_memory():
    """Print memory statistics for every visible CUDA device.

    For each device the name, total, free, allocated and reserved memory are
    printed in GB (bytes divided by 1024 ** 3). When CUDA is not available a
    single notice is printed instead.
    """
    if not torch.cuda.is_available():
        print("No GPU available.")
        return

    bytes_per_gb = 1024 ** 3
    for index in range(torch.cuda.device_count()):
        # mem_get_info returns (free, total) in bytes for the given device.
        free_bytes, total_bytes = torch.cuda.mem_get_info(index)
        device_name = torch.cuda.get_device_properties(index).name
        allocated_bytes = torch.cuda.memory_allocated(index)
        reserved_bytes = torch.cuda.memory_reserved(index)

        print(f"GPU {index}: {device_name}")
        print(f"   Total Memory: {total_bytes / bytes_per_gb:.2f} GB")
        print(f"   Free Memory: {free_bytes / bytes_per_gb:.2f} GB")
        print(f"   Allocated Memory : {allocated_bytes / bytes_per_gb:.2f} GB")
        print(f"   Reserved Memory : {reserved_bytes / bytes_per_gb:.2f} GB")


# Report the baseline memory state before anything is loaded.
check_gpu_memory()

GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 10.20 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB
GPU 1: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 10.20 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB


In [3]:
# Low precision config
# NOTE(review): `bnb_config` is constructed here but is not passed to
# `from_pretrained` below, so the load is governed by `torch_dtype=torch.float16`
# only. Confirm whether 4-bit quantization was meant to be enabled.
print("Memory before Model is loaded:\n")
check_gpu_memory()
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model only (the tokenizer is loaded in a separate cell), then print
# the GPU memory again so the footprint of the load can be compared.
model = AutoModelForCausalLM.from_pretrained(paths.MODEL_PATH/'llama2-chat', device_map="auto", torch_dtype=torch.float16)
print("Memory after Model is loaded:\n")
check_gpu_memory()

Memory before Model is loaded:

GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 10.20 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB
GPU 1: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 10.20 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Memory after Model is loaded:

GPU 0: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 3.89 GB
   Allocated Memory : 6.31 GB
   Reserved Memory : 6.31 GB
GPU 1: NVIDIA GeForce RTX 2080 Ti
   Total Memory: 10.75 GB
   Free Memory: 3.89 GB
   Allocated Memory : 6.31 GB
   Reserved Memory : 6.31 GB


In [4]:
# Load the tokenizer with left padding so that, in a padded batch, every prompt
# ends directly where generation starts.
# NOTE(review): the tokenizer is read from 'llama2' while the model is read from
# 'llama2-chat' - presumably both share the same vocabulary; confirm.
tokenizer = AutoTokenizer.from_pretrained(paths.MODEL_PATH/'llama2', padding_side='left')
print("Vocabulary Size without Pad Token: ", len(tokenizer))

# Register "<pad>" as the padding token. Checking only the vocabulary is not
# enough: if "<pad>" were already a vocabulary entry but no pad token were
# configured, `tokenizer.pad_token_id` would stay None and every check below
# would still pass. `add_special_tokens` only grows the vocabulary when the
# token is actually new.
if tokenizer.pad_token is None or '<pad>' not in tokenizer.get_vocab():
    tokenizer.add_special_tokens({"pad_token":"<pad>"})

# Resize the embeddings so the (possibly new) pad token id has an embedding row
model.resize_token_embeddings(len(tokenizer))

# Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id

# Fail early if padding is still unconfigured, then check both sides agree
assert tokenizer.pad_token_id is not None, "The tokenizer has no pad token configured!"
assert model.config.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"

# Print the pad token ids
# (the second and third line both report `model.config.pad_token_id`)
print('Tokenizer pad token ID:', tokenizer.pad_token_id)
print('Model pad token ID:', model.config.pad_token_id)
print('Model config pad token ID:', model.config.pad_token_id)
print("Vocabulary Size with Pad Token: ", len(tokenizer))

Vocabulary Size without Pad Token:  32000
Tokenizer pad token ID: 32000
Model pad token ID: 32000
Model config pad token ID: 32000
Vocabulary Size with Pad Token:  32001


In [6]:
# Load data
data_files = {"train": "ms-diag_clean_train.csv", "validation": "ms-diag_clean_val.csv", "test": "ms-diag_clean_test.csv"}
df = load_dataset(os.path.join(paths.DATA_PATH_PREPROCESSED,'ms-diag'), data_files = data_files)

# Number of labels
num_labels = len(set(df['train']['labels']))

# Label to id
label2id = {'primary_progressive_multiple_sclerosis': 0,
            'relapsing_remitting_multiple_sclerosis': 1,
            'secondary_progressive_multiple_sclerosis': 2}
id2label = {v:k for k,v in label2id.items()}

BASE_PROMPT = "<s>[INST]\n<<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt}[/INST]\n\n{answer_init}"
SYSTEM_PROMPT = "Generiere einen fast identischen Text und behalte die genaue Diagnose der multiplen Sklerose bei."
ANSWER_INIT = "Generierung: "

def construct_prompt(class_id:int, truncation_size:int=300) -> list[str]:
    """Build one generation prompt per training example of the given class.

    The training split of the module-level `df` is filtered to the rows whose
    "labels" value equals `id2label[class_id]`; each row's "text" is truncated
    and inserted into `BASE_PROMPT` together with `SYSTEM_PROMPT` and
    `ANSWER_INIT`.

    Args:
        class_id (int): Key into `id2label` selecting the class to use.
        truncation_size (int): Maximum number of characters of the example
            text that is kept in the prompt.

    Returns:
        list[str]: The formatted prompts, in dataset order.

    Raises:
        KeyError: If `class_id` is not a key of `id2label`.
    """
    # Resolve the label once instead of on every row inside the filter.
    label = id2label[class_id]

    def format_prompt(text:str)->str:
        """Truncates the text to the given truncation size and formats the prompt.

        Args:
            text (str): Text

        Returns:
            str: Returns the formatted prompt
        """
        # Slicing is a no-op for texts shorter than the limit, so no length
        # check is needed.
        return BASE_PROMPT.format(system_prompt = SYSTEM_PROMPT,
                                  user_prompt = text[:truncation_size],
                                  answer_init = ANSWER_INIT)

    texts = df["train"].filter(lambda example: example["labels"] == label)["text"]
    return [format_prompt(text) for text in texts]

# Collator that pads every batch through the tokenizer (padding=True)
collate_fn = DataCollatorWithPadding(tokenizer, padding=True) #padding=True, 'max_length'

# Prompts for class id 0 (primary progressive) and class id 2 (secondary progressive)
prompts_ppms = construct_prompt(0, truncation_size = 300)
prompts_spms = construct_prompt(2, truncation_size = 300)


def _make_dataloader(prompts):
    """Tokenize each prompt on its own and batch them in pairs, in order."""
    encoded_prompts = [tokenizer(prompt) for prompt in prompts]
    return DataLoader(dataset=encoded_prompts, collate_fn=collate_fn, batch_size=2, shuffle=False)


dataloader_ppms = _make_dataloader(prompts_ppms)
dataloader_spms = _make_dataloader(prompts_spms)


In [10]:
# Target device for the model inputs: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
def _as_single_constraint(phrases):
    """Tokenize the phrases without special tokens and wrap the resulting id lists in one outer list.

    The nested shape (one outer list holding one id list per phrase) is what is
    handed to `force_words_ids`; per the transformers documentation such a
    group is treated as alternatives of which one has to appear.
    """
    phrase_ids = tokenizer(phrases, add_special_tokens=False)["input_ids"]
    return [phrase_ids]


# Accepted spellings of the diagnosis that the generated text must contain.
force_words_ppms = ["primäre multiple sklerose", "primär progressive multiple sklerose", "primär progrediente multiple sklerose"]
force_words_ids_ppms = _as_single_constraint(force_words_ppms)
force_words_spms = ["sekundär progressive multiple sklerose", "sekundär progrediente multiple sklerose"]
force_words_ids_spms = _as_single_constraint(force_words_spms)

In [12]:
# Smoke test: generate for the first secondary-progressive prompt and print the
# decoded text (prompt included) to inspect the output format.
test = {
    name: tensor.to(device)
    for name, tensor in tokenizer(prompts_spms[0], return_tensors = "pt").items()
}
test_ids = model.generate(
    **test,
    max_new_tokens = 128,
    temperature = 1,
    num_beams = 2,
    do_sample = True,
    force_words_ids = force_words_ids_spms,
).to("cpu")
print(tokenizer.batch_decode(test_ids, skip_special_tokens = True))

['[INST]\n<<SYS>>\nGeneriere einen fast identischen Text und behalte die genaue Diagnose der multiplen Sklerose bei.\n<</SYS>>\n\nMultiple Sklerose mit sekundär progredientem Verlauf seit ca. 2004 (EM 1983, ED 1996) klinisch: aktuell spastische Hemiparese links, stark eingeschränkte Gehstrecke (Rollator, Rollauto für längere Strecken ), Miktionsstörung, Fatigue-Syndrom, Abduzensparese rechts letzter eindeutiger Schub 2001, 198[/INST]\n\nGenerierung:  sekundär progrediente multiple sklerose seit ca. 2004 (EM 1983, ED 1996)\n\nKlinische Symptome:\n\n* Spastische Hemiparese links (seit ca. 2004)\n* Starke Einschränkung der Gehstrecke (Rollator, Rollauto für längere Strecken)\n* Miktionsstörung\n* Fatigue-Syndrom\n* Abduzensparese rechts (letzter eindeut']


In [27]:
# Drop the smoke-test output ids; they were already moved to the CPU by
# `.to("cpu")` when they were created.
del test_ids

In [14]:
# Generate artificial secondary-progressive texts: every batch of prompts is
# run through the model `num_generation_rounds` times and the decoded texts
# (one list per batch) are collected in `generated_samples`.
# NOTE(review): `force_words_ids` selects constrained beam search in
# `generate`. Depending on the installed transformers version, `do_sample=True`
# is either rejected or has no effect in that mode - confirm that repeated
# rounds actually yield different texts.
num_generation_rounds = 20
generated_samples = []
failed_batches = 0

for _round in tqdm(range(num_generation_rounds)):
    for batch in dataloader_spms:
        batch = {k:v.to(device) for k,v in batch.items()}
        try:
            generated_ids = model.generate(**batch,
                                           max_new_tokens=128,
                                           num_beams=2,
                                           do_sample=True,
                                           num_return_sequences=1,
                                           temperature = 1,
                                           top_p = 0.8,
                                           force_words_ids = force_words_ids_spms).to("cpu")
            generated_samples.append(tokenizer.batch_decode(generated_ids, skip_special_tokens = True))
        except Exception as err:
            # Best effort: skip the failing batch, but report it instead of
            # hiding it. `Exception` (unlike a bare `except`) lets
            # KeyboardInterrupt through, so the loop can still be stopped.
            failed_batches += 1
            tqdm.write(f"Skipping batch after generation error: {err!r}")
            continue

if failed_batches:
    print(f"{failed_batches} batch(es) were skipped because generation failed.")
    

100%|██████████| 20/20 [06:20<00:00, 19.03s/it]


In [15]:
from itertools import chain

# Text that separates the echoed prompt from the model's continuation.
GENERATION_MARKER = "[/INST]\n\nGenerierung: "


def _strip_prompt(decoded: str) -> str:
    """Return the part of a decoded sequence that follows the first generation marker.

    Only the first marker is used as the cut point, so a continuation that
    happens to repeat the marker is kept in full.

    Raises:
        ValueError: If the marker does not occur in `decoded`.
    """
    _prompt, marker, generated = decoded.partition(GENERATION_MARKER)
    if not marker:
        raise ValueError(f"Generation marker not found in decoded text: {decoded[:80]!r}")
    return generated


# Flatten the per-batch lists and keep only the generated continuation.
outputs = [_strip_prompt(text) for text in chain.from_iterable(generated_samples)]

In [16]:
# Show the size and a short preview instead of the whole list: the full dump
# bloats the notebook and stores every generated clinical text in its output.
print(f"{len(outputs)} generated texts")
outputs[:5]

[' sekundär progrediente multiple sklerose seit ca. 2004 (EM 1983, ED 1996)\n\nKlinische Symptome:\n\n* Spastische Hemiparese links (seit ca. 2004)\n* Starke Einschränkung der Gehstrecke (Rollator, Rollauto für längere Strecken)\n* Miktionsstörung\n* Fatigue-Syndrom\n* Abduzensparese rechts (letzter eindeut',
 ' sekundär progressive multiple sklerose (ED 08/2003), EDSS 7.5\n\nVerlauf:\n\n* INDENT 1992: mögliche pontocerebelläre Störung mit Drehschwindel\n* INDENT 06/2003: Schub mit sensomotorischem Hemisyndrom rechts, nur inkompletter Remission\n* INDENT 11/2010: Schub mit akuter Verschlechterung der Hemi\n*',
 ' sekundär progressive multiple sklerose seit 2008, EM 1999, ED 2002, EDSS 6.5 (aktuell nicht beurteilbar)\n\nKlinische Symptome:\n\n* Rechtseitiger Beinbentonte spastische Tetraparese\n* Blasen- und Harninkontinenz\n\nDiagnose:\n\n* Multiple Sklerose (MS) mit sekundär chronisch progredientem Verlauf\n* EM 1999, ED 2',
 ' sekundär progressive multiple sklerose (SPMS) ab ca. 2011

In [17]:
import pandas as pd

# Write the generated texts as a single pandas Series (default index and header)
# next to the preprocessed ms-diag data.
artificial_spms_path = paths.DATA_PATH_PREPROCESSED/'ms-diag/artificial_spms.csv'
artificial_spms = pd.Series(outputs)
artificial_spms.to_csv(artificial_spms_path)