<a href="https://colab.research.google.com/github/nnilayy/MedGPT/blob/main/Stress_Testing_GPUs_Code_with_30b_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Default Kaggle notebook preamble. The environment is defined by the
# kaggle/python Docker image (https://github.com/kaggle/docker-python) and
# ships with many analytics libraries pre-installed, for example the ones below.

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Input data files live in the read-only "../input/" directory.
# Walk it and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as
# output when a version is created using "Save & Run All".
# Temporary files can be written to /kaggle/temp/, but they are not saved
# outside of the current session.

In [None]:
!pip install transformers

In [None]:
# High-level helper: build a text-generation pipeline for BioMistral-7B.
from transformers import pipeline

pipe = pipeline(task="text-generation", model="BioMistral/BioMistral-7B")

# Sample chat-style input for the smoke test below (currently disabled).
messages = [{"role": "user", "content": "Who are you?"}]
# pipe(messages)

In [None]:
import os
import psutil
import platform

# GPUtil is optional. Bind the name to None when it is missing so the GPU
# section below can test it directly instead of probing globals().
try:
    import GPUtil
except ImportError:
    GPUtil = None
    print("GPUtil module not installed. GPU information will not be available.")


def _bytes_to_gb(num_bytes):
    """Convert a byte count to GB (1024 ** 3 bytes), rounded to 2 decimals."""
    return round(num_bytes / (1024 ** 3), 2)


def _mb_to_gb(num_mb):
    """Divide by 1024 and round to 2 decimals (used for the GPUtil memory fields)."""
    return round(num_mb / 1024, 2)


# CPU Information
cpus = os.cpu_count()
print("Logical CPUs:", cpus)
print("Physical CPUs:", psutil.cpu_count(logical=False))

# System Memory
ram = psutil.virtual_memory()
print("Total RAM (GB):", _bytes_to_gb(ram.total))
print("Available RAM (GB):", _bytes_to_gb(ram.available))
print("Used RAM (GB):", _bytes_to_gb(ram.used))

# Disk Information
print("\nDisk Information:")
for partition in psutil.disk_partitions():
    try:
        usage = psutil.disk_usage(partition.mountpoint)
        print(f"  Mountpoint: {partition.mountpoint}")
        print(f"    Total Size (GB): {_bytes_to_gb(usage.total)}")
        print(f"    Used Space (GB): {_bytes_to_gb(usage.used)}")
        print(f"    Free Space (GB): {_bytes_to_gb(usage.free)}")
        print(f"    Percentage Used: {usage.percent}%")
    except PermissionError:
        # Some mountpoints are not readable by the notebook user; report and move on.
        print(f"  No Permission to access {partition.mountpoint}")

# GPU Information (only when GPUtil could be imported)
if GPUtil is None:
    print("GPU information not available due to missing GPUtil.")
else:
    gpus = GPUtil.getGPUs()
    if not gpus:
        print("No GPU found or GPUtil cannot find it.")
    for i, gpu in enumerate(gpus):
        print(f"\nGPU {i}: {gpu.name}")
        print(f"  Total VRAM (GB): {_mb_to_gb(gpu.memoryTotal)}")
        print(f"  Used VRAM (GB): {_mb_to_gb(gpu.memoryUsed)}")
        print(f"  Free VRAM (GB): {_mb_to_gb(gpu.memoryFree)}")
        # gpu.load is scaled by 100 to get a percentage; round so float noise
        # such as 7.000000000000001 is not printed.
        print(f"  GPU Load (%): {round(gpu.load * 100, 1)}")


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Candidate checkpoints; uncomment an entry and point `model_to_load` at its
# index to try a different model.
checkpoint = [
    "BioMistral/BioMistral-7B",
    # "uygarkurt/llama-3-merged-linear",
    # "01-ai/Yi-1.5-9B",
    # "facebook/opt-30b"
]
model_to_load = 0
selected_checkpoint = checkpoint[model_to_load]

# Device that input tensors are moved to in later cells.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Half-precision weights, with layer placement delegated to device_map='auto'.
model_load_options = {
    'torch_dtype': torch.float16,
    'device_map': 'auto',
}
model = AutoModelForCausalLM.from_pretrained(selected_checkpoint, **model_load_options)

tokenizer = AutoTokenizer.from_pretrained(selected_checkpoint, return_token_type_ids=False)

In [None]:
# Switch off the memory-efficient and flash scaled-dot-product-attention
# backends (same order as before: memory-efficient first, then flash).
# NOTE(review): the notebook does not say why these are disabled — confirm.
for _set_backend_enabled in (
    torch.backends.cuda.enable_mem_efficient_sdp,
    torch.backends.cuda.enable_flash_sdp,
):
    _set_backend_enabled(False)

## Datasets

In [None]:
from datasets import load_dataset

In [None]:
# PubmedQA: https://huggingface.co/datasets/qiaojin/PubMedQA
# Load the three configurations in the order artificial, labeled, unlabeled.
PUBMEDQA_REPO = "qiaojin/PubMedQA"
pubmedqa_artificial, pubmedqa_labeled, pubmedqa_unlabeled = (
    load_dataset(PUBMEDQA_REPO, config_name)
    for config_name in ("pqa_artificial", "pqa_labeled", "pqa_unlabeled")
)
pubmedqa_artificial

Downloading readme:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/233M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211269 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/66.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61249 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 211269
    })
})

In [None]:
def _join_contexts(example):
    """Return the example's context passages joined into one space-separated string."""
    return {'finalized_context': " ".join(example['context']['contexts'])}


# Work on the train split of the artificial subset, with the joined context
# added as a new `finalized_context` column.
dataset = pubmedqa_artificial['train'].map(_join_contexts)
print(dataset)

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'finalized_context'],
    num_rows: 211269
})


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM
import gc

tokenizer = AutoTokenizer.from_pretrained("BioMistral/BioMistral-7B", return_token_type_ids=True)
# Reuse EOS as the padding token so padding='max_length' has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Integer ids for the textual decisions; raw 'yes'/'no' strings cannot be
# turned into label tensors.
label_to_idx = {'yes': 1, 'no': 0}

def encode(dataset):
    """Tokenize one batch of question/context pairs and attach integer labels.

    Args:
        dataset: a batch passed in by `Dataset.map(batched=True)`, i.e. a mapping
            from column name to a list of values (not the global `dataset`).

    Returns:
        The tokenizer output with an added `labels` list of class ids.

    Raises:
        KeyError: if a `final_decision` value is not a key of `label_to_idx`.
    """
    outputs = tokenizer(dataset['question'], dataset['finalized_context'], truncation=True, padding='max_length', max_length=1024)
    outputs['labels'] = [label_to_idx[label] for label in dataset['final_decision']]
    return outputs

dataset = dataset.map(encode,
            batched=True,
            batch_size=1000,
            num_proc=16,
            keep_in_memory=True,
#             cache_file_name="./dataset_cache"
           )

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding

checkpoint = 'BioMistral/BioMistral-7B'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Pad with the EOS token.
tokenizer.pad_token = tokenizer.eos_token

# Class id for each textual decision.
label_to_idx = {'yes': 1, 'no': 0}


def encode(examples):
    """Tokenize a batch of question/context pairs and add integer `labels`.

    `examples` maps column names to lists of values. Every pair is truncated
    and padded to 1024 tokens; each `final_decision` is looked up in
    `label_to_idx`.
    """
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=1024)
    outputs = tokenizer(examples['question'], examples['finalized_context'], **tokenizer_options)
    outputs['labels'] = list(map(label_to_idx.__getitem__, examples['final_decision']))
    return outputs


map_options = dict(batched=True, batch_size=1000, num_proc=16, keep_in_memory=True)
dataset = dataset.map(encode, **map_options)

# Drop the raw columns and expose only the model inputs as PyTorch tensors.
raw_columns = ['pubid', 'question', 'long_answer', 'final_decision', 'finalized_context', 'context']
dataset = dataset.remove_columns(raw_columns)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
print(dataset)

Map (num_proc=16):   0%|          | 0/211269 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = 'BioMistral/BioMistral-7B'

# Half-precision weights, with placement delegated to device_map='auto'.
# NOTE(review): num_labels=2 is passed to a causal-LM class here, while the
# labels prepared earlier are class ids; a sequence-classification model was
# presumably intended — confirm before training.
model_load_options = {
    'torch_dtype': torch.float16,
    'device_map': 'auto',
    'num_labels': 2,
}
model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_load_options)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Pad with the EOS token.
tokenizer.pad_token = tokenizer.eos_token

def compute_metrics(eval_pred):
    """Compute classification accuracy from a `(logits, labels)` pair.

    Both elements must be NumPy arrays. The predicted class is the argmax over
    the last axis of the logits.

    Returns:
        A dict with a single key, 'accuracy', holding a Python float.
    """
    raw_logits, raw_labels = eval_pred
    predicted = torch.from_numpy(raw_logits).argmax(dim=-1)
    matches = predicted == torch.from_numpy(raw_labels)
    return {'accuracy': matches.float().mean().item()}

data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

# Training arguments
# NOTE(review): the model is loaded earlier in float16 and fp16 mixed precision
# is also requested here; that combination commonly fails when gradients are
# unscaled — confirm, and load in float32 or disable fp16 if so.
training_args = TrainingArguments(
    output_dir ='./results',
    run_name = 'run_8',
    num_train_epochs = 1,
    per_device_train_batch_size = 2,
    save_total_limit = 3,
    # No eval_dataset is passed to the Trainer below, so evaluation cannot run.
    # Switch back to "epoch" once an eval_dataset is supplied.
    eval_strategy = "no",
    save_strategy = "epoch",
    # label_names lists the input keys that hold labels, not the class names.
    label_names = ['labels'],
    load_best_model_at_end = False,
    fp16 = torch.cuda.is_available()
)


# NOTE(review): `model` is created with AutoModelForCausalLM earlier in the
# notebook while the labels are class ids; a sequence-classification model is
# presumably needed for this training run — confirm.
trainer = Trainer(
    model = model,
    args = training_args,
    # `dataset` is already the train split (a Dataset, not a DatasetDict), so it
    # is passed directly; indexing it with 'train' would be a column lookup.
    train_dataset = dataset,
#     eval_dataset = dataset['validation'],
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

trainer.train()

# evaluation_results = trainer.evaluate()
# print(evaluation_results)

In [None]:
# MedMCQA: https://huggingface.co/datasets/openlifescienceai/medmcqa
# Load every available split of the dataset from the Hub.
medmcqa = load_dataset(path="openlifescienceai/medmcqa")

In [None]:
# Bind each MedMCQA split to its own name, then display the full DatasetDict.
medmcqa_train, medmcqa_test, medmcqa_validation = (
    medmcqa[split_name] for split_name in ("train", "test", "validation")
)
medmcqa

In [None]:
# MedQA: https://huggingface.co/datasets/GBaker/MedQA-USMLE-4-options-hf
medqa = load_dataset(path="GBaker/MedQA-USMLE-4-options-hf")

# Bind each split to its own name, then display the test split.
medqa_train, medqa_test, medqa_validation = (
    medqa[split_name] for split_name in ("train", "test", "validation")
)
medqa_test

## Inference Code

In [None]:
# Show the column schema of the artificial subset's train split.
# pubmedqa_artificial['train'][0]
pubmedqa_train_split = pubmedqa_artificial['train']
pubmedqa_train_split.features

In [None]:
# Show the final decision recorded for the first training example.
# pubmedqa_artificial['train'][0]
first_example = pubmedqa_artificial['train'][0]
first_example['final_decision']

In [None]:
# Pick one training example and pull out the fields used for prompting.
index = 121
sample = pubmedqa_artificial['train'][index]

question = sample['question']
context = " ".join(sample['context']['contexts'])  # context passages joined by spaces
answer = sample['final_decision']

print(f"""
Question: {question}
Answer: {answer}
""")

In [None]:
# Few-shot prompt: four worked examples followed by the question selected above.
input_text = f"""
You Are a Soldier Going to Write an Exam. You have to read the Question very Properly and Answer the Question,
to your best Knowledge. If you fail to answer anything apart from "yes", "no", "maybe" options
Your Entire Family Dies. Answer the Following Question with just one word. Remember Just One Word

Here are Four examples as to how you should answer the mentioned Question:
Question: Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?
Options:
(a)yes
(b)no
(c)maybe
Answer: (a)yes

Question: Does vagus nerve contribute to the development of steatohepatitis and obesity in phosphatidylethanolamine N-methyltransferase deficient mice?
Options:
(a)yes
(b)no
(c)maybe
Answer: (a)yes

Question: Does 1+2=2?
Options:
(a)yes
(b)no
(c)maybe
Answer: (b)no

Question: Does Pythagoras Theorem indicate that in a Right Angled Triange a^2+b^2=c^2 where c is the hypoteneuse?
Options:
(a)yes
(b)no
(c)maybe
Answer: (a)yes

Now Like the Above question, Answer the following question with just one answer, yes, no maybe to your correct knowledge

Question: {question}
Options:
(a)yes
(b)no
(c)maybe
"""
tokenizer.pad_token = tokenizer.eos_token

# Tokenize with the callable API so the attention mask is returned as well.
# Because pad and EOS are the same token, generate() cannot infer the mask
# from the ids alone.
encoding = tokenizer(input_text, return_tensors='pt').to(device)
encoded_input = encoding['input_ids']
output_sequences = model.generate(
    pad_token_id=tokenizer.pad_token_id,
    input_ids=encoded_input,
    attention_mask=encoding['attention_mask'],
    do_sample=True,
    max_length=1024,  # counts the prompt tokens as well as the generated ones
    num_return_sequences=1
)

# Full decoded text (prompt + continuation).
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

# Keep only the newly generated tokens. Stripping the prompt with
# str.replace() silently does nothing whenever decoding does not reproduce the
# prompt byte-for-byte, so slice by prompt length instead.
prompt_length = encoded_input.shape[-1]
output = tokenizer.decode(output_sequences[0][prompt_length:], skip_special_tokens=True)
print(output)

In [None]:
import os

from huggingface_hub import login, notebook_login

# SECURITY: never hard-code an access token in the notebook. The token that was
# previously committed in this cell must be treated as compromised and revoked
# in the Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable when it is set;
# otherwise fall back to the interactive login widget.
hugging_face_token = os.environ.get("HF_TOKEN")
if hugging_face_token:
    login(token=hugging_face_token)
else:
    notebook_login()

In [None]:
# Upload the model first, then the tokenizer, to the same Hub repository.
HUB_REPO_ID = 'nnilayy/biomistral-unmodified'
for artifact in (model, tokenizer):
    artifact.push_to_hub(HUB_REPO_ID)

In [None]:
import gc
import torch

# Freeing GPU memory (maybe): currently disabled.
# del model
# del tokenizer
# torch.cuda.empty_cache()

# Run a garbage-collection pass to release unreferenced objects in CPU memory.
gc.collect()

In [None]:
# Loading a model on the CPU can crash the notebook; instead, load it on the GPUs by passing
# device_map="cuda"/"auto" when loading the pretrained model.
# When models are loaded onto the GPUs, loading several models one after another does not crash the notebook; the space occupied by the previous model is simply freed up.

# When you load a model, you can place it either on the CPU or on a GPU. After loading, the model
# persists in CPU/GPU memory (TODO: the original note breaks off mid-sentence here, after "and not").

# When device_map is set to "auto", the majority of the model is loaded onto the GPUs; once the GPUs
# reach their limit, the rest of the model is loaded onto the CPU. If the model is still too big,
# the CPU's RAM fills up completely and the notebook crashes, so the model ultimately cannot be
# loaded into memory.


# Interesting observation: on the first load of the 30-billion-parameter model, the notebook crashed;
# on the second load, the model loaded into memory without problems and without using any of the CPU's RAM.
