<a href="https://colab.research.google.com/github/nnilayy/MedGPT/blob/main/peft_deepspeed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install/upgrade the training stack. `%pip` (unlike `!pip`) always installs into the
# environment of the running kernel, so the packages are importable from this notebook.
%pip install -U bitsandbytes evaluate datasets transformers peft deepspeed triton wandb

In [None]:
%%bash
# Refresh the apt package index, then install the libaio development package
# without an interactive confirmation prompt.
sudo apt-get update
sudo apt-get install --yes libaio-dev

In [None]:
# flash-attn builds against the already-installed torch, so its install instructions
# require disabling pip's build isolation. `%pip` targets the running kernel's environment.
%pip install flash-attn --no-build-isolation

In [None]:
import os

from huggingface_hub import login, notebook_login

# Never commit access tokens: read the token from the environment instead of hardcoding it.
# NOTE(review): the token that used to be embedded in this cell was published with the
# notebook and should be revoked.
hugging_face_token = os.environ.get("HF_TOKEN")
if hugging_face_token:
    # Non-interactive login, suitable for "Run All" executions.
    login(token=hugging_face_token)
else:
    # No token configured: fall back to the interactive prompt.
    notebook_login()

In [None]:
import os

import wandb

# Never commit API keys: take the key from the environment instead of hardcoding it.
# NOTE(review): the key that used to be embedded in this cell was published with the
# notebook and should be revoked.
# When the variable is unset, `key=None` leaves credential lookup to wandb itself.
wandb_api_token = os.environ.get("WANDB_API_KEY")
wandb.login(key=wandb_api_token)

In [None]:
%%bash
# Write the DeepSpeed configuration file ds_config_zero3.json into the current working directory.
# The quoted heredoc delimiter ('EOT') disables shell expansion, so the JSON below is written verbatim.
# Contents: ZeRO stage 3 with optimizer state and parameters offloaded to CPU (pinned memory),
# an AdamW optimizer, a WarmupLR scheduler and a "fixed"-mode sparse_attention section.
# NOTE(review): the only visible reference to this file is a commented-out `deepspeed=` argument
# in the first trainer.py cell, so as shown the config is written but not consumed.
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },

        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },

        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "sparse_attention": {
        "mode": "fixed",
        "block": 16,
        "different_layout_per_head": true,
        "num_local_blocks": 4,
        "num_global_blocks": 1,
        "attention": "bidirectional",
        "horizontal_global_attention": false,
        "num_different_global_patterns": 4,
        "num_random_blocks": 0,
        "local_window_blocks": [4],
        "global_block_indices": [0],
        "num_sliding_window_blocks": 3
  },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT

In [None]:
%%bash
cat <<'EOT' > trainer.py

# --------------------------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------------------------
import os
import multiprocessing
from transformers import BertTokenizer, BitsAndBytesConfig, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import warnings
warnings.filterwarnings("ignore")
from accelerate import PartialState
from accelerate.logging import get_logger

def main():
    """Fine-tune a checkpoint on GLUE/MRPC paraphrase classification with LoRA.

    Loads the selected checkpoint in fp16, attaches a LoRA adapter configured for
    sequence classification, tokenizes the MRPC sentence pairs to fixed 128-token
    inputs, then trains with the Hugging Face ``Trainer`` and evaluates accuracy on
    the validation split after every epoch.
    """
    checkpoint = ["bert-base-uncased",
                 "BioMistral/BioMistral-7B",
                 "bigscience/bloom-3b",
                 ]

    index = 2
    # Optional 8-bit loading (disabled):
    #     bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    #     ... quantization_config=bnb_config, device_map={"": PartialState().process_index}
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint[index],
        num_labels=2,
        torch_dtype=torch.float16,  # halves the GPU memory taken by the frozen base weights
        low_cpu_mem_usage=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint[index])
    if tokenizer.pad_token is None:
        # Only add a pad token when the tokenizer has none; a newly added token needs an
        # embedding row, otherwise its id would index past the embedding matrix.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id
    if model.config.pad_token_id is None:
        # Sequence-classification heads use the pad id to handle padded batches.
        model.config.pad_token_id = tokenizer.pad_token_id

    # `task_type` (not `peft_type`, which LoraConfig always overrides with LORA) selects
    # the sequence-classification wrapper, which also keeps the classification head trainable.
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=64,
        lora_alpha=2048,
        lora_dropout=0.1,
        bias="none",
    )

    model = get_peft_model(model, peft_config)

    # The base weights are fp16, but fp16 mixed precision needs the *trainable* weights
    # in fp32: the gradient scaler refuses to unscale fp16 gradients.
    for param in model.parameters():
        if param.requires_grad:
            param.data = param.data.to(torch.float32)

    model.print_trainable_parameters()

    def compute_metrics(eval_pred):
        """Return ``{'accuracy': float}`` from the (logits, labels) arrays given by the Trainer."""
        logits, labels = eval_pred
        logits = torch.from_numpy(logits)
        labels = torch.from_numpy(labels)

        predictions = torch.argmax(logits, dim=-1)
        accuracy = (predictions == labels).float().mean()
        return {'accuracy': accuracy.item()}

    def encode(examples):
        """Tokenize a batch of sentence pairs to 128 tokens and copy ``label`` to ``labels``."""
        outputs = tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=128)
        outputs['labels'] = examples['label']
        return outputs

    # Dataset: cap the worker count at the number of CPUs actually available.
    dataset = load_dataset('glue', 'mrpc')
    dataset = dataset.map(encode, batched=True, num_proc=min(12, os.cpu_count() or 1))
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs_rslora',
        do_train=True,
        do_eval=True,
        num_train_epochs=10,
        learning_rate=2e-4,
        logging_strategy='epoch',
        per_device_train_batch_size=20,
        per_device_eval_batch_size=20,
        save_total_limit=1,
        dataloader_num_workers = 8,
        dataloader_pin_memory = True,
        dataloader_prefetch_factor = 4,
        save_strategy="epoch",
        eval_strategy="epoch",
        label_names = ["labels"], # Without this, validation accuracy and validation loss are not logged
        fp16=True,
        ddp_find_unused_parameters = False,
        # Disabled experiments:
        #     gradient_checkpointing=True,
        #     gradient_accumulation_steps=4,
        #     deepspeed = "/kaggle/working/ds_config_zero3.json",
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model, then run a final evaluation pass
    trainer.train()
    trainer.evaluate()

if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------------------------
EOT

In [None]:
%%bash
cat <<'EOT' > trainer.py

# --------------------------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------------------------
import os
import multiprocessing
from transformers import BertTokenizer, BitsAndBytesConfig, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from accelerate import PartialState
from accelerate.logging import get_logger

class SparseAttention(nn.Module):
    """Multi-head self-attention restricted to a strided subset of key positions.

    Every query may only attend to keys whose index is a multiple of
    ``sparsity_factor`` (0, sparsity_factor, 2 * sparsity_factor, ...). Keys flagged
    as padding by an optional ``attention_mask`` are excluded as well.

    Args:
        hidden_size: Width of the input/output features; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        sparsity_factor: Stride between the key positions that stay visible (>= 1).

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_heads`` or
            ``sparsity_factor`` is smaller than 1.
    """

    def __init__(self, hidden_size, num_heads, sparsity_factor=2):
        super().__init__()
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )
        if sparsity_factor < 1:
            raise ValueError(f"sparsity_factor must be >= 1, got {sparsity_factor}")

        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.head_dim = hidden_size // num_heads
        self.sparsity_factor = sparsity_factor

        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states, attention_mask=None):
        """Apply strided sparse attention.

        Args:
            hidden_states: Tensor of shape (batch, seq, hidden_size).
            attention_mask: Optional (batch, seq) tensor where 0 marks padding keys
                that must not be attended to. ``None`` keeps every strided key.

        Returns:
            Tuple ``(output, weights)`` with shapes (batch, seq, hidden_size) and
            (batch, num_heads, seq, seq).
        """
        batch_size, seq_length, _ = hidden_states.size()

        def split_heads(projected):
            # (batch, seq, hidden) -> (batch, heads, seq, head_dim)
            return projected.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        query = split_heads(self.query(hidden_states))
        key = split_heads(self.key(hidden_states))
        value = split_heads(self.value(hidden_states))

        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # One boolean per key position, broadcast over batch, heads and queries,
        # instead of materialising a full (batch, heads, seq, seq) mask.
        blocked = torch.ones(seq_length, dtype=torch.bool, device=scores.device)
        blocked[::self.sparsity_factor] = False
        blocked = blocked.view(1, 1, 1, seq_length)

        if attention_mask is not None:
            padding = attention_mask.to(scores.device)[:, None, None, :] == 0
            blocked = blocked | padding

        # The dtype's most negative finite value gives blocked keys zero weight after the
        # softmax while keeping a fully blocked row finite (uniform) instead of NaN.
        scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        attn_weights = F.softmax(scores, dim=-1)
        attn_output = torch.matmul(attn_weights, value)

        merged = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_length, self.hidden_size)
        return merged, attn_weights

class CustomModelWithSparseAttention(nn.Module):
    """Minimal classifier: token embedding -> sparse attention -> dense -> linear head.

    ``config`` must provide ``vocab_size``, ``hidden_size``, ``num_attention_heads``
    and ``num_labels``. The logits are computed from the first token's hidden state.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.sparse_attention = SparseAttention(config.hidden_size, config.num_attention_heads)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
                inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None,
                return_dict=None):
        """Classify a batch of sequences.

        ``token_type_ids`` and ``position_ids`` are accepted for signature compatibility
        but are not used. ``attention_mask`` (0 = padding) is forwarded to the attention
        layer so padded positions are never attended to.

        Returns:
            With a truthy ``return_dict``: a dict with keys ``loss``, ``logits``,
            ``hidden_states`` and ``attentions``. Otherwise a tuple
            ``([loss,] logits[, hidden_states][, attentions])`` where the optional
            entries follow ``labels``, ``output_hidden_states`` and ``output_attentions``.

        Raises:
            ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if input_ids is not None:
            embeddings = self.embeddings(input_ids)
        elif inputs_embeds is not None:
            embeddings = inputs_embeds
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        attention_output, attention_weights = self.sparse_attention(embeddings, attention_mask=attention_mask)
        hidden_states = self.dense(attention_output)
        logits = self.classifier(hidden_states[:, 0, :])  # Use [CLS] token for classification

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,)
            if output_hidden_states:
                output += (hidden_states,)
            if output_attentions:
                output += (attention_weights,)
            return ((loss,) + output) if loss is not None else output

        return {'loss': loss, 'logits': logits, 'hidden_states': hidden_states, 'attentions': attention_weights}

def main():
    """Train the custom sparse-attention classifier on GLUE/MRPC with LoRA adapters.

    Builds ``CustomModelWithSparseAttention`` from the configuration of the selected
    checkpoint (only the config and tokenizer are taken from it, no pretrained
    weights), applies LoRA to the ``dense`` and ``classifier`` layers, then trains
    with the Hugging Face ``Trainer`` and evaluates accuracy on the validation split
    after every epoch.
    """
    checkpoint = ["bert-base-uncased",
                 "BioMistral/BioMistral-7B",
                 "bigscience/bloom-3b",
                 ]

    index = 0  # Using BERT base uncased
    config = AutoConfig.from_pretrained(checkpoint[index])
    config.num_labels = 2

    model = CustomModelWithSparseAttention(config)

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=1,
        lora_alpha=2048,
        lora_dropout=0.1,
        target_modules=["dense", "classifier"]
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    def compute_metrics(eval_pred):
        """Return ``{'accuracy': ...}``; accepts bare logits or a tuple whose first item is the logits."""
        logits, labels = eval_pred
        if isinstance(logits, tuple):  # If model returns a tuple, take the first element as logits
            logits = logits[0]
        predictions = np.argmax(logits, axis=-1)
        accuracy = (predictions == labels).mean()
        return {'accuracy': accuracy}

    def encode(examples):
        """Tokenize a batch of sentence pairs to 128 tokens and copy ``label`` to ``labels``."""
        outputs = tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=128)
        outputs['labels'] = examples['label']
        return outputs

    tokenizer = AutoTokenizer.from_pretrained(checkpoint[index])
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Cap the tokenization worker count at the number of CPUs actually available.
    dataset = load_dataset('glue', 'mrpc')
    dataset = dataset.map(encode, batched=True, num_proc=min(12, os.cpu_count() or 1))
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Kept from the original script; gradient tracking is already on unless disabled elsewhere.
    torch.set_grad_enabled(True)
    training_args = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs_rslora',
        do_train=True,
        do_eval=True,
        num_train_epochs=10,
        learning_rate=2e-4,
        logging_strategy='epoch',
        per_device_train_batch_size=20,
        per_device_eval_batch_size=20,
        save_total_limit=1,
        dataloader_num_workers=8,
        dataloader_pin_memory=True,
        dataloader_prefetch_factor=4,
        save_strategy="epoch",
        # `eval_strategy` replaces the deprecated `evaluation_strategy` and matches the
        # spelling used by the other training scripts in this notebook.
        eval_strategy="epoch",
        label_names=["labels"],
        fp16=True,
        ddp_find_unused_parameters=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.evaluate()

if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------------------------
EOT

In [None]:
from transformers import TrainingArguments
# IPython help lookup (trailing `?`): displays the signature/docstring of TrainingArguments.
# Exploratory only — this is IPython syntax, not valid plain Python.
TrainingArguments?

In [None]:
# NOTE(review): stray scratch line — because of the trailing comma this binds the
# 1-tuple ("cosine",) to a global name; no visible cell reads that global.
lr_scheduler_type="cosine",

In [None]:
# Optional optimizer packages (disabled):
# %pip install lomo-optim
# %pip install git+https://github.com/jiaweizzhao/GaLore
# `%pip` (unlike `!pip`) always installs into the environment of the running kernel.
%pip install tensorly

In [None]:
# %%bash
# cat <<'EOT' > working_trainer.py
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer
from evaluate import load
from datasets import load_dataset
from transformers import DataCollatorWithPadding
import numpy as np
import warnings
import os
from peft.utils import get_peft_model_state_dict

warnings.filterwarnings("ignore", category=RuntimeWarning)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def main():
    """Fine-tune ``bert-base-uncased`` on GLUE/MRPC with a LoRA adapter.

    Tokenizes the MRPC sentence pairs to fixed 128-token inputs, wraps the model with
    a LoRA adapter configured for sequence classification and trains with the Hugging
    Face ``Trainer``, evaluating accuracy once per epoch.
    """
    checkpoint = "bert-base-uncased"
    accuracy = load("accuracy")

    def compute_metrics(eval_pred):
        """Compute accuracy from the (logits, labels) arrays given by the Trainer."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return accuracy.compute(predictions=predictions, references=labels)

    def encode(examples):
        """Tokenize a batch of sentence pairs to 128 tokens and copy ``label`` to ``labels``."""
        output = tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=128)
        output['labels'] = examples['label']
        return output

    # Full-precision weights; pass torch_dtype=torch.float16 here to halve memory.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="right")
    if tokenizer.pad_token is None:
        # Only add a pad token when the tokenizer has none (assigning `eos_token` would
        # clear the pad token on checkpoints without an EOS token, such as BERT).
        # A newly added token needs an embedding row and a matching pad id in the config.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id

    # `task_type` (not `peft_type`, which LoraConfig always overrides with LORA) selects
    # the sequence-classification wrapper, which also keeps the classifier head trainable.
    peft_config = LoraConfig(task_type=TaskType.SEQ_CLS,  # or TaskType.CAUSAL_LM for causal LMs
                             inference_mode=False,
                             r=32,
                             lora_alpha = 512,
                             lora_dropout = 0.1,
                             bias="none",
                             )

    # model = prepare_model_for_kbit_training(model)  # only needed for quantized base models
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    dataset = load_dataset("glue", "mrpc")
    dataset = dataset.map(encode, batched=True)
    dataset = dataset.remove_columns(['sentence1', 'sentence2', 'label', 'idx'])
    dataset.set_format(type='pt', columns=['input_ids', 'attention_mask', 'labels',], output_all_columns=True)
    data_collator = DataCollatorWithPadding(tokenizer)

    training_args = TrainingArguments(
        output_dir="your-name/bigscience/mt0-large-lora",
        optim="paged_adamw_32bit",
        learning_rate=1e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.4,
        logging_strategy="epoch", # Logs the training loss once per epoch
        label_names = ['labels'], # Needed for validation loss / accuracy to be logged
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=15,
        eval_strategy="epoch", # Evaluates once per epoch; use "no" to skip evaluation and shorten training
        save_strategy="epoch",
        fp16=True,
        seed=42,
        data_seed=42,
        dataloader_num_workers=4, # Reduces training time by a decent percentage
        dataloader_pin_memory=True,
        dataloader_persistent_workers=True,
        ddp_find_unused_parameters=False,
        # Disabled experiments:
        #     lr_scheduler_kwargs={"power": 2.0},
        #     warmup_steps=200,
        #     ddp_backend="nccl",
        #     gradient_checkpointing=True,
        #     torch_empty_cache_steps=40,  # clears the VRAM cache every few steps
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        # NOTE(review): this evaluates on the "test" split, while the other scripts in
        # this notebook use "validation" — confirm which split is intended.
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
if __name__ == "__main__":
    # Launch `main` in two worker processes from inside the notebook
    # (call main() directly instead for a single-process run).
    from accelerate import notebook_launcher
    notebook_launcher(main, num_processes=2)
# EOT

[2024-08-01 16:32:46,216] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-08-01 16:32:46,261] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[34m[1mwandb[0m: Currently logged in as: [33mnnilayy[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6395,0.621369,0.664928
2,0.5923,0.559371,0.743768
3,0.5463,0.509891,0.784348
4,0.4841,0.449037,0.821449
5,0.4301,0.440502,0.817971
6,0.3945,0.426874,0.828986
7,0.3383,0.415037,0.830145
8,0.2659,0.386071,0.844638
9,0.2191,0.398121,0.845797
10,0.1741,0.411007,0.845797


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6395,0.621369,0.664928
2,0.5923,0.559371,0.743768
3,0.5463,0.509891,0.784348
4,0.4841,0.449037,0.821449
5,0.4301,0.440502,0.817971
6,0.3945,0.426874,0.828986
7,0.3383,0.415037,0.830145
8,0.2659,0.386071,0.844638
9,0.2191,0.398121,0.845797
10,0.1741,0.411007,0.845797




In [None]:
# Show the command-line help of `accelerate launch`.
!accelerate launch -h

In [None]:
# Launch working_trainer.py on 2 processes with fp16 mixed precision via `accelerate launch`.
# NOTE(review): no visible cell writes working_trainer.py — the `%%bash` / `cat` header of the
# training cell above is commented out — so that file has to be created before running this.
# Alternative launcher (disabled):
# !torchrun --nproc_per_node=2 load_ddp_model.py
!accelerate launch --multi_gpu \
--num_processes=2 \
--mixed_precision="fp16" \
working_trainer.py

# Launcher flags tried:
# --multi_gpu
# --use_deepspeed

# Recorded runs (configuration -> wall-clock time):

# Training Config
# deepspeed: no
# lora: no
# time: cuda error

# Training Config
# deepspeed: no
# lora: yes
# time: 26 mins

# Training Config
# deepspeed: yes
# lora: yes
# time: 48 mins

# Training Config
# deepspeed: yes
# lora: yes
# time: 48 mins

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os

model_id = "google/gemma-2b"

# `bnb_config` was used below without being defined anywhere, so this cell failed with a
# NameError on a fresh kernel.
# NOTE(review): 8-bit mirrors the only BitsAndBytesConfig shown elsewhere in this notebook
# (a commented-out `load_in_8bit=True`); adjust if a different quantization was intended.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map={"":0} places the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

In [None]:
import torch
import gc

# Collect unreachable Python objects first so the CUDA tensors they hold are actually
# released, then hand the now-unused cached blocks back to the GPU driver.
# (Emptying the cache before collecting leaves that memory reserved.)
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Baseline load: no dtype or device arguments are passed and nothing is moved to a GPU here.
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")

In [None]:
# Single-device load: instantiate the model, then move it to CUDA when a GPU is
# available (otherwise it is "moved" to the CPU, i.e. stays where it is).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")
model.to(device)

In [None]:
# Single-device load in 16-bit precision: requesting torch.float16 weights roughly
# halves the memory footprint compared with 32-bit weights.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b", torch_dtype=torch.float16)
model.to(device)

In [None]:
import torch
from transformers import AutoModelForCausalLM

# Load the model with fp16 as the default dtype and a chosen default device, WITHOUT
# leaving those global torch defaults changed for every later cell (hidden state).
previous_dtype = torch.get_default_dtype()

# Prefer the second GPU as in the original experiment, but fall back to GPU 0 on
# single-GPU machines and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    target_device = torch.device(f"cuda:{min(1, torch.cuda.device_count() - 1)}")
else:
    target_device = torch.device("cpu")

torch.set_default_dtype(torch.float16)
try:
    # A torch.device used as a context manager sets the default device only temporarily.
    with target_device:
        model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")
finally:
    torch.set_default_dtype(previous_dtype)

In [None]:
# Data parallelism: wrap the model in torch.nn.DataParallel, then move the wrapper to
# CUDA when a GPU is available (otherwise to the CPU).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = torch.nn.DataParallel(AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b"))
model.to(device)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")

# Without CUDA only a message is printed and the model is left unwrapped; otherwise the
# model is wrapped in DataParallel over every visible GPU and moved to the default CUDA device.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    device_ids = [gpu_index for gpu_index in range(torch.cuda.device_count())]
    model = torch.nn.DataParallel(model, device_ids=device_ids)
    model.to('cuda')


In [None]:
from datasets import load_dataset
# Load the dataset identified as 'nnilayy/pubmedqa_artificial_128' into `dataset`.
# NOTE(review): presumably a Hugging Face Hub dataset repo; nothing in the visible cells uses `dataset` afterwards.
dataset = load_dataset('nnilayy/pubmedqa_artificial_128')