<a href="https://colab.research.google.com/github/nnilayy/MedGPT/blob/main/peft_deepspeed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install bitsandbytes evaluate datasets transformers peft

In [None]:
!pip install transformers --upgrade

In [None]:
import os

from huggingface_hub import login, notebook_login

# SECURITY: an access token used to be hard-coded here. Because it was committed
# with the notebook it must be treated as leaked and revoked in the Hugging Face
# account settings. The token is now read from the HF_TOKEN environment variable
# (for example a Kaggle/Colab secret exported into the environment).
hugging_face_token = os.environ.get("HF_TOKEN")

if hugging_face_token:
    # Non-interactive login, so the notebook survives "Restart & Run All".
    login(token=hugging_face_token)
else:
    # No token configured: fall back to the interactive login widget.
    notebook_login()

In [None]:
import os

import wandb

# SECURITY: an API key used to be hard-coded here. Because it was committed with
# the notebook it must be treated as leaked and rotated in the Weights & Biases
# settings. The key is now read from the WANDB_API_KEY environment variable.
# NOTE(review): when the variable is unset `key` is None and wandb is expected to
# fall back to stored credentials or an interactive prompt - confirm.
wandb_api_token = os.environ.get("WANDB_API_KEY")
wandb.login(key=wandb_api_token)

In [None]:
from transformers import BertTokenizer,BitsAndBytesConfig,BertForSequenceClassification,Trainer,TrainingArguments,DataCollatorWithPadding,AutoModelForSequenceClassification,AutoTokenizer
from peft import get_peft_model,LoraConfig,TaskType,prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import warnings

# Candidate base checkpoints; `index` selects the one that is fine-tuned.
checkpoint = ["bert-base-uncased",
             "BioMistral/BioMistral-7B",
             "bigscience/bloom-3b",
             ]
# 8-bit quantisation settings. They only take effect when passed as
# `quantization_config` to from_pretrained, which is currently disabled.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
index = 2
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint[index],
    # device_map="auto",
    num_labels=2,
    torch_dtype=torch.float16,  # This reduces the gpu onboard vram usage
    # quantization_config=bnb_config,
)

peft_config = LoraConfig(
    # FIX: this used to be passed through the `peft_type` keyword. SEQ_CLS is a
    # task type, not an adapter type, so the task was never set and the newly
    # initialised classification head was not marked as trainable.
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=1024,
    lora_dropout=0.1,
    bias="none",
    use_dora=True,
)

# model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

# The base weights are loaded in float16. Keep every *trainable* parameter
# (adapter weights and the classification head) in float32, because the fp16
# gradient scaler used with `fp16=True` cannot unscale float16 gradients.
for _trainable in model.parameters():
    if _trainable.requires_grad:
        _trainable.data = _trainable.data.to(torch.float32)

model.print_trainable_parameters()


def compute_metrics(eval_pred):
    """Return the classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair of NumPy arrays; the predicted
            class is the argmax over the last axis of ``logits``.

    Returns:
        dict: ``{'accuracy': <float>}``, the fraction of predictions equal to
        the labels.
    """
    raw_scores, gold = eval_pred
    predicted = torch.from_numpy(raw_scores).argmax(dim=-1)
    hits = predicted == torch.from_numpy(gold)
    return {'accuracy': hits.float().mean().item()}

# Preprocess the dataset
def encode(examples):
    """Tokenise a batch of sentence pairs and attach their labels.

    Args:
        examples: Batch mapping with ``'sentence1'``, ``'sentence2'`` and
            ``'label'`` entries.

    Returns:
        The tokenizer output for the pairs (truncated and padded to 128
        tokens) with the batch labels stored under ``'labels'``.
    """
    encoded = tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    encoded['labels'] = examples['label']
    return encoded

tokenizer = AutoTokenizer.from_pretrained(checkpoint[index])
# FIX: a brand-new '[PAD]' token used to be added unconditionally. That grows
# the vocabulary without resizing the model's embedding matrix and leaves
# `model.config.pad_token_id` out of sync with the tokenizer. Keep the pad token
# the checkpoint already ships with; only when there is none, reuse EOS (an id
# the model already has an embedding for). Then tell the model which id is padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Dataset: GLUE/MRPC sentence pairs, tokenised by `encode` and exposed as torch tensors.
dataset = load_dataset('glue', 'mrpc')
dataset = dataset.map(encode, batched=True, num_proc=12)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
label_names = dataset['train'].features['label'].names
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
training_args = TrainingArguments(
    # Output locations and checkpoint retention.
    output_dir='./results',
    logging_dir='./logs_rslora',
    save_total_limit=1,
    # Which phases to run.
    do_train=True,
    do_eval=True,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    # Log, evaluate and checkpoint once per epoch.
    logging_strategy='epoch',
    eval_strategy="epoch",
    save_strategy="epoch",
    # Without this Validation Accuracy and Validation Loss wouldn't be logged
    label_names=["labels"],
)

# Wire the model, data and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
)

# Fine-tune, then run a final evaluation on the validation split.
trainer.train()
trainer.evaluate()

In [None]:
%%bash
# Write the DeepSpeed configuration file ds_config_zero3.json into the current
# working directory. The quoted heredoc delimiter ('EOT') keeps the shell from
# expanding anything inside the JSON. The config selects ZeRO stage 3 with the
# optimizer state and the parameters offloaded to CPU.
# NOTE(review): the "auto" values are presumably filled in by the Hugging Face
# Trainer integration from TrainingArguments - confirm.
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT

In [None]:
from transformers import BertTokenizer,BitsAndBytesConfig,BertForSequenceClassification,Trainer,TrainingArguments,DataCollatorWithPadding,AutoModelForSequenceClassification,AutoTokenizer
from peft import get_peft_model,LoraConfig,TaskType,prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import warnings
warnings.filterwarnings("ignore")

def main():
    """Fine-tune a sequence classifier on GLUE/MRPC with a DoRA adapter and DeepSpeed.

    Loads the checkpoint selected by ``index`` in float16, wraps it with a PEFT
    LoRA/DoRA adapter for sequence classification, tokenises the MRPC sentence
    pairs, and trains and evaluates with the Hugging Face ``Trainer`` using the
    DeepSpeed config at ``/kaggle/working/ds_config_zero3.json``.

    Returns:
        None. Checkpoints are written to ``./results`` and logs to ``./logs_rslora``.
    """
    checkpoint = ["bert-base-uncased",
                  "BioMistral/BioMistral-7B",
                  "bigscience/bloom-3b",
                  ]
    index = 2
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint[index],
        # device_map="auto",
        num_labels=2,
        torch_dtype=torch.float16,  # This reduces the gpu onboard vram usage
        # quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )

    peft_config = LoraConfig(
        # FIX: this used to be passed through the `peft_type` keyword. SEQ_CLS is
        # a task type, not an adapter type, so the task was never set and the
        # newly initialised classification head was not marked as trainable.
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=16,
        lora_alpha=1024,
        lora_dropout=0.1,
        bias="none",
        use_dora=True,
    )

    # model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    # The base weights are float16. Keep every trainable parameter (adapter and
    # classification head) in float32 so that fp16 mixed-precision training
    # never has to unscale float16 gradients.
    for param in model.parameters():
        if param.requires_grad:
            param.data = param.data.to(torch.float32)

    model.print_trainable_parameters()

    def compute_metrics(eval_pred):
        """Return ``{'accuracy': float}`` for a ``(logits, labels)`` pair of NumPy arrays."""
        logits, labels = eval_pred
        predictions = torch.from_numpy(logits).argmax(dim=-1)
        accuracy = (predictions == torch.from_numpy(labels)).float().mean()
        return {'accuracy': accuracy.item()}

    def encode(examples):
        """Tokenise a batch of sentence pairs (max 128 tokens) and attach the labels."""
        outputs = tokenizer(examples['sentence1'], examples['sentence2'],
                            truncation=True, padding='max_length', max_length=128)
        outputs['labels'] = examples['label']
        return outputs

    tokenizer = AutoTokenizer.from_pretrained(checkpoint[index])
    # FIX: a brand-new '[PAD]' token used to be added unconditionally, which grows
    # the vocabulary without resizing the embeddings and leaves the model config
    # unaware of the padding id. Keep the checkpoint's own pad token, fall back to
    # EOS when there is none, and sync the id into the model config.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    # Dataset: GLUE/MRPC sentence pairs exposed as torch tensors.
    dataset = load_dataset('glue', 'mrpc')
    dataset = dataset.map(encode, batched=True, num_proc=12)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Training arguments
    torch.set_grad_enabled(True)
    training_args = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs_rslora',
        # run_name='run_8',
        do_train=True,
        do_eval=True,
        num_train_epochs=10,
        learning_rate=2e-4,
        logging_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_total_limit=1,
        save_strategy="epoch",
        eval_strategy="epoch",
        label_names=["labels"],  # Without this Validation Accuracy and Validation Loss wouldn't be logged
        fp16=True,
        deepspeed="/kaggle/working/ds_config_zero3.json",
        # gradient_checkpointing=True,
        # gradient_accumulation_steps=4,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model, then run a final evaluation on the validation split.
    trainer.train()
    trainer.evaluate()
# Run the training entry point when this cell/script is executed directly.
# Disabled alternative that launches `main` on two processes from a notebook:
#     from accelerate import notebook_launcher
#     notebook_launcher(main, num_processes=2, mixed_precision="fp16")
if __name__ == "__main__":
    main()


In [None]:
!pip3 install deepspeed

In [None]:
# Launch trainer.py through the accelerate CLI on 2 processes with fp16 mixed
# precision and DeepSpeed enabled.
# NOTE(review): trainer.py is written by a cell further down in this notebook, so
# that cell has to be executed before this one.
# !torchrun --nproc_per_node=2 load_ddp_model.py
!accelerate launch --num_processes=2 --mixed_precision="fp16" --use_deepspeed trainer.py

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
trainable params: 5,145,600 || all params: 3,007,708,160 || trainable%: 0.1711
trainable params: 5,145,600 || all params: 3,007,708,160 || trainable%: 0.1711
Map (num_proc=12): 100%|████████████| 3668/3668 [00:10<00:00, 335.58 examples/s]
Map (num_proc=12): 100%|████████████| 3668/3668 [00:11<00:00, 324.19 examples/s]
Map (num_proc=12): 100%|███████████████| 408/408 [00:09<00:00, 44.69 examples/s]
Map (num_proc=12): 100%|███████████████| 408/408 [00:10

In [None]:
# Print the command-line help of `accelerate launch`.
!accelerate launch -h

In [None]:
# Loads Model on CPUs RAM
from transformers import AutoTokenizer, AutoModelForCausalLM
# Plain from_pretrained calls: no device or dtype is requested here.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")

In [None]:
# Interactive exploration aid: open the IPython help for `Trainer`.
# NOTE(review): relies on `Trainer` having been imported by an earlier cell.
Trainer?

In [None]:
# Loads Model on 1 GPU Vram
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Pick the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")
# Move the loaded weights onto the selected device.
model.to(device)

In [None]:
# Loads Model on 1 GPU Vram, in 16 precision
# Reduce memory footprint by half
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Pick the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
# Request float16 weights at load time, then move them onto the selected device.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-3b",
    torch_dtype=torch.float16,
)
model.to(device)

In [None]:
import torch
from transformers import AutoModelForCausalLM
# Make float16 the default floating-point dtype for tensors created from here on.
# NOTE: both defaults set in this cell are global to the process and therefore
# also affect every cell executed afterwards.
torch.set_default_dtype(torch.float16)

# FIX: 'cuda:1' used to be selected whenever CUDA was available, which points at
# a non-existent device on single-GPU machines. Use the second GPU only when it
# exists and fall back to the first one otherwise.
if torch.cuda.is_available():
    _gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    torch.set_default_device(f'cuda:{_gpu_index}')
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")

In [None]:
# Data Parallelism: Same Model Gets Loaded On One GPU
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Pick the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
# Wrap the freshly loaded model in DataParallel, then move the wrapper to the device.
model = torch.nn.DataParallel(
    AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")
)
model.to(device)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")

if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    # Hand every visible GPU index to DataParallel explicitly.
    device_ids = [*range(torch.cuda.device_count())]
    model = torch.nn.DataParallel(model, device_ids=device_ids)
    # Move the wrapped model to the default CUDA device.
    model.to('cuda')


In [None]:
from datasets import load_dataset
# Download/load the 'nnilayy/pubmedqa_artificial_128' dataset from the Hugging Face Hub.
# NOTE(review): this rebinds the notebook-level name `dataset`, which an earlier
# cell uses for the GLUE/MRPC data - re-run that cell before training again.
dataset = load_dataset('nnilayy/pubmedqa_artificial_128')

In [None]:
# Source of the stand-alone training script written to trainer.py further down.
# It mirrors the notebook's `main()` training cell so it can be started through
# `python`, `torchrun` or `accelerate launch`. The embedded script deliberately
# uses only `#` comments so that it never closes this triple-quoted string.
file_content = """
from transformers import BertTokenizer,BitsAndBytesConfig,BertForSequenceClassification,Trainer,TrainingArguments,DataCollatorWithPadding,AutoModelForSequenceClassification,AutoTokenizer
from peft import get_peft_model,LoraConfig,TaskType,prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import warnings
warnings.filterwarnings("ignore")

def main():
    # Fine-tune a sequence classifier on GLUE/MRPC with a DoRA adapter and DeepSpeed.
    checkpoint = ["bert-base-uncased",
                  "BioMistral/BioMistral-7B",
                  "bigscience/bloom-3b",
                  ]
    index = 2
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint[index],
        # device_map="auto",
        num_labels=2,
        torch_dtype=torch.float16,  # This reduces the gpu onboard vram usage
        # quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )

    peft_config = LoraConfig(
        # The task must be declared through task_type; otherwise the newly
        # initialised classification head is not marked as trainable.
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=16,
        lora_alpha=1024,
        lora_dropout=0.1,
        bias="none",
        use_dora=True,
    )

    # model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    # The base weights are float16. Keep every trainable parameter (adapter and
    # classification head) in float32 so that fp16 mixed-precision training
    # never has to unscale float16 gradients.
    for param in model.parameters():
        if param.requires_grad:
            param.data = param.data.to(torch.float32)

    model.print_trainable_parameters()

    def compute_metrics(eval_pred):
        # Accuracy of the argmax prediction for a (logits, labels) pair of NumPy arrays.
        logits, labels = eval_pred
        predictions = torch.from_numpy(logits).argmax(dim=-1)
        accuracy = (predictions == torch.from_numpy(labels)).float().mean()
        return {'accuracy': accuracy.item()}

    def encode(examples):
        # Tokenise a batch of sentence pairs (max 128 tokens) and attach the labels.
        outputs = tokenizer(examples['sentence1'], examples['sentence2'],
                            truncation=True, padding='max_length', max_length=128)
        outputs['labels'] = examples['label']
        return outputs

    tokenizer = AutoTokenizer.from_pretrained(checkpoint[index])
    # Keep the checkpoint's own padding token, fall back to EOS when there is
    # none, and make sure the model config knows which id is padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    # Dataset: GLUE/MRPC sentence pairs exposed as torch tensors.
    dataset = load_dataset('glue', 'mrpc')
    dataset = dataset.map(encode, batched=True, num_proc=12)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Training arguments
    torch.set_grad_enabled(True)
    training_args = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs_rslora',
        # run_name='run_8',
        do_train=True,
        do_eval=True,
        num_train_epochs=10,
        learning_rate=2e-4,
        logging_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_total_limit=1,
        save_strategy="epoch",
        eval_strategy="epoch",
        label_names=["labels"],  # Without this Validation Accuracy and Validation Loss wouldn't be logged
        fp16=True,
        deepspeed="/kaggle/working/ds_config_zero3.json",
        # gradient_checkpointing=True,
        # gradient_accumulation_steps=4,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model, then run a final evaluation on the validation split.
    trainer.train()
    trainer.evaluate()

if __name__ == "__main__":
    main()
#     from accelerate import notebook_launcher
#     notebook_launcher(main, num_processes=2, mixed_precision="fp16")
"""


# Persist the generated script as /kaggle/working/trainer.py (overwriting any
# previous copy) so the launch cells can execute it.
file_path = "/kaggle/working/trainer.py"
with open(file_path, mode="w") as script_handle:
    script_handle.write(file_content)

print("File created successfully in /kaggle/working/")

In [None]:
# Run the generated training script as a single Python process.
# Alternative launchers kept for reference (disabled):
# !python -m torch.distributed.launch --nproc_per_node=2 trainer.py
# !torchrun --nproc_per_node=2 load_ddp_model.py
# !accelerate launch --multi_gpu --mixed_precision="fp16" --num_processes=2 trainer.py
!python trainer.py

In [None]:
import torch
import gc
# Collect unreachable Python objects first so the GPU memory they hold is handed
# back to PyTorch's caching allocator, then release the cached blocks. In the
# previous order, memory freed by the collector stayed cached.
gc.collect()
torch.cuda.empty_cache()