In [None]:
!pip install -U bitsandbytes evaluate datasets transformers peft deepspeed triton wandb
# !pip install flash-attn

In [None]:
%%bash
# Refresh the package index, then install the libaio development headers
# (presumably needed to build DeepSpeed's async I/O ops - confirm).
sudo apt-get update
sudo apt-get install -y libaio-dev

In [None]:
import os
import wandb
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Both keys resolve to None when the corresponding variable is not set.
hugging_face_api_key = os.environ.get("HUGGING_FACE_API_KEY")
wandb_api_key = os.environ.get("WANDB_API_KEY")

# Authenticate this session against Weights & Biases.
wandb.login(key=wandb_api_key)

In [None]:
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from transformers import BertTokenizer, BertForMaskedLM
# Load the model with its weights quantized to 8-bit through bitsandbytes.
# checkpoint = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint, padding_side="right")
# BERT already defines a [PAD] token and has no EOS token, so an unconditional
# `pad_token = eos_token` would replace [PAD] with None and break padding.
# Only fall back to EOS when a pad token is genuinely missing.
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

# Note: BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option (compute-dtype
# settings exist only for 4-bit), so the argument was dropped instead of being
# passed as an ignored keyword.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # 4-bit alternative (use together with load_in_4bit=True instead of load_in_8bit):
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_use_double_quant=True,
    # bnb_4bit_compute_dtype=torch.bfloat16,
)

model = BertForMaskedLM.from_pretrained(checkpoint,
                                            #  device_map = "auto",
                                             quantization_config = bnb_config,
                                             # dtype used for the modules that are not quantized
                                             torch_dtype=torch.float16,
                                             )

In [None]:
# Fold trained LoRA adapter weights back into the base model.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Placeholder identifiers: replace with the real base checkpoint (name or path)
# and the directory that holds the trained adapter.
base_model = AutoModelForCausalLM.from_pretrained("base_model_name_or_path")

# Attach the adapter to the base model ...
peft_model = PeftModel.from_pretrained(base_model, "path_to_trained_adapter")

# ... then merge it into the base weights and drop the PEFT wrapper.
merged_model = peft_model.merge_and_unload()

In [None]:
# Load BLOOM-3B in 16-bit precision and place it on a single device.
# float16 weights need about half the memory of the float32 default.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-3b",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
import torch
from transformers import AutoModelForCausalLM

# Make float16 the dtype for newly created floating-point tensors.
# This is a process-wide setting and stays in effect for later cells.
torch.set_default_dtype(torch.float16)
if torch.cuda.is_available():
    # Prefer the second GPU when there is one; on a single-GPU machine
    # 'cuda:1' does not exist, so fall back to the first device.
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    torch.set_default_device(f'cuda:{gpu_index}')
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")

In [None]:
# Data parallelism: wrap the model in torch.nn.DataParallel so that a copy of
# the same model runs on each visible GPU and every batch is split across them.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")

# Wrap first, then move the wrapper (and the wrapped weights) to the device.
model = torch.nn.DataParallel(model)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")

if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    # Enumerate every visible GPU and hand the full list to DataParallel.
    device_ids = [gpu for gpu in range(torch.cuda.device_count())]
    model = torch.nn.DataParallel(model, device_ids=device_ids)
    # 'cuda' resolves to the current default CUDA device.
    model.to('cuda')


In [None]:
# Alternative multi-process launchers, kept for reference:
# !python -m torch.distributed.launch --nproc_per_node=2 trainer.py
# !torchrun --nproc_per_node=2 load_ddp_model.py
# !accelerate launch --multi_gpu --mixed_precision="fp16" --num_processes=2 trainer.py
# Single-process run of the training script.
# NOTE(review): trainer.py is written by a cell further down in this notebook,
# so that cell has to be executed before this one on a fresh kernel.
!python trainer.py

In [None]:
%%bash
# Write the DeepSpeed configuration to ds_config_zero3.json: ZeRO stage 3 with
# optimizer state and parameters offloaded to CPU, plus a "fixed"-mode
# sparse_attention section.
# The heredoc delimiter is quoted, so the shell expands nothing inside the JSON.
# NOTE(review): the "auto" values are presumably resolved by the Hugging Face
# Trainer / Accelerate DeepSpeed integration at launch time - confirm.
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },

        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },

        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "sparse_attention": {
        "mode": "fixed",
        "block": 16,
        "different_layout_per_head": true,
        "num_local_blocks": 4,
        "num_global_blocks": 1,
        "attention": "bidirectional",
        "horizontal_global_attention": false,
        "num_different_global_patterns": 4,
        "num_random_blocks": 0,
        "local_window_blocks": [4],
        "global_block_indices": [0],
        "num_sliding_window_blocks": 3
  },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT

In [None]:
%%bash
cat <<'EOT' > trainer.py

# --------------------------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------------------------
import os
import multiprocessing
from transformers import BertTokenizer, BitsAndBytesConfig, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from accelerate import PartialState
from accelerate.logging import get_logger

class SparseAttention(nn.Module):
    """Multi-head self-attention restricted to a strided subset of key positions.

    Every query may only attend to keys whose index is a multiple of
    ``sparsity_factor`` (0, sparsity_factor, 2 * sparsity_factor, ...).
    Padded keys can additionally be excluded through ``attention_mask``.

    Args:
        hidden_size: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``hidden_size``.
        sparsity_factor: Stride between attendable key positions (>= 1).

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_heads`` or
            ``sparsity_factor`` is smaller than 1.
    """

    def __init__(self, hidden_size, num_heads, sparsity_factor=2):
        super().__init__()
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )
        if sparsity_factor < 1:
            raise ValueError(f"sparsity_factor must be >= 1, got {sparsity_factor}")

        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.head_dim = hidden_size // num_heads
        self.sparsity_factor = sparsity_factor

        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)

    def _split_heads(self, projected, batch_size, seq_length):
        """Reshape (batch, seq, hidden) into (batch, heads, seq, head_dim)."""
        return projected.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, hidden_states, attention_mask=None):
        """Apply strided sparse self-attention.

        Args:
            hidden_states: Tensor of shape (batch, seq, hidden_size).
            attention_mask: Optional (batch, seq) mask where non-zero marks a
                real token and zero marks padding. Padded positions are never
                attended to. ``None`` applies only the sparsity pattern.

        Returns:
            Tuple ``(output, attn_weights)`` with shapes
            (batch, seq, hidden_size) and (batch, heads, seq, seq).
        """
        batch_size, seq_length, _ = hidden_states.size()

        query = self._split_heads(self.query(hidden_states), batch_size, seq_length)
        key = self._split_heads(self.key(hidden_states), batch_size, seq_length)
        value = self._split_heads(self.value(hidden_states), batch_size, seq_length)

        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Block every key whose index is not a multiple of sparsity_factor.
        # The mask is built once per key position and broadcast over batch,
        # heads and queries instead of materialising a full score-shaped tensor.
        key_index = torch.arange(seq_length, device=scores.device)
        blocked = (key_index % self.sparsity_factor != 0)[None, None, None, :]

        if attention_mask is not None:
            # Additionally block padded keys so they cannot leak into the output.
            is_padding = ~attention_mask.to(device=scores.device, dtype=torch.bool)
            blocked = blocked | is_padding[:, None, None, :]

        # Use the most negative finite value rather than -inf: blocked entries
        # still get zero weight after softmax, but a row in which every key is
        # blocked yields a uniform distribution instead of NaN.
        scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        attn_weights = F.softmax(scores, dim=-1)
        attn_output = torch.matmul(attn_weights, value)

        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, hidden).
        merged = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_length, self.hidden_size)
        return merged, attn_weights


class CustomModelWithSparseAttention(nn.Module):
    """Minimal sequence classifier: embedding -> sparse attention -> dense -> classifier.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``num_attention_heads`` and ``num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.sparse_attention = SparseAttention(config.hidden_size, config.num_attention_heads)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
                inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None,
                return_dict=None):
        """Classify a batch of token sequences.

        ``token_type_ids`` and ``position_ids`` are accepted for signature
        compatibility but are not used by this model.

        Args:
            input_ids: (batch, seq) token ids; takes precedence over ``inputs_embeds``.
            attention_mask: Optional (batch, seq) mask, non-zero for real tokens;
                padded positions are excluded from attention.
            inputs_embeds: (batch, seq, hidden_size) embeddings used when
                ``input_ids`` is None.
            labels: Optional (batch,) class indices; when given a cross-entropy
                loss is computed.
            output_attentions: Tuple output only - append the attention weights.
            output_hidden_states: Tuple output only - append the dense-layer output.
            return_dict: When truthy return a dict, otherwise a tuple.

        Returns:
            A dict with keys ``loss``, ``logits``, ``hidden_states`` and
            ``attentions`` when ``return_dict`` is truthy; otherwise a tuple
            ``(loss?, logits, hidden_states?, attentions?)``.

        Raises:
            ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if input_ids is not None:
            embeddings = self.embeddings(input_ids)
        elif inputs_embeds is not None:
            embeddings = inputs_embeds
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Forward the padding mask so padded tokens do not influence the result.
        attention_output, attention_weights = self.sparse_attention(
            embeddings, attention_mask=attention_mask
        )
        hidden_states = self.dense(attention_output)
        logits = self.classifier(hidden_states[:, 0, :])  # Use [CLS] token for classification

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,)
            if output_hidden_states:
                output += (hidden_states,)
            if output_attentions:
                output += (attention_weights,)
            return ((loss,) + output) if loss is not None else output

        return {'loss': loss, 'logits': logits, 'hidden_states': hidden_states, 'attentions': attention_weights}

def main():
    """Train and evaluate the sparse-attention classifier on GLUE MRPC with LoRA.

    Steps: read the configuration of the selected checkpoint, load its
    tokenizer, build a freshly initialised CustomModelWithSparseAttention from
    that configuration, wrap it with PEFT LoRA, tokenize the MRPC sentence
    pairs and run the Hugging Face Trainer (train, then a final evaluate).
    """
    checkpoint = ["bert-base-uncased",
                 "BioMistral/BioMistral-7B",
                 "bigscience/bloom-3b",
                 ]

    index = 0  # Using BERT base uncased
    config = AutoConfig.from_pretrained(checkpoint[index])
    config.num_labels = 2

    # The tokenizer is prepared before the model so that a pad token added
    # here is covered by the embedding table; otherwise its id would index
    # past the end of nn.Embedding(config.vocab_size, ...).
    tokenizer = AutoTokenizer.from_pretrained(checkpoint[index])
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    config.vocab_size = max(config.vocab_size, len(tokenizer))

    # Only the configuration is taken from the checkpoint; the model weights
    # themselves are randomly initialised.
    model = CustomModelWithSparseAttention(config)

    # NOTE(review): LoRA freezes every weight it does not adapt, so the randomly
    # initialised embeddings and attention projections stay untrained - confirm
    # this is intended. r=1 with lora_alpha=2048 gives a very large adapter scale.
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=1,
        lora_alpha=2048,
        lora_dropout=0.1,
        target_modules=["dense", "classifier"]
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    def compute_metrics(eval_pred):
        """Return classification accuracy for a (logits, labels) evaluation pair."""
        logits, labels = eval_pred
        if isinstance(logits, tuple):  # If model returns a tuple, take the first element as logits
            logits = logits[0]
        predictions = np.argmax(logits, axis=-1)
        accuracy = (predictions == labels).mean()
        return {'accuracy': accuracy}

    def encode(examples):
        """Tokenize MRPC sentence pairs to fixed-length inputs and attach labels."""
        outputs = tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=128)
        outputs['labels'] = examples['label']
        return outputs

    dataset = load_dataset('glue', 'mrpc')
    # Do not request more worker processes than the machine has cores.
    dataset = dataset.map(encode, batched=True, num_proc=min(12, multiprocessing.cpu_count()))

    # Not every tokenizer emits token_type_ids; expose only the columns that exist.
    wanted_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']
    available_columns = set(dataset['train'].column_names)
    dataset.set_format(type='torch', columns=[name for name in wanted_columns if name in available_columns])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # transformers renamed `evaluation_strategy` to `eval_strategy`; pick
    # whichever field the installed version defines.
    if "eval_strategy" in TrainingArguments.__dataclass_fields__:
        eval_strategy_key = "eval_strategy"
    else:
        eval_strategy_key = "evaluation_strategy"

    torch.set_grad_enabled(True)
    training_args = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs_rslora',
        do_train=True,
        do_eval=True,
        num_train_epochs=10,
        learning_rate=2e-4,
        logging_strategy='epoch',
        per_device_train_batch_size=20,
        per_device_eval_batch_size=20,
        save_total_limit=1,
        dataloader_num_workers=8,
        dataloader_pin_memory=True,
        dataloader_prefetch_factor=4,
        save_strategy="epoch",
        label_names=["labels"],
        # fp16 mixed precision needs a CUDA device; fall back to full precision on CPU.
        fp16=torch.cuda.is_available(),
        ddp_find_unused_parameters=False,
        **{eval_strategy_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.evaluate()

if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------------------------
EOT

In [None]:
# Launch the training script on two processes with fp16 mixed precision via Accelerate.
# NOTE(review): this runs working_trainer.py, while an earlier cell in this
# notebook writes trainer.py - confirm which script is intended.
# !torchrun --nproc_per_node=2 load_ddp_model.py
!accelerate launch --multi_gpu \
--num_processes=2 \
--mixed_precision="fp16" \
working_trainer.py

# Launch-mode flags to swap on the first line of the command above:
# --multi_gpu
# --use_deepspeed

# Training Config
# deepspeed: no
# lora: no
# time: cuda error

# Training Config
# deepspeed: no
# lora: yes
# time: 26 mins

# Training Config
# deepspeed: yes
# lora: yes
# time: 48 mins

# Training Config
# deepspeed: yes
# lora: yes
# time: 48 mins