<a href="https://colab.research.google.com/github/nnilayy/MedGPT/blob/main/distributed_training_gpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate datasets evaluate transformers

In [None]:
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, DataCollatorWithPadding
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from accelerate import Accelerator

In [None]:
# Setup cell: builds the Accelerator, the BERT classifier and tokenizer, the
# tokenized GLUE/MRPC splits, the data loaders and the optimizer. Later cells
# read `accelerator`, `device`, `model`, `optimizer`, `train_loader` and
# `eval_loader` from here.

# --- Tunables ---------------------------------------------------------------
SEED = 42
MAX_LENGTH = 128
BATCH_SIZE = 32
# The previous value (5e-4) is roughly 10x above the 2e-5..5e-5 range commonly
# recommended for fine-tuning BERT; the run recorded at the end of this
# notebook, which uses that value, plateaus at a loss of about 0.64.
LEARNING_RATE = 2e-5

# Seed torch's global RNG before the model is built so the freshly initialised
# classification head (and, best effort, batch shuffling) repeat across reruns.
torch.manual_seed(SEED)

accelerator = Accelerator()

# Device type and count. Take the device from Accelerate instead of probing
# CUDA by hand, so the reported device is the one Accelerate actually uses.
device = accelerator.device
num_devices = torch.cuda.device_count()
print(f'Using device: {device}')
print(f'Number of available devices: {num_devices}')

checkpoint = 'bert-base-uncased'
# No force_download=True: it re-fetched the checkpoint on every run even when
# a valid cached copy existed.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# No manual model.to(device): accelerator.prepare() below is responsible for
# device placement.

dataset = load_dataset('glue', 'mrpc')


def encode(examples):
    """Tokenize a batch of MRPC sentence pairs, truncated and padded to MAX_LENGTH tokens."""
    return tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )


dataset = dataset.map(encode, batched=True)
# Expose only these three columns, as torch tensors.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

train_dataset = dataset['train']
eval_dataset = dataset['validation']

# Inputs are already padded to MAX_LENGTH by encode(); the collator is still
# relied on to deliver the `label` column under the `labels` key that the
# training and evaluation loops read.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

model, optimizer, train_loader, eval_loader = accelerator.prepare(model, optimizer, train_loader, eval_loader)

In [None]:
def train(epoch, model, loader, optimizer, device):
    """Run one training epoch and return ``(train_accuracy, train_loss)``.

    Args:
        epoch: Epoch number; only used in the progress-bar label.
        model: Called as ``model(**batch)``; the result must expose ``.loss``
            and ``.logits``.
        loader: Sized iterable of dict batches that include a ``labels`` entry.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        device: Unused here; kept so existing call sites keep working.

    Returns:
        A tuple ``(accuracy, loss)`` where accuracy is the fraction of samples
        whose argmax prediction equals the label and loss is the sum of the
        per-batch losses divided by ``len(loader)``.

    Note:
        The backward pass goes through the module-level ``accelerator``.
    """
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    loop = tqdm(loader, desc=f"Training Epoch {epoch}")
    for batch in loop:
        # Forward only the keys the model call is meant to receive.
        batch = {k: v for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()

        # Convert the loss to a Python float once per step and reuse it;
        # each .item() call is a separate tensor-to-host read.
        loss_value = loss.item()
        total_loss += loss_value

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == batch['labels']).sum().item()
        total_predictions += batch['labels'].size(0)
        loop.set_postfix(loss=loss_value)

    train_accuracy = correct_predictions / total_predictions
    train_loss = total_loss / len(loader)
    return train_accuracy, train_loss

# Evaluation function without tqdm
def evaluate(model, loader, device):
    """Score ``model`` on ``loader`` with gradients disabled.

    Args:
        model: Called as ``model(**batch)``; the result must expose ``.loss``
            and ``.logits``.
        loader: Sized iterable of dict batches that include a ``labels`` entry.
        device: Unused here; kept so existing call sites keep working.

    Returns:
        A tuple ``(validation_accuracy, validation_loss)``: the fraction of
        samples whose argmax prediction equals the label, and the sum of the
        per-batch losses divided by ``len(loader)``.
    """
    wanted_keys = ('input_ids', 'attention_mask', 'labels')
    model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0
    for raw_batch in loader:
        # Keep only the entries the model call is meant to receive.
        inputs = {}
        for key, value in raw_batch.items():
            if key in wanted_keys:
                inputs[key] = value

        with torch.no_grad():
            result = model(**inputs)

        loss_sum += result.loss.item()
        hits = torch.argmax(result.logits, dim=-1) == inputs['labels']
        n_correct += hits.sum().item()
        n_seen += inputs['labels'].size(0)

    return n_correct / n_seen, loss_sum / len(loader)

In [None]:
# Run the training and evaluation: one training pass and one validation pass
# per epoch, printing both accuracies and both losses after each epoch.
NUM_EPOCHS = 5  # epochs are numbered 1..NUM_EPOCHS

for epoch in range(1, NUM_EPOCHS + 1):
    train_accuracy, train_loss = train(epoch, model, train_loader, optimizer, device)
    validation_accuracy, validation_loss = evaluate(model, eval_loader, device)
    print(f"Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {validation_accuracy:.4f}")
    print(f"Training Loss: {train_loss:.4f}, Validation Loss: {validation_loss:.4f}")

In [None]:
# Write a tiny smoke-test script to disk; the next cell runs it with
# `!python example.py` to confirm that files written here can be executed.
file_content = """
def hello_world():
    print("Hello, World!")

if __name__=="__main__":
    hello_world()
"""

# Use a path relative to the notebook's working directory rather than a
# hard-coded /kaggle/working/ prefix: the cell that runs the script refers to
# it by relative name, and the absolute path only exists on Kaggle (where the
# working directory is normally /kaggle/working, so the file lands in the
# same place there).
file_path = "example.py"
with open(file_path, "w") as file:
    file.write(file_content)

print(f"File created successfully: {file_path}")


In [None]:
!python example.py

In [None]:
# Standalone version of the training loop, written to disk so that
# `accelerate launch` can start one Python process per device. The text below
# is the script itself; it must not contain a triple double-quote.
file_content="""

import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from accelerate import Accelerator
from accelerate.utils import set_seed

SEED = 42
NUM_EPOCHS = 5
BATCH_SIZE = 32
MAX_LENGTH = 128
# The previous value (5e-4) is roughly 10x above the range commonly
# recommended for fine-tuning BERT and left the loss stuck near 0.64.
LEARNING_RATE = 2e-5


def main():
    accelerator = Accelerator()
    # Same seed in every process so runs are repeatable.
    set_seed(SEED)
    print("Process {}/{} starting".format(accelerator.process_index, accelerator.num_processes))

    device = accelerator.device
    # accelerator.print only prints on the local main process.
    accelerator.print(f'Using device: {device}', flush=True)

    checkpoint = 'bert-base-uncased'
    model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    tokenizer = BertTokenizer.from_pretrained(checkpoint)

    def encode(examples):
        return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=MAX_LENGTH)

    # Let the main process download and tokenize first so the other
    # processes reuse its cache instead of all doing the same work at once.
    with accelerator.main_process_first():
        dataset = load_dataset('glue', 'mrpc')
        dataset = dataset.map(encode, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    train_dataset = dataset['train']
    eval_dataset = dataset['validation']
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)
    eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    model, optimizer, train_loader, eval_loader = accelerator.prepare(model, optimizer, train_loader, eval_loader)

    def mean_across_processes(value):
        # Each process only sees its own shard of the data, so average the
        # per-process value over all processes before reporting it. This is
        # a collective call: every process must reach it.
        local = torch.tensor([value], dtype=torch.float32, device=accelerator.device)
        return accelerator.gather(local).mean().item()

    def train(epoch, model, loader, optimizer, device):
        model.train()
        total_loss = 0
        loop = tqdm(loader, desc=f"Training Epoch {epoch}", disable=not accelerator.is_local_main_process)
        for batch in loop:
            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            accelerator.backward(loss)
            optimizer.step()

            # Read the scalar once per step and reuse it.
            loss_value = loss.item()
            total_loss += loss_value
            loop.set_postfix(loss=loss_value)

        return mean_across_processes(total_loss / len(loader))

    def evaluate(model, loader, device):
        model.eval()
        total_loss = 0
        loop = tqdm(loader, desc="Evaluating", disable=not accelerator.is_local_main_process)
        with torch.no_grad():
            for batch in loop:
                outputs = model(**batch)
                loss_value = outputs.loss.item()
                total_loss += loss_value
                loop.set_postfix(loss=loss_value)

        return mean_across_processes(total_loss / len(loader))

    for epoch in range(1, NUM_EPOCHS + 1):
        train_loss = train(epoch, model, train_loader, optimizer, device)
        validation_loss = evaluate(model, eval_loader, device)
        if accelerator.is_local_main_process:
            print(f"Training Loss: {train_loss:.4f}, Validation Loss: {validation_loss:.4f}")

if __name__ == "__main__":
    main()
"""


# Write the script next to the notebook. A relative path is used because the
# launch commands refer to `train_script.py` by relative name, and the
# hard-coded /kaggle/working/ directory only exists on Kaggle (where the
# working directory is normally /kaggle/working, so the file lands in the
# same place there).
file_path = "train_script.py"
with open(file_path, "w") as file:
    file.write(file_content)

print(f"File created successfully: {file_path}")

In [None]:
!accelerate config

In [None]:
# Launch train_script.py on 2 GPUs: uncomment exactly one of the commands below.
# Full precision:
# !accelerate launch --multi_gpu --num_processes=2 train_script.py
# Mixed precision (fp16):
# !accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 train_script.py

Training Epoch 1: 100%|█████████████| 58/58 [00:38<00:00,  1.50it/s, loss=0.897]
Evaluating: 100%|█████████████████████| 7/7 [00:01<00:00,  4.91it/s, loss=0.598]
Training Loss: 0.6477, Validation Loss: 0.6429
Training Epoch 2: 100%|█████████████| 58/58 [00:41<00:00,  1.41it/s, loss=0.698]
Evaluating: 100%|█████████████████████| 7/7 [00:01<00:00,  4.71it/s, loss=0.594]
Training Loss: 0.6470, Validation Loss: 0.6327
Training Epoch 3: 100%|█████████████| 58/58 [00:39<00:00,  1.45it/s, loss=0.582]
Evaluating: 100%|█████████████████████| 7/7 [00:01<00:00,  4.84it/s, loss=0.602]
Training Loss: 0.6409, Validation Loss: 0.6287
Training Epoch 4: 100%|█████████████| 58/58 [00:40<00:00,  1.44it/s, loss=0.519]
Evaluating: 100%|█████████████████████| 7/7 [00:01<00:00,  4.77it/s, loss=0.594]
Training Loss: 0.6387, Validation Loss: 0.6301
Training Epoch 5: 100%|█████████████| 58/58 [00:39<00:00,  1.45it/s, loss=0.528]
Evaluating: 100%|█████████████████████| 7/7 [00:01<00:00,  4.83it/s, loss=0.596]
Tr