In [1]:
!pip install datasets



In [2]:
!pip install torchtext==0.17.0 torch==2.2.0

Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting torch==2.2.0
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.1 (from torchtext==0.17.0)
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB

In [9]:
%%writefile main.py

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchtext

import tqdm
import matplotlib.pyplot as plt
import numpy as np

import copy
import random
import time
import os
import json

from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp

# Global experiment configuration. The module is imported by every worker
# process, so each of them sees the same values.
SEED = 1234  # seed applied to the Python, NumPy and PyTorch RNGs below
ROOT = "."  # base directory of the results file and of the model directory
MODEL_NAME = "NBoW"  # appears in the results file name and in the scenario key
SENARIO = "2GPU"  # scenario tag embedded in the results file name
EPOCHS = 20  # default number of training epochs
BATCH_SIZE = 512  # default batch size of the data loaders

outdir = "./my_datasets"  # directory intended as the Hugging Face datasets cache
os.makedirs(outdir, exist_ok=True)
# NOTE(review): this assignment runs after `import datasets` at the top of the
# file; presumably the library reads HF_DATASETS_CACHE at import time, in which
# case only processes started afterwards pick it up -- confirm.
os.environ['HF_DATASETS_CACHE'] = outdir


# Seed every random number generator used by the script.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

"""# 2. Initialize the DDP Environment"""

def setup(rank, world_size):
    """Join the NCCL process group used for distributed training.

    Args:
        rank: Index of this process in the group; also used as the index of
            the CUDA device this process works on.
        world_size: Total number of processes in the group.
    """
    # Rendezvous endpoint. Values already present in the environment are kept,
    # so a multi-machine launch can point the workers at the master node
    # without editing this file.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12345')  # must be a free port on the master node
    # Bind this process to its own GPU before creating the group so that NCCL
    # collectives (e.g. barrier) run on the right device instead of guessing.
    torch.cuda.set_device(rank)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Destroy the default process group if one is currently initialized.

    Calling this when no group exists (for example after a failed setup, or
    twice in a row) is a no-op instead of an error.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

"""# 3. Define a Model."""

class NBoW(nn.Module):
    """Neural bag-of-words classifier.

    Token ids are embedded, the embeddings are averaged over the sequence
    dimension, and the average is mapped to class scores by a linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_index,
        )
        self.fc = nn.Linear(in_features=embedding_dim, out_features=output_dim)

    def forward(self, ids):
        """Map ids of shape [batch size, seq len] to scores [batch size, output dim]."""
        # [batch size, seq len, embedding dim]
        token_vectors = self.embedding(ids)
        # [batch size, embedding dim]
        sentence_vector = torch.mean(token_vectors, dim=1)
        # [batch size, output dim]
        return self.fc(sentence_vector)



def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def create_model(vocab, output_dim, pad_index, embedding_dim = 300):
    """Build an NBoW model whose embedding is initialised from GloVe vectors.

    Args:
        vocab: Vocabulary providing `len()` and `get_itos()`.
        output_dim: Number of output classes.
        pad_index: Index of the padding token in `vocab`.
        embedding_dim: Width of the embedding layer. It must match the width
            of the vectors returned by `torchtext.vocab.GloVe()`.

    Returns:
        The initialised (not yet device-placed) NBoW model.

    Raises:
        ValueError: If the pretrained vectors do not have the same shape as
            the embedding weight.
    """
    vocab_size = len(vocab)
    model = NBoW(vocab_size, embedding_dim, output_dim, pad_index)
    print(f'The model has {count_parameters(model):,} trainable parameters')

    vectors = torchtext.vocab.GloVe()
    pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

    # Assigning to `.weight.data` would accept any shape and only fail later,
    # inside the forward pass, so validate the shape here.
    if pretrained_embedding.shape != model.embedding.weight.shape:
        raise ValueError(
            f'Pretrained vectors have shape {tuple(pretrained_embedding.shape)} but the '
            f'embedding layer expects {tuple(model.embedding.weight.shape)}'
        )

    # Copy in place so the parameter keeps its identity, dtype and device.
    with torch.no_grad():
        model.embedding.weight.copy_(pretrained_embedding)

    return model

"""# 4. Create a Dummy Dataset"""

def create_dataloader(rank, world_size, batch_size=BATCH_SIZE, root = ROOT, max_length = 256):
    """Build the IMDB train/validation/test data loaders for one DDP process.

    The reviews are tokenized, truncated to `max_length` tokens and mapped to
    vocabulary indices. The training and validation loaders use a
    `DistributedSampler`, so each rank sees its own shard; the test loader
    iterates over the full test set on every rank.

    Args:
        rank: Index of the calling process.
        world_size: Total number of processes.
        batch_size: Batch size of every loader (per process).
        root: Unused; kept for backward compatibility.
        max_length: Maximum number of tokens kept per review.

    Returns:
        Tuple `(train_dataloader, val_dataloader, test_dataloader, vocab,
        output_dim, pad_index)`.
    """
    def tokenize_example(example, tokenizer, max_length):
        tokens = tokenizer(example["text"])[:max_length]
        return {"tokens": tokens}

    # The script sets HF_DATASETS_CACHE only after `datasets` has been imported,
    # and the library may read that variable only at import time. Passing the
    # directory explicitly makes the cache location unambiguous (None keeps the
    # library default).
    cache_dir = os.environ.get("HF_DATASETS_CACHE")

    def load_imdb():
        return datasets.load_dataset("imdb", split=["train", "test"], cache_dir=cache_dir)

    ## load the data: rank 0 downloads first, the others then read the cache
    if rank == 0:
        load_imdb()

    dist.barrier()  # Ensure all processes wait for the dataset to be downloaded

    train_data, test_data = load_imdb()

    ## Tokenization
    tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
    tokenize_kwargs = {"tokenizer": tokenizer, "max_length": max_length}
    train_data = train_data.map(tokenize_example, fn_kwargs=tokenize_kwargs)
    test_data = test_data.map(tokenize_example, fn_kwargs=tokenize_kwargs)

    ## create the validation split
    # NOTE(review): the validation set is carved out of the IMDB *test* split,
    # leaving the training split untouched -- confirm this is intended.
    test_size = 0.25

    # Explicit seed: every rank must end up with exactly the same split,
    # independently of the state of the global NumPy RNG.
    train_valid_data = test_data.train_test_split(test_size=test_size, seed=SEED)
    test_data = train_valid_data["train"]
    valid_data = train_valid_data["test"]

    ## Creating a Vocabulary
    min_freq = 5
    special_tokens = ["<unk>", "<pad>"]

    vocab = torchtext.vocab.build_vocab_from_iterator(
        train_data["tokens"],
        min_freq=min_freq,
        specials=special_tokens,
    )

    if rank == 0:
        print(f"Vocabulary size: {len(vocab)}")
        print(f'Number of training examples: {len(train_data)}')
        print(f'Number of validation examples: {len(valid_data)}')
        print(f'Number of testing examples: {len(test_data)}')

    unk_index = vocab["<unk>"]
    pad_index = vocab["<pad>"]
    # Tokens missing from the vocabulary map to <unk>.
    vocab.set_default_index(unk_index)

    ## Numericalizing Data
    def numericalize_example(example, vocab):
        ids = vocab.lookup_indices(example["tokens"])
        return {"ids": ids}

    train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
    valid_data = valid_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
    test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

    train_data = train_data.with_format(type="torch", columns=["ids", "label"])
    valid_data = valid_data.with_format(type="torch", columns=["ids", "label"])
    test_data = test_data.with_format(type="torch", columns=["ids", "label"])

    ## Creating Data Loaders
    def get_collate_fn(pad_index):
        def collate_fn(batch):
            # Pad every sequence of the batch to the length of the longest one.
            batch_ids = [i["ids"] for i in batch]
            batch_ids = nn.utils.rnn.pad_sequence(
                batch_ids, padding_value=pad_index, batch_first=True
            )
            batch_label = [i["label"] for i in batch]
            batch_label = torch.stack(batch_label)
            batch = {"ids": batch_ids, "label": batch_label}
            return batch

        return collate_fn

    collate_fn = get_collate_fn(pad_index)
    train_sampler = DistributedSampler(train_data, num_replicas=world_size, rank=rank, shuffle=True)
    # DistributedSampler shuffles by default; validation does not need it.
    val_sampler = DistributedSampler(valid_data, num_replicas=world_size, rank=rank, shuffle=False)

    train_dataloader = data.DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, sampler=train_sampler, pin_memory=True) #use num_workers > 0 for better performance
    val_dataloader = data.DataLoader(valid_data, batch_size=batch_size, collate_fn=collate_fn, sampler=val_sampler, pin_memory=True) #use num_workers > 0 for better performance
    test_dataloader = data.DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=False, pin_memory=True) #no sampling for test dataset

    output_dim = len(train_data.unique("label"))
    return train_dataloader, val_dataloader, test_dataloader, vocab, output_dim, pad_index

"""# 5. Implement the Training Loop

## a. Helper functions
"""

# Path of the JSON file in which log_results() accumulates results; the name
# encodes the model name, the epoch count and the scenario tag.
RESULTS_FILE = f"{ROOT}/{MODEL_NAME}_{EPOCHS}epochs_{SENARIO}.json"

def log_results(scenario, results, results_file=None):
    """
    Save results to a JSON file for comparison across scenarios.

    The file holds one JSON object keyed by scenario name; an existing entry
    with the same key is overwritten, all other entries are preserved.

    Args:
        scenario: Key under which `results` is stored.
        results: JSON-serialisable results for that scenario.
        results_file: Path of the JSON file; defaults to RESULTS_FILE.

    Note:
        The file is replaced atomically, so a concurrent reader never sees a
        half-written file. Two processes that read-modify-write at the same
        time can still lose one of the two updates; callers running on
        several ranks should serialise their calls.
    """
    path = RESULTS_FILE if results_file is None else results_file

    all_results = {}
    if os.path.exists(path):
        with open(path, 'r') as f:
            content = f.read()
        # An empty file (e.g. left behind by an interrupted run) holds no results.
        if content.strip():
            all_results = json.loads(content)

    all_results[scenario] = results

    # Write to a process-specific temporary file, then swap it into place.
    tmp_path = f"{path}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(all_results, f, indent=4)
        os.replace(tmp_path, path)
    finally:
        # Only present here if the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def get_accuracy(prediction, label):
    """Return the fraction of rows of `prediction` whose argmax equals `label`.

    `prediction` must be two-dimensional ([batch size, output dim]); the
    result is a zero-dimensional tensor.
    """
    n_examples, _ = prediction.shape
    n_hits = (prediction.argmax(dim=-1) == label).sum()
    return n_hits / n_examples

def epoch_time(start_time, end_time):
    """Split the duration between two timestamps (seconds) into whole minutes and seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

"""## b. train function"""
def train(model, data_loader, criterion, optimizer, rank):
    """Run one training epoch on device `rank`.

    Returns:
        Tuple `(mean loss, mean accuracy)`, each averaged over the batches.
    """
    model.train()
    batch_losses, batch_accs = [], []
    progress = tqdm.tqdm(data_loader, desc=f"Training on the rank {rank}...")
    for step, batch in enumerate(progress):
        ids, label = batch["ids"].to(rank), batch["label"].to(rank)

        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(accuracy.item())

        # Progress line every 50 batches, printed by rank 0 only.
        if rank == 0 and step % 50 == 0:
            print(f"- On Training: {step} was passed over  {len(data_loader)}")
    return np.mean(batch_losses), np.mean(batch_accs)

"""## c. Validation function"""

def evaluate(model, data_loader, criterion, rank, mode = "Evaluating"):
    """Compute loss and accuracy over `data_loader` on device `rank`, without gradients.

    `mode` only changes the label used in the progress output.

    Returns:
        Tuple `(mean loss, mean accuracy)`, each averaged over the batches.
    """
    model.eval()
    batch_losses, batch_accs = [], []
    with torch.no_grad():
        progress = tqdm.tqdm(data_loader, desc=f"{mode} on the rank {rank}...")
        for step, batch in enumerate(progress):
            ids, label = batch["ids"].to(rank), batch["label"].to(rank)

            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)

            batch_losses.append(loss.item())
            batch_accs.append(accuracy.item())

            # Progress line every 50 batches, printed by rank 0 only.
            if rank == 0 and step % 50 == 0:
                print(f"- On {mode}: {step} was passed over  {len(data_loader)}")
    return np.mean(batch_losses), np.mean(batch_accs)

"""## d. Main loop"""

# Directory in which the best checkpoint is stored. The module is imported by
# every spawned worker, so creation must tolerate the directory having just
# been created by another process (a check-then-create would race).
outdir = f'{ROOT}/model/'
os.makedirs(outdir, exist_ok=True)

def main_train(rank, world_size, root = outdir, num_epochs = EPOCHS, model_name = MODEL_NAME):
    """Entry point of one DDP worker: train, validate, test and log results.

    Args:
        rank: Index of this process; also the index of the GPU it uses.
        world_size: Total number of worker processes.
        root: Directory in which the best checkpoint is stored.
        num_epochs: Number of training epochs.
        model_name: Name used in the key of the logged results.
    """
    ## a. Set up the distributed process groups
    setup(rank, world_size)
    print(f"Process {rank} initialized.")

    try:
        _run_training(rank, world_size, root, num_epochs, model_name)
    finally:
        # Release the process group even when training raised.
        cleanup()
    print(f'Process {rank} finished training.')


def _average_across_ranks(value, device, world_size):
    """Return the mean of the scalar `value` over all ranks (collective call)."""
    buffer = torch.tensor([float(value)], dtype=torch.float64, device=device)
    dist.all_reduce(buffer, op=dist.ReduceOp.SUM)
    return buffer.item() / world_size


def _run_training(rank, world_size, root, num_epochs, model_name):
    """Train for `num_epochs`, test the best checkpoint and log this rank's results."""
    device = torch.device('cuda', rank)
    checkpoint_path = os.path.join(root, 'tut-model.pt')

    ## b. Create Model, DataLoader
    train_dataloader, val_dataloader, test_dataloader, vocab, output_dim, pad_index = create_dataloader(rank, world_size)
    model = create_model(vocab, output_dim, pad_index).to(rank)

    ## c. Wrap the model with DistributedDataParallel
    ddp_model = DDP(model, device_ids=[rank])

    ## d. Loss and Optimizer
    criterion = nn.CrossEntropyLoss().to(rank) # Move loss to GPU
    optimizer = optim.Adam(ddp_model.parameters())

    ## e. Training Loop
    best_valid_loss = float('inf')
    training_times = []
    train_losses = []
    train_accurcy = []
    validation_times = []
    validation_losses = []
    validation_accurcy = []
    epoch_times = []

    for epoch in range(num_epochs):
        # Without set_epoch the distributed sampler yields the same ordering
        # in every epoch.
        if isinstance(train_dataloader.sampler, DistributedSampler):
            train_dataloader.sampler.set_epoch(epoch)

        start_epoch_time = time.monotonic()
        start_time = time.monotonic()

        train_loss, train_acc = train(ddp_model, train_dataloader, criterion, optimizer, rank)
        train_time = time.monotonic() - start_time
        training_times.append(train_time)
        train_losses.append(train_loss)
        train_accurcy.append(train_acc)

        start_time = time.monotonic()
        valid_loss, valid_acc = evaluate(ddp_model, val_dataloader, criterion, rank)
        val_time = time.monotonic() - start_time
        validation_times.append(val_time)
        validation_losses.append(valid_loss)
        validation_accurcy.append(valid_acc)

        # Each rank only sees its own validation shard. Decide on the loss
        # averaged over all ranks so every rank takes the same decision, and
        # let a single rank write the shared checkpoint file.
        global_valid_loss = _average_across_ranks(valid_loss, device, world_size)
        if global_valid_loss < best_valid_loss:
            best_valid_loss = global_valid_loss
            if rank == 0:
                torch.save(ddp_model.state_dict(), checkpoint_path)

        end_time = time.monotonic()
        e_time = end_time - start_epoch_time
        epoch_times.append(e_time)
        epoch_mins, epoch_secs = epoch_time(start_epoch_time, end_time)

        print(f'--------------|     On process {rank}      |----------------')
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    ## f. test after train
    # Wait until rank 0 has finished writing the checkpoint, then load it
    # straight onto this rank's GPU (the tensors were saved from cuda:0).
    dist.barrier(device_ids=[rank])
    ddp_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    start_time = time.monotonic()
    test_loss, test_acc = evaluate(ddp_model, test_dataloader, criterion, rank, mode = "Testing")
    test_time = time.monotonic() - start_time
    print(f'Test results on process {rank}: Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    # Log results
    results = {
        "world_size": world_size,
        "rank": rank,
        "training_times": training_times,
        "train_losses": train_losses,
        "train_accurcy": train_accurcy,
        "validation_times": validation_times,
        "validation_losses": validation_losses,
        "validation_accurcy": validation_accurcy,
        "test_time": test_time,
        "test_loss": test_loss,
        "test_acc": test_acc,
        "epoch_times": epoch_times
     }

    scenario = f"model_{model_name}_epochs_{num_epochs}_{world_size}_GPUs_rank_{rank}"
    # All ranks append to the same JSON file; take turns so that no
    # read-modify-write cycle overlaps with another one.
    for writer_rank in range(world_size):
        if rank == writer_rank:
            log_results(scenario, results)
        dist.barrier(device_ids=[rank])

"""# 6. Main Execution"""
if __name__ == "__main__":

    def main():
        """Launch training on every visible GPU, or report that none was found."""
        world_size = torch.cuda.device_count()
        print(f'Total number of devices detected: {world_size}')

        if world_size < 1:
            print('no GPUs found. Please make sure you have configured CUDA correctly')
            return

        if world_size == 1:
            # A single GPU needs no extra processes: run the worker in place.
            main_train(rank=0, world_size=1)
            return

        # One worker process per GPU; spawn passes the rank as first argument.
        mp.spawn(main_train, args=(world_size,), nprocs=world_size, join=True)

    main()

Overwriting main.py


In [10]:
ls

main.py  [0m[01;34mmodel[0m/  [01;34mmy_datasets[0m/  NBoW_5epochs_2GPU.json


In [None]:
!python main.py

Total number of devices detected: 2
Process 0 initialized.
Process 1 initialized.
Vocabulary size: 24897
Number of training examples: 25000
Number of validation examples: 6250
Number of testing examples: 18750
The model has 7,469,702 trainable parameters
The model has 7,469,702 trainable parameters
Training on the rank 0...:   0%|                         | 0/25 [00:00<?, ?it/s]