In [None]:
# Upload the project's helper scripts from the local machine into the Colab
# runtime; `uploaded` keeps whatever files.upload() returns.
# NOTE(review): the training cell further down imports `gpt` and
# `warmup_cosine` from /content/drive/MyDrive/gpt_starter (added to sys.path),
# not from these uploads — confirm which copy is meant to be used.
from google.colab import files
uploaded = files.upload()

Saving construct_dataset.py to construct_dataset.py
Saving download_data.py to download_data.py
Saving gpt.py to gpt.py
Saving train_runs.py to train_runs.py
Saving train.py to train.py


In [None]:
# !pip install torch tqdm matplotlib psutil
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using device: {device}')

Using device: cuda


In [None]:
# Mount Google Drive at /content/drive. Later cells read the datasets and the
# gpt_starter modules from /content/drive/MyDrive and write metrics, plots and
# weights back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Training set-up for the Baseline and Longformer model experiments; the sequence lengths that are run are listed in `sequence_lengths` below
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import os
import psutil
from tqdm import tqdm
import sys
sys.path.append('/content/drive/MyDrive/gpt_starter')

from warmup_cosine import cosine_with_warmup_lr_scheduler
import sys
# Add directory to the path
sys.path.append('/content/drive/MyDrive/gpt_starter')
# Import gpt class from file
from gpt import GPTModel
# # Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# Pick the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Mixed precision (float16 autocast) and TF32 matmuls are GPU-only settings;
# on CPU the autocast dtype stays None.
AUTOCAST_DTYPE = None
if device.type == "cuda":
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True
    AUTOCAST_DTYPE = torch.float16
COMPILE = True

# Model architecture
D_MODEL, N_HEADS, LAYERS = 512, 16, 8
VOCAB_SIZE = 10000

# Optimisation hyper-parameters
PEAK_LR = 0.0005
WARMUP_STEPS = 300
BATCH_SIZE = 32
ACCUMULATION = 4
GRAD_CLIP = 1.0

# Experimental loop setup
epochs = 5
sequence_lengths = [256, 512, 1024]

# Fixed seed so repeated runs start from the same random state.
torch.manual_seed(0)

# Loop through each model type and sequence length for the experiments.
# Every (model_type, sequence_length) pair is an independent run that builds a
# fresh model, trains it for `epochs` epochs on dataset_{sequence_length}.npy and
# writes per-batch metrics, per-epoch timings, a loss plot and the weights to Drive.
for model_type in ["baseline", "longformer"]:
    for sequence_length in sequence_lengths:
        # Initialize the model based on model type
        print("Setting up ", model_type, " model with sequence length ", sequence_length)
        model_kwargs = dict(
            d_model=D_MODEL, n_heads=N_HEADS, layers=LAYERS,
            vocab_size=VOCAB_SIZE, max_seq_len=sequence_length,
        )
        if model_type == "longformer":
            # The baseline uses no sliding window; the longformer variant adds one
            # plus a global attention node.
            model_kwargs.update(window_size=128, global_attn_nodes=[0])
        raw_model = GPTModel(**model_kwargs).to(device)
        # Keep a handle on the uncompiled module: torch.compile returns a wrapper
        # whose state_dict keys carry an `_orig_mod.` prefix, which would make the
        # saved weights awkward to load into a plain GPTModel.
        if COMPILE and hasattr(torch, "compile"):
            model = torch.compile(raw_model)
        else:
            model = raw_model

        # Load dataset from google drive
        dataset_path = f'/content/drive/MyDrive/gpt_starter/dataset_{sequence_length}.npy'
        # NOTE(review): allow_pickle=True can execute arbitrary code from an
        # untrusted file; a plain numeric array should load with allow_pickle=False.
        with open(dataset_path, 'rb') as f:
            dataset = np.load(f, allow_pickle=True)
        print("Loaded dataset of shape: ", dataset.shape, " for sequence length: ", sequence_length)
        batches_per_epoch = len(dataset) // BATCH_SIZE
        use_cuda = device.type == "cuda"

        # Initialize optimizer, scheduler, and loss function
        opt = torch.optim.AdamW(model.parameters(), lr=PEAK_LR)
        # scheduler.step() is called once per batch for every epoch, so the schedule
        # has to span the whole run rather than a single epoch.
        # NOTE(review): assumes the second argument is the total number of
        # scheduler steps — confirm against warmup_cosine.py.
        scheduler = cosine_with_warmup_lr_scheduler(opt, epochs * batches_per_epoch, WARMUP_STEPS)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)
        scaler = torch.cuda.amp.GradScaler() if use_cuda else None

        # Metrics tracking
        losses = []
        perplexities = []
        times = []
        memory_usages = []
        gpu_peak_memory = []
        # Plain records are collected here and turned into a DataFrame once at the
        # end; growing a DataFrame with pd.concat on every batch is quadratic.
        metric_rows = []

        # Training
        start_time = time.time()
        for epoch in range(epochs):
            print("Running ", model_type.upper(), " Model with sequence length ", sequence_length, ", Epoch ", epoch + 1, "/", epochs)
            # Track epoch time
            epoch_start_time = time.time()
            epoch_memory_usage_start = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)  # Memory in MB
            if use_cuda:
                # Host RSS says little about attention memory on the GPU, so the
                # per-epoch peak of allocated device memory is recorded as well.
                torch.cuda.reset_peak_memory_stats(device)

            opt.zero_grad(set_to_none=True)
            # Add tqdm for batch progress bar
            with tqdm(total=batches_per_epoch, desc=f"Epoch {epoch + 1}/{epochs}") as pbar:
                for b in range(batches_per_epoch):
                    # Prepare the batch: inputs are every token but the last,
                    # targets are the same rows shifted left by one.
                    x = dataset[BATCH_SIZE * b:BATCH_SIZE * (b + 1), :]
                    x = torch.from_numpy(x).to(device)
                    inp = x[:, :-1]
                    targ = x[:, 1:]

                    # Forward pass (autocast is disabled when no dtype is configured, i.e. on CPU)
                    with torch.autocast(device_type=device.type, dtype=AUTOCAST_DTYPE,
                                        enabled=AUTOCAST_DTYPE is not None):
                        y = model(inp)
                        # CrossEntropyLoss expects the class dimension at index 1.
                        y = y.transpose(1, 2)
                        loss = loss_fn(y, targ)

                    # Backpropagation. The loss is divided by ACCUMULATION so the
                    # accumulated gradient is the mean over the micro-batches, not the sum.
                    scaled_loss = loss / ACCUMULATION
                    if scaler is not None:
                        scaler.scale(scaled_loss).backward()
                    else:
                        scaled_loss.backward()

                    # Gradient accumulation and optimization. The last batch of an
                    # epoch also triggers a step so leftover gradients are not
                    # dropped or carried into the next epoch.
                    if (b + 1) % ACCUMULATION == 0 or (b + 1) == batches_per_epoch:
                        if scaler is not None:
                            # Gradients must be unscaled before clipping.
                            scaler.unscale_(opt)
                        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
                        if scaler is not None:
                            scaler.step(opt)
                            scaler.update()
                        else:
                            opt.step()
                        opt.zero_grad(set_to_none=True)

                    scheduler.step()

                    # Log loss and perplexity (the unscaled per-batch loss)
                    loss_value = loss.item()
                    perplexity = torch.exp(loss.detach()).item()
                    losses.append(loss_value)
                    perplexities.append(perplexity)
                    metric_rows.append({"Epoch": epoch + 1, "Batch": b + 1, "Loss": loss_value, "Perplexity": perplexity})

                    # Update progress bar
                    pbar.set_postfix({"Loss": loss_value, "Perplexity": perplexity})
                    pbar.update(1)

            # Get timing metrics
            epoch_time = time.time() - epoch_start_time
            epoch_memory_usage_end = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)
            epoch_memory_usage = epoch_memory_usage_end - epoch_memory_usage_start
            times.append(epoch_time)
            memory_usages.append(epoch_memory_usage)
            gpu_peak_memory.append(
                torch.cuda.max_memory_allocated(device) / (1024 ** 2) if use_cuda else float("nan")
            )

            print("Epoch ", epoch + 1, " complete. Time: ", epoch_time, " seconds, Memory Usage: ", epoch_memory_usage, " MB")

        print("Total training time: ", time.time() - start_time, " seconds")

        # Save metrics (one row per batch; written once so nothing is overwritten)
        metrics_df = pd.DataFrame(metric_rows, columns=["Epoch", "Batch", "Loss", "Perplexity"])
        metrics_df.to_csv(f"/content/drive/MyDrive/{model_type}_model_seq{sequence_length}_metrics.csv", index=False)

        # Plot loss curve on its own figure so runs do not share axes
        fig, ax = plt.subplots()
        ax.plot(losses, label="Cross entropy Loss")
        ax.set_xlabel("Batch")
        ax.set_ylabel("Loss")
        ax.set_title(f"{model_type} model, sequence length {sequence_length}")
        ax.legend()
        fig.savefig(f"/content/drive/MyDrive/{model_type}_model_loss_seq{sequence_length}.png")
        plt.show()
        plt.close(fig)

        # Save model weights from the uncompiled module (clean parameter names)
        torch.save(raw_model.state_dict(), f"/content/drive/MyDrive/{model_type}_model_seq{sequence_length}_weights.pt")
        # Save per-epoch times and memory usages
        pd.DataFrame({
            "Epoch": range(1, epochs + 1),
            "Time (s)": times,
            "Memory Usage (MB)": memory_usages,
            "Peak GPU Memory (MB)": gpu_peak_memory,
        }).to_csv(f"/content/drive/MyDrive/{model_type}_model_seq{sequence_length}_times_memory.csv", index=False)


Using device: cuda
Setting up BASELINE model with sequence length 256
Loaded dataset of shape: (512080, 257) for sequence length: 256
Running BASELINE Model with sequence length 256, Epoch 1/5


  metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)
Epoch 1/5: 100%|██████████| 16002/16002 [47:32<00:00,  5.61it/s, Loss=0.0196, Perplexity=1.02]


Epoch 1 complete. Time: 2852.13 seconds, Memory Usage: -826.03 MB
Running BASELINE Model with sequence length 256, Epoch 2/5


Epoch 2/5:  91%|█████████ | 14542/16002 [42:52<04:14,  5.73it/s, Loss=0.0233, Perplexity=1.02]

In [None]:
# Install pinned, pre-built versions of transformers and tokenizers
# NOTE(review): `%pip` (rather than `!pip`) targets the running kernel's environment.
!pip install transformers==4.31.0 tokenizers==0.13.3
# Clone the Longformer repository
# NOTE(review): re-running this cell fails at the clone once the directory exists.
!git clone https://github.com/allenai/longformer.git
# Move to the longformer directory
# NOTE(review): %cd changes the kernel's working directory for every later cell.
%cd longformer
# Install the packages listed in requirements.txt; --no-deps skips their
# dependencies (it does not skip packages that are already installed)
!pip install -r requirements.txt --no-deps


Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m112.6/116.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers==0.13.3
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.0 MB/s[0m e

In [None]:
# !git clone https://github.com/allenai/longformer.git
# !cd longformer/longformer
# !python setup.py install
# !pip install torch transformers
# Debugging aid: request synchronous CUDA kernel launches.
# NOTE(review): `os` is not imported in this cell, so it only runs after a cell
# that imported it; presumably the variable also needs to be set before CUDA is
# first used in the session to have any effect — confirm.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"



In [None]:
# Attempt at using the Allen AI Longformer (via Hugging Face transformers) to train on 1024-token sequences
# NOTE(review): load_longformer below builds LongformerModel from a config, so the
# model weights are freshly initialised; only the tokenizer comes from from_pretrained.
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import os
# Debugging switches, set through the environment before the rest of the cell runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['TORCH_COMPILE_DEBUG'] = '1'
os.environ['TORCHINDUCTOR_DISABLE'] = '1'  # Disable Triton - debug
import psutil
from tqdm import tqdm
import sys
from google.colab import drive
# Mount Drive and expose the gpt_starter folder so warmup_cosine can be imported.
drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/gpt_starter/')
from warmup_cosine import cosine_with_warmup_lr_scheduler
import torch._dynamo
# Debugging aid: ask torch._dynamo to suppress its errors.
torch._dynamo.config.suppress_errors = True

# Import Longformer from the transformers library
from transformers import LongformerConfig, LongformerModel, LongformerTokenizer

# Pick the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    # TF32 matmuls are a GPU-only setting.
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True

# Model architecture
D_MODEL, N_HEADS, LAYERS = 256, 8, 6
VOCAB_SIZE = 10000

# Optimisation hyper-parameters
PEAK_LR = 0.0005
WARMUP_STEPS = 300
BATCH_SIZE = 4  # Reduced batch size for debugging
ACCUMULATION = 4
GRAD_CLIP = 1.0

# Experiments
epochs = 5
sequence_lengths = [1024]
checkpoint_frequency = 500

# Fixed seed so repeated runs start from the same random state.
torch.manual_seed(0)

# Load the Longformer model
def load_longformer(sequence_length):
    """Build a Longformer encoder sized for `sequence_length`-token inputs.

    The model is created from a config, so its weights are randomly initialised;
    only the tokenizer is loaded from the `allenai/longformer-base-4096`
    checkpoint. The config is derived from the notebook constants VOCAB_SIZE,
    D_MODEL, LAYERS and N_HEADS instead of the library defaults, and the
    position-embedding table is sized from `sequence_length`: the library
    default of 512 positions is too small for 1024-token inputs and leads to an
    out-of-range embedding lookup.

    Args:
        sequence_length: number of tokens in each model input.

    Returns:
        (model, tokenizer): a `LongformerModel` and a `LongformerTokenizer`.
    """
    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    # NOTE(review): the tokenizer is only used by the training loop for its
    # pad_token_id; its vocabulary is not the VOCAB_SIZE-token vocabulary of the
    # dataset — confirm that this pad id matches the dataset's padding id.
    pad_token_id = tokenizer.pad_token_id
    config = LongformerConfig(
        attention_window=512,
        vocab_size=VOCAB_SIZE,
        hidden_size=D_MODEL,
        num_hidden_layers=LAYERS,
        num_attention_heads=N_HEADS,
        intermediate_size=4 * D_MODEL,
        pad_token_id=pad_token_id,
        # Position ids in the Hugging Face implementation start after the padding
        # index, so the table needs sequence_length + pad_token_id + 1 rows.
        max_position_embeddings=sequence_length + pad_token_id + 1,
    )
    model = LongformerModel(config)
    return model, tokenizer
# Training
# Trains the Longformer encoder with a next-token objective: inputs are every
# token but the last, targets are the same rows shifted left by one.
# NOTE(review): LongformerModel is presumably a bidirectional encoder, so each
# position may attend to the tokens it is asked to predict — confirm this
# objective is what is intended.
for model_type in ["longformer"]:
    for sequence_length in sequence_lengths:
        print("Setting up ", model_type, " model with sequence length ", sequence_length)
        if model_type == "longformer":
            model, tokenizer = load_longformer(sequence_length)
        model = model.to(device)
        # LongformerModel returns hidden states, not vocabulary logits, so a linear
        # head projects them to VOCAB_SIZE classes for the cross-entropy loss.
        lm_head = torch.nn.Linear(model.config.hidden_size, VOCAB_SIZE).to(device)
        trainable_params = list(model.parameters()) + list(lm_head.parameters())

        # Load dataset
        dataset_path = f'/content/drive/MyDrive/gpt_starter/dataset_{sequence_length}.npy'
        # NOTE(review): allow_pickle=True can execute arbitrary code from an
        # untrusted file; a plain numeric array should load with allow_pickle=False.
        with open(dataset_path, 'rb') as f:
            dataset = np.load(f, allow_pickle=True)
        print("Loaded dataset of shape: ", dataset.shape, " for sequence length: ", sequence_length)
        batches_per_epoch = len(dataset) // BATCH_SIZE

        # Fail early with a readable message instead of an opaque CUDA device-side
        # assert when an embedding lookup would go out of range.
        max_token_id = int(dataset.max())
        if max_token_id >= model.config.vocab_size:
            raise ValueError(
                f"Dataset contains token id {max_token_id}, but the model vocabulary has "
                f"only {model.config.vocab_size} entries"
            )
        input_length = dataset.shape[1] - 1
        # Position ids start after the padding index in the Hugging Face implementation.
        required_positions = input_length + (model.config.pad_token_id or 0) + 1
        if required_positions > model.config.max_position_embeddings:
            raise ValueError(
                f"Inputs of {input_length} tokens need max_position_embeddings >= "
                f"{required_positions}, but the model has {model.config.max_position_embeddings}"
            )

        # Initialize optimizer, scheduler, and loss function
        opt = torch.optim.AdamW(trainable_params, lr=PEAK_LR)
        # scheduler.step() is called once per batch for every epoch, so the schedule
        # has to span the whole run rather than a single epoch.
        # NOTE(review): assumes the second argument is the total number of
        # scheduler steps — confirm against warmup_cosine.py.
        scheduler = cosine_with_warmup_lr_scheduler(opt, epochs * batches_per_epoch, WARMUP_STEPS)
        # Built once; targets equal to the tokenizer's pad id are ignored.
        loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id).to(device)

        # Per-batch losses for this run (defined here so the cell does not depend
        # on a list left behind by an earlier cell).
        losses = []

        # Training loop
        start_time = time.time()
        for epoch in range(epochs):
            print("Running ", model_type, " Model with sequence length ", sequence_length, ", Epoch ", epoch + 1, "/", epochs)
            epoch_start_time = time.time()

            opt.zero_grad(set_to_none=True)
            # Add progress bar
            with tqdm(total=batches_per_epoch, desc=f"Epoch {epoch + 1}/{epochs}") as pbar:
                for b in range(batches_per_epoch):
                    # Prepare batch
                    x = dataset[BATCH_SIZE * b:BATCH_SIZE * (b + 1), :]
                    x = torch.from_numpy(x).to(device)
                    inp = x[:, :-1]
                    targ = x[:, 1:]
                    if model_type == "longformer":
                        # Create attention_mask based on non-padding tokens
                        attention_mask = (inp != tokenizer.pad_token_id).long()

                        # Debug output for the very first batch only, so the log is not flooded
                        if epoch == 0 and b == 0:
                            print("Input IDs shape: ", inp.shape, ", Max token ID: ", inp.max(), ", Min token ID: ", inp.min())
                            print("Attention Mask shape: ", attention_mask.shape, ", Max: ", attention_mask.max(), ", Min: ", attention_mask.min())

                        hidden = model(input_ids=inp, attention_mask=attention_mask).last_hidden_state

                        # Calculate loss; CrossEntropyLoss expects the class dimension at index 1
                        y = lm_head(hidden).transpose(1, 2)
                        assert y.shape[1] == VOCAB_SIZE, f"Expected logits of {VOCAB_SIZE} vocab size, got {y.shape[1]}"
                        loss = loss_fn(y, targ)

                    # Backpropagation. The loss is divided by ACCUMULATION so the
                    # accumulated gradient is the mean over the micro-batches, not the sum.
                    (loss / ACCUMULATION).backward()
                    # Gradient accumulation and optimization. The last batch of an
                    # epoch also triggers a step so leftover gradients are not dropped.
                    if (b + 1) % ACCUMULATION == 0 or (b + 1) == batches_per_epoch:
                        torch.nn.utils.clip_grad_norm_(trainable_params, GRAD_CLIP)
                        opt.step()
                        opt.zero_grad(set_to_none=True)
                    scheduler.step()

                    # Log loss and perplexity
                    loss_value = loss.item()
                    losses.append(loss_value)
                    perplexity = torch.exp(loss.detach()).item()
                    # Update progress bar
                    pbar.set_postfix({"Loss": loss_value, "Perplexity": perplexity})
                    pbar.update(1)

            epoch_time = time.time() - epoch_start_time
            print(f"Epoch {epoch + 1} complete. Time: {epoch_time:.2f} seconds")

        print(f"Total training time: {time.time() - start_time:.2f} seconds")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Setting up LONGFORMER model with sequence length 1024


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded dataset of shape: (128019, 1025) for sequence length: 1024
Running LONGFORMER Model with sequence length 1024, Epoch 1/5


Epoch 1/5:   0%|          | 0/32004 [00:00<?, ?it/s]

Input IDs shape: torch.Size([4, 1024]), Max token ID: 9990, Min token ID: 0
Attention Mask shape: torch.Size([4, 1024]), Max: 1.0, Min: 1.0


Epoch 1/5:   0%|          | 0/32004 [00:01<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
