In [None]:
!pip install datasets transformers
!pip install accelerate
!pip install optimum
!pip install datasets==2.15.0
!pip install wandb

In [1]:
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup



In [3]:
from datasets import load_dataset
datasets = load_dataset("wikimedia/wikisource", "20231201.en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
datasets = datasets['train'].train_test_split(test_size=0.005)
data_train = datasets['test']


In [5]:
model_checkpoint = "gpt2"

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512, padding="max_length")
tokenized_datasets = data_train.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text",'id','url','title'])

Map (num_proc=2):   0%|          | 0/1042 [00:00<?, ? examples/s]

In [8]:
tokenized_datasets=tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = tokenized_datasets['train']

eval_test = tokenized_datasets['test'].train_test_split(test_size=0.5)
val_dataset = eval_test['train']
test_dataset = eval_test['test']

In [9]:
# Assuming you want to remove the first sample
index_to_remove = 0

# Filter the dataset to exclude the sample with the specified index
train_dataset = train_dataset.filter(lambda example, idx: idx != index_to_remove, with_indices=True)

# Print the updated dataset
print(train_dataset)


Filter:   0%|          | 0/937 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 936
})


In [10]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_masks = [item["attention_mask"] for item in batch]
    labels = [item["input_ids"] for item in batch]

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    # Pad sequences to the same length
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
    }

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False,collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False,collate_fn=collate_fn)


In [12]:
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[ 1212,  2223,   373,  ..., 50256, 50256, 50256],
        [49580,    12,    66,  ..., 50256, 50256, 50256],
        [ 2202,  2805,  1987,  ...,    13, 14021,    11],
        [   44,  4146, 15543,  ...,   262, 16629,   290]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ 1212,  2223,   373,  ..., 50256, 50256, 50256],
        [49580,    12,    66,  ..., 50256, 50256, 50256],
        [ 2202,  2805,  1987,  ...,    13, 14021,    11],
        [   44,  4146, 15543,  ...,   262, 16629,   290]])}


In [13]:
# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False,torch_dtype=torch.float16)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [14]:
# some parameters I cooked up that work reasonably well

epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [15]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )



In [16]:
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

In [17]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [18]:
import wandb
wandb.init(project="gpt2_wiki",name="Normal(seq_512_batch_4)" )
total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch['input_ids'].to(device)
        b_labels = batch['labels'].to(device)
        b_masks = batch['attention_mask'].to(device)

        model.zero_grad()

        outputs = model(  b_input_ids,
                          labels=b_labels,
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
        loss.backward()

        optimizer.step()

        scheduler.step()
        current_lr = scheduler.get_last_lr()[0]
        wandb.log({"Learning Rate": current_lr})

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in val_dataloader:

        b_input_ids = batch['input_ids'].to(device)
        b_labels = batch['labels'].to(device)
        b_masks = batch['attention_mask'].to(device)

        with torch.no_grad():

            outputs  = model(b_input_ids,
#                            token_type_ids=None,
                             attention_mask = b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(val_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    wandb.log({"epoch": epoch_i + 1, "Training Loss": avg_train_loss, "Validation Loss": avg_val_loss})
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

[34m[1mwandb[0m: Currently logged in as: [33mpepoo20[0m ([33mhtx_ai_101[0m). Use [1m`wandb login --relogin`[0m to force relogin



Training...
  Batch   100  of    234. Loss: 2.4007809162139893.   Elapsed: 0:01:02.
  Batch   200  of    234. Loss: 1.226928949356079.   Elapsed: 0:02:02.

  Average training loss: 2.84
  Training epoch took: 0:02:23

Running Validation...
  Validation Loss: 2.70
  Validation took: 0:00:03

Training...
  Batch   100  of    234. Loss: 2.660759925842285.   Elapsed: 0:01:01.
  Batch   200  of    234. Loss: 2.5243844985961914.   Elapsed: 0:02:01.

  Average training loss: 2.36
  Training epoch took: 0:02:22

Running Validation...
  Validation Loss: 2.72
  Validation took: 0:00:03

Training...
  Batch   100  of    234. Loss: 0.725527286529541.   Elapsed: 0:01:01.
  Batch   200  of    234. Loss: 1.2108570337295532.   Elapsed: 0:02:01.

  Average training loss: 1.96
  Training epoch took: 0:02:22

Running Validation...
  Validation Loss: 2.79
  Validation took: 0:00:03

Training complete!
Total training took 0:07:14 (h:mm:ss)


In [19]:
# Display floats with two decimal places.
# pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.836028,2.702509,0:02:23,0:00:03
2,2.356757,2.718431,0:02:22,0:00:03
3,1.959563,2.790252,0:02:22,0:00:03


In [20]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
7.908 GB of memory reserved.


In [21]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [22]:
import torch
from optimum.bettertransformer import BetterTransformer

configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False,torch_dtype=torch.float16)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))
model = BetterTransformer.transform(model)

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [23]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )



In [24]:
# some parameters I cooked up that work reasonably well

epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [25]:
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

In [26]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [28]:
import wandb
wandb.init(project="gpt2_wiki",name="Flash_optimum(seq_512_batch_4)" )
total_t0 = time.time()

training_stats = []

model = model.to(device)
with torch.backends.cuda.sdp_kernel(
                    enable_flash=True, enable_math=False, enable_mem_efficient=True
                ):
  for epoch_i in range(0, epochs):

      # ========================================
      #               Training
      # ========================================

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')

      t0 = time.time()

      total_train_loss = 0

      model.train()

      for step, batch in enumerate(train_dataloader):

          b_input_ids = batch['input_ids'].to(device)
          b_labels = batch['labels'].to(device)
          b_masks = batch['attention_mask'].to(device)

          model.zero_grad()

          outputs = model( b_input_ids,
                            labels=b_labels,
                            attention_mask = b_masks,
                            token_type_ids=None
                          )

          loss = outputs[0]

          batch_loss = loss.item()
          total_train_loss += batch_loss

          # Get sample every x batches.
          if step % sample_every == 0 and not step == 0:

              elapsed = format_time(time.time() - t0)
              print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))






          loss.backward()

          optimizer.step()

          scheduler.step()
          current_lr = scheduler.get_last_lr()[0]
          wandb.log({"Learning Rate": current_lr})

      # Calculate the average loss over all of the batches.
      avg_train_loss = total_train_loss / len(train_dataloader)

      # Measure how long this epoch took.
      training_time = format_time(time.time() - t0)

      print("")
      print("  Average training loss: {0:.2f}".format(avg_train_loss))
      print("  Training epoch took: {:}".format(training_time))

      # ========================================
      #               Validation
      # ========================================

      print("")
      print("Running Validation...")

      t0 = time.time()

      model.eval()

      total_eval_loss = 0
      nb_eval_steps = 0

      # Evaluate data for one epoch
      for batch in val_dataloader:

          b_input_ids = batch['input_ids'].to(device)
          b_labels = batch['labels'].to(device)
          b_masks = batch['attention_mask'].to(device)

          with torch.no_grad():

              outputs  = model(b_input_ids,
  #                            token_type_ids=None,
                              attention_mask = b_masks,
                              labels=b_labels)

              loss = outputs[0]

          batch_loss = loss.item()
          total_eval_loss += batch_loss

      avg_val_loss = total_eval_loss / len(val_dataloader)

      validation_time = format_time(time.time() - t0)

      print("  Validation Loss: {0:.2f}".format(avg_val_loss))
      print("  Validation took: {:}".format(validation_time))

      # Record all statistics from this epoch.
      training_stats.append(
          {
              'epoch': epoch_i + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )
      wandb.log({"epoch": epoch_i + 1, "Training Loss": avg_train_loss, "Validation Loss": avg_val_loss})

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))


Training...
  Batch   100  of    234. Loss: 2.253352642059326.   Elapsed: 0:00:57.
  Batch   200  of    234. Loss: 3.1808199882507324.   Elapsed: 0:01:52.

  Average training loss: 2.45
  Training epoch took: 0:02:11

Running Validation...
  Validation Loss: 3.08
  Validation took: 0:00:02

Training...
  Batch   100  of    234. Loss: 1.8277353048324585.   Elapsed: 0:00:56.
  Batch   200  of    234. Loss: 1.8914355039596558.   Elapsed: 0:01:52.

  Average training loss: 2.02
  Training epoch took: 0:02:11

Running Validation...
  Validation Loss: 3.09
  Validation took: 0:00:02

Training...
  Batch   100  of    234. Loss: 2.281769275665283.   Elapsed: 0:00:56.
  Batch   200  of    234. Loss: 2.4016902446746826.   Elapsed: 0:01:52.

  Average training loss: 1.77
  Training epoch took: 0:02:11

Running Validation...
  Validation Loss: 3.14
  Validation took: 0:00:02

Training complete!
Total training took 0:06:40 (h:mm:ss)


In [29]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
7.908 GB of memory reserved.


In [30]:
# Display floats with two decimal places.
# pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.446443,3.079093,0:02:11,0:00:02
2,2.021835,3.093033,0:02:11,0:00:02
3,1.766879,3.135537,0:02:11,0:00:02
