In [None]:
!pip install datasets transformers
!pip install accelerate
!pip install optimum
!pip install datasets==2.15.0
!pip install wandb

In [None]:
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup



In [None]:
from datasets import load_dataset

# Fetch the "20231201.en" configuration of the wikimedia/wikisource dataset from the Hub.
# NOTE(review): the variable `datasets` has the same name as the `datasets` package;
# later cells index it by split name, so the name is kept as-is here.
datasets = load_dataset(path="wikimedia/wikisource", name="20231201.en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

In [None]:
# Work on a 10% slice of the corpus: the "test" side of a 90/10 split becomes the
# working dataset (presumably to keep training time manageable - confirm).
# A fixed seed makes the selected slice the same on every run; without it the
# subset, and therefore every reported loss, changes between runs.
datasets = datasets['train'].train_test_split(test_size=0.1, seed=42)
data_train = datasets['test']


In [None]:
# Hub identifier of the pretrained checkpoint; the tokenizer below is loaded from it.
model_checkpoint = "gpt2"

In [None]:
from transformers import AutoTokenizer

# Load the fast tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# Re-use the end-of-sequence token as the padding token, so padded positions hold the
# EOS id. No new token is added to the vocabulary by this assignment.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples to a fixed length of 512 tokens.

    Args:
        examples: batch mapping that contains a "text" column.

    Returns:
        The tokenizer output for the batch; longer texts are truncated and
        shorter ones are padded up to 512 tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


# Tokenize in batches with two worker processes and drop the raw columns, leaving
# only the tokenizer outputs.
tokenized_datasets = data_train.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=["text", 'id', 'url', 'title'],
)

Map (num_proc=2):   0%|          | 0/20828 [00:00<?, ? examples/s]

In [None]:
# Split the tokenized subset 90/10, then halve the held-out 10% into validation and
# test. Fixed seeds keep the three splits identical across runs.
# NOTE(review): `tokenized_datasets` is rebound from a single dataset to the split
# result, so this cell should only be run once per tokenization run.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = tokenized_datasets['train']

eval_test = tokenized_datasets['test'].train_test_split(test_size=0.5, seed=42)
val_dataset = eval_test['train']
test_dataset = eval_test['test']

In [None]:
# Position of the single example to drop from the training split.
# NOTE(review): presumably done so the split size is a multiple of the batch size
# used by the data loaders - confirm.
index_to_remove = 0

# Keep every example whose position differs from `index_to_remove`.
train_dataset = train_dataset.filter(
    lambda _example, position: position != index_to_remove,
    with_indices=True,
)

# Show the resulting dataset summary.
print(train_dataset)


Filter:   0%|          | 0/18745 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 18744
})


In [None]:
# Position of the single example to drop from the validation split.
# NOTE(review): presumably done so the split size is a multiple of the batch size
# used by the data loaders - confirm.
index_to_remove = 0

# Keep every validation example whose position differs from `index_to_remove`.
val_dataset = val_dataset.filter(lambda example, idx: idx != index_to_remove, with_indices=True)

# Show the dataset that was just filtered (this previously printed `train_dataset`,
# which hid the result of the filter above).
print(val_dataset)


Filter:   0%|          | 0/1041 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 18744
})


In [None]:
def collate_fn(batch):
    """Collate tokenized examples into one right-padded causal-LM batch.

    Args:
        batch: list of examples, each a mapping with "input_ids" and
            "attention_mask" sequences of equal length within the example.
            Examples may differ in length from each other.

    Returns:
        dict with three integer tensors of shape
        (len(batch), longest sequence in the batch):
          - "input_ids": token ids, right-padded with 0 where needed;
          - "attention_mask": 1 for real tokens, 0 for padding;
          - "labels": a copy of "input_ids" with every padding position set to
            -100 so that the loss ignores it.
    """
    # Build one tensor per example so that examples of different length can be padded.
    # (Calling pad_sequence on an already rectangular tensor, as before, was a no-op.)
    ids_per_item = [torch.as_tensor(item["input_ids"]) for item in batch]
    mask_per_item = [torch.as_tensor(item["attention_mask"]) for item in batch]

    pad_sequence = torch.nn.utils.rnn.pad_sequence
    input_ids = pad_sequence(ids_per_item, batch_first=True, padding_value=0)
    attention_masks = pad_sequence(mask_per_item, batch_first=True, padding_value=0)

    # The padding token is the EOS token, so padding cannot be recognised by token id;
    # the attention mask is the reliable marker. Without this masking the model is
    # trained (and evaluated) on predicting padding.
    labels = input_ids.clone()
    labels[attention_masks == 0] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
    }

In [None]:
# Batch each split four examples at a time with the shared collate function.
# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    batch_size=4,
    shuffle=True,
)
val_dataloader = DataLoader(
    val_dataset,
    collate_fn=collate_fn,
    batch_size=4,
    shuffle=False,
)
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=collate_fn,
    batch_size=4,
    shuffle=False,
)


In [None]:
# Sanity check: print the first collated training batch only, then stop iterating.
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[ 1212,  2223,   373,  ..., 50256, 50256, 50256],
        [49580,    12,    66,  ..., 50256, 50256, 50256],
        [ 2202,  2805,  1987,  ...,    13, 14021,    11],
        [   44,  4146, 15543,  ...,   262, 16629,   290]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ 1212,  2223,   373,  ..., 50256, 50256, 50256],
        [49580,    12,    66,  ..., 50256, 50256, 50256],
        [ 2202,  2805,  1987,  ...,    13, 14021,    11],
        [   44,  4146, 15543,  ...,   262, 16629,   290]])}


In [None]:
# Load the stock GPT-2 configuration; nothing is customised apart from the two
# keyword arguments below.
# NOTE(review): `torch_dtype=torch.float16` is only stored on the config here; whether
# the weights are actually loaded in half precision depends on the installed
# transformers version - confirm.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False,torch_dtype=torch.float16)

# Instantiate the pretrained language model.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# Keep the embedding matrix in sync with the tokenizer vocabulary. The tokenizer only
# re-uses EOS as its padding token (no tokens were added), so this is expected to
# leave the embedding size unchanged.
model.resize_token_embeddings(len(tokenizer))

# Run on the GPU when one is available; fall back to the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Seed every random number generator in use to make the run reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Training hyperparameters.
epochs = 3
learning_rate = 5e-4
# Number of warm-up steps for the learning-rate schedule (a step count, so an integer).
warmup_steps = 100
epsilon = 1e-8

# Print a progress line every `sample_every` training batches.
sample_every = 500

In [None]:
# Use PyTorch's AdamW; the AdamW class shipped with transformers is deprecated.
# weight_decay=0.0 reproduces the default of the transformers implementation that was
# used before (PyTorch's own default is 0.01), so the optimisation is effectively unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = learning_rate,
                              eps = epsilon,
                              weight_decay = 0.0
                             )



In [None]:
# The optimizer steps once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps - not the number of training samples.
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule: linear warm-up over `warmup_steps`, then a linear schedule
# over the remaining steps up to `total_steps`.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: duration in seconds (int or float); rounded to whole seconds.

    Returns:
        The string form of the corresponding ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import wandb

# Start the Weights & Biases run that tracks this training configuration.
wandb.init(project="gpt2_wiki",name="Normal(seq_512_batch_4)" )
total_t0 = time.time()

# One dict of summary statistics per epoch.
training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch['input_ids'].to(device)
        b_labels = batch['labels'].to(device)
        b_masks = batch['attention_mask'].to(device)

        # Clear the gradients accumulated by the previous step.
        model.zero_grad()

        # Same call signature as in the validation pass below.
        outputs = model(b_input_ids,
                        attention_mask=b_masks,
                        labels=b_labels)

        # The first output element is used as the loss (labels were supplied).
        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Periodic progress line (no text is sampled here); step 0 is skipped.
        if step % sample_every == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

        loss.backward()
        optimizer.step()

        # Advance the schedule once per optimizer step and log the resulting rate.
        scheduler.step()
        current_lr = scheduler.get_last_lr()[0]
        wandb.log({"Learning Rate": current_lr})

    # Mean of the per-batch losses over the epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in val_dataloader:

            b_input_ids = batch['input_ids'].to(device)
            b_labels = batch['labels'].to(device)
            b_masks = batch['attention_mask'].to(device)

            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

            batch_loss = loss.item()
            total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(val_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    wandb.log({"epoch": epoch_i + 1, "Training Loss": avg_train_loss, "Validation Loss": avg_val_loss})
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc



Training...
  Batch   500  of  4,686. Loss: 2.7321925163269043.   Elapsed: 0:05:11.
  Batch 1,000  of  4,686. Loss: 2.596139907836914.   Elapsed: 0:10:26.
  Batch 1,500  of  4,686. Loss: 3.163100004196167.   Elapsed: 0:15:40.
  Batch 2,000  of  4,686. Loss: 3.2759792804718018.   Elapsed: 0:20:55.
  Batch 2,500  of  4,686. Loss: 1.9101479053497314.   Elapsed: 0:26:09.
  Batch 3,000  of  4,686. Loss: 2.1116349697113037.   Elapsed: 0:31:24.
  Batch 3,500  of  4,686. Loss: 2.5065014362335205.   Elapsed: 0:36:38.
  Batch 4,000  of  4,686. Loss: 2.8094160556793213.   Elapsed: 0:41:53.
  Batch 4,500  of  4,686. Loss: 3.41867733001709.   Elapsed: 0:47:07.

  Average training loss: 2.65
  Training epoch took: 0:49:04

Running Validation...
  Validation Loss: 2.55
  Validation took: 0:00:54

Training...
  Batch   500  of  4,686. Loss: 1.8631473779678345.   Elapsed: 0:05:15.
  Batch 1,000  of  4,686. Loss: 2.8346073627471924.   Elapsed: 0:10:30.
  Batch 1,500  of  4,686. Loss: 2.940920829772949.

In [None]:
# Tabulate the per-epoch statistics, one row per epoch, indexed by the epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.65286,2.553308,0:49:04,0:00:54
2,2.328816,2.471993,0:49:08,0:00:54
3,2.048848,2.460908,0:49:08,0:00:55


In [None]:
import gc
import torch

# Run a garbage-collection pass, then release PyTorch's cached, currently unused
# CUDA memory.
# NOTE(review): objects that are still referenced at this point (for example the
# model and optimizer from the run above) are not freed by either call.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
from optimum.bettertransformer import BetterTransformer

# Load the stock GPT-2 configuration; nothing is customised apart from the two
# keyword arguments below.
# NOTE(review): `torch_dtype=torch.float16` is only stored on the config here; whether
# the weights are actually loaded in half precision depends on the installed
# transformers version - confirm.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False,torch_dtype=torch.float16)

# Instantiate a fresh copy of the pretrained language model for the second run.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# Keep the embedding matrix in sync with the tokenizer vocabulary. The tokenizer only
# re-uses EOS as its padding token (no tokens were added), so this is expected to
# leave the embedding size unchanged.
model.resize_token_embeddings(len(tokenizer))
# NOTE(review): the BetterTransformer conversion is disabled, so the imported class
# is currently unused - confirm this is intended for this run.
# model = BetterTransformer.transform(model)

# Run on the GPU when one is available; fall back to the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Seed every random number generator in use to make the run reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


In [None]:
# Training hyperparameters for the second run.
epochs = 3
learning_rate = 5e-4
# Number of warm-up steps for the learning-rate schedule (a step count, so an integer).
warmup_steps = 100
epsilon = 1e-8

# Print a progress line every `sample_every` training batches.
sample_every = 500

In [None]:
# Use PyTorch's AdamW; the AdamW class shipped with transformers is deprecated.
# weight_decay=0.0 reproduces the default of the transformers implementation that was
# used before (PyTorch's own default is 0.01), so the optimisation is effectively unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = learning_rate,
                              eps = epsilon,
                              weight_decay = 0.0
                             )



In [None]:
# The optimizer steps once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps - not the number of training samples.
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule: linear warm-up over `warmup_steps`, then a linear schedule
# over the remaining steps up to `total_steps`.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: duration in seconds (int or float); rounded to whole seconds.

    Returns:
        The string form of the corresponding ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import wandb

# Start a separately named Weights & Biases run in the same project for the second
# training configuration.
wandb.init(project="gpt2_wiki",name="Flash_optimum(seq_512_batch_4)_v2" )


[34m[1mwandb[0m: Currently logged in as: [33mpepoo20[0m ([33mhtx_ai_101[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import wandb

# The W&B run for this configuration is started in the preceding cell.
total_t0 = time.time()

# One dict of summary statistics per epoch.
training_stats = []

model = model.to(device)

# Restrict scaled-dot-product attention to the flash and memory-efficient kernels
# (the math fallback is disabled) for the whole of training and validation.
# NOTE(review): this only has an effect if the model's attention is routed through
# torch's scaled_dot_product_attention - confirm for the installed versions.
with torch.backends.cuda.sdp_kernel(
    enable_flash=True, enable_math=False, enable_mem_efficient=True
):
    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()

        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):

            b_input_ids = batch['input_ids'].to(device)
            b_labels = batch['labels'].to(device)
            b_masks = batch['attention_mask'].to(device)

            # Clear the gradients accumulated by the previous step.
            model.zero_grad()

            # Same call signature as in the validation pass below.
            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            # The first output element is used as the loss (labels were supplied).
            loss = outputs[0]

            batch_loss = loss.item()
            total_train_loss += batch_loss

            # Periodic progress line (no text is sampled here); step 0 is skipped.
            if step % sample_every == 0 and step != 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            loss.backward()
            optimizer.step()

            # Advance the schedule once per optimizer step and log the resulting rate.
            scheduler.step()
            current_lr = scheduler.get_last_lr()[0]
            wandb.log({"Learning Rate": current_lr})

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        t0 = time.time()

        model.eval()

        total_eval_loss = 0

        # Evaluate the validation split once; no gradients are needed.
        with torch.no_grad():
            for batch in val_dataloader:

                b_input_ids = batch['input_ids'].to(device)
                b_labels = batch['labels'].to(device)
                b_masks = batch['attention_mask'].to(device)

                outputs = model(b_input_ids,
                                attention_mask=b_masks,
                                labels=b_labels)

                loss = outputs[0]

                batch_loss = loss.item()
                total_eval_loss += batch_loss

        avg_val_loss = total_eval_loss / len(val_dataloader)

        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )
        wandb.log({"epoch": epoch_i + 1, "Training Loss": avg_train_loss, "Validation Loss": avg_val_loss})

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# NOTE(review): bare literal with no effect on later cells; looks like scratch residue - confirm it can be deleted.
6.2

In [None]:
# Tabulate the per-epoch statistics of the second run, indexed by the epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.644277,3.072886,0:45:36,0:00:49
2,2.325789,3.023307,0:45:35,0:00:48
3,2.041798,2.813372,0:45:35,0:00:49


In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub before uploading; called without arguments,
# so no token is hard-coded in the notebook.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the current `model` to the Hub repository "bettermodel_gpt2_wiki".
# Local alternative (disabled): model.save_pretrained("fine_tuned_model")
# NOTE(review): only the model is pushed here; the tokenizer is not uploaded - confirm this is intended.
model.push_to_hub("bettermodel_gpt2_wiki")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pepoo20/bettermodel_gpt2_wiki/commit/9f9307386cdb338001f36d541873c3ce00a32146', commit_message='Upload model', commit_description='', oid='9f9307386cdb338001f36d541873c3ce00a32146', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Close the active Weights & Biases run so its logged data is uploaded.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Learning Rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
Training Loss,█▄▁
Validation Loss,█▇▁
epoch,▁▅█

0,1
Learning Rate,0.0
Training Loss,2.0418
Validation Loss,2.81337
epoch,3.0
