In [None]:
!pip install datasets transformers
!pip install accelerate
!pip install optimum
!pip install datasets==2.15.0
!pip install wandb

In [1]:
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForCausalLM

from transformers import Trainer, TrainingArguments


In [2]:
from datasets import load_dataset
datasets = load_dataset("wikimedia/wikisource", "20231201.en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

In [3]:
datasets = datasets['train'].train_test_split(test_size=0.005)
data_train = datasets['test']


In [4]:
data_train = data_train.train_test_split(test_size=0.2)


In [5]:
model_checkpoint = "gpt2"

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [8]:
tokenized_datasets = data_train.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text",'id', 'url', 'title'])

Map (num_proc=4):   0%|          | 0/833 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2126 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3794 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (6219 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4282 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/209 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1095 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (7247 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9910 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (8588 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 833
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 209
    })
})

In [10]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [11]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=16,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/833 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/209 [00:00<?, ? examples/s]

In [12]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [13]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-wikitext2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # push_to_hub=True,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 16,
    report_to = 'none',
    logging_steps=200,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
)

In [48]:
import wandb
wandb.init(project="gpt2_wiki_v1",name="Normal" )


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
train_status= trainer.train()

In [51]:
train_status

TrainOutput(global_step=1446, training_loss=3.5840048044713866, metrics={'train_runtime': 1798.4624, 'train_samples_per_second': 25.722, 'train_steps_per_second': 0.804, 'total_flos': 3021842350080000.0, 'train_loss': 3.5840048044713866, 'epoch': 3.0})

In [52]:
normal_model = model

In [12]:
from transformers import AutoModelForCausalLM
from optimum.bettertransformer import BetterTransformer
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
model = BetterTransformer.transform(model)

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [13]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-wikitext2_flash",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # push_to_hub=True,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 16,
    report_to = 'none',
    logging_steps=200,
    save_strategy ='no'
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
)

In [56]:
import wandb
wandb.init(project="gpt2_wiki_v1",name="Flash" )


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [57]:
with torch.backends.cuda.sdp_kernel(
                        enable_flash=True, enable_math=False, enable_mem_efficient=True
                    ):
  train_status_v2=trainer.train()


Epoch,Training Loss,Validation Loss
1,3.6492,3.638063
2,3.5578,3.632343
3,3.51,3.631246


In [58]:
train_status_v2

TrainOutput(global_step=1446, training_loss=3.5840605739735962, metrics={'train_runtime': 1732.8471, 'train_samples_per_second': 26.696, 'train_steps_per_second': 0.834, 'total_flos': 3021842350080000.0, 'train_loss': 3.5840605739735962, 'epoch': 3.0})

In [59]:
wandb.finish()

VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▂▁
eval/runtime,▁▁█
eval/samples_per_second,██▁
eval/steps_per_second,██▁
train/epoch,▁▂▃▃▄▅▅▇███
train/global_step,▁▂▃▃▄▅▅▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▅▃▃▂▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,3.63125
eval/runtime,48.0596
eval/samples_per_second,92.656
eval/steps_per_second,5.805
train/epoch,3.0
train/global_step,1446.0
train/learning_rate,0.0
train/loss,3.51
train/total_flos,3021842350080000.0
train/train_loss,3.58406


Memory
Normal 9.6GB vs BetterTransformer 8.6GB