<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/LLM_GPT2_LM_Finetune_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip -q install labml labml_helpers labml_nn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/266.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m443.9/443.9 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fairscale (pyproject.toml) ... [?25l[?25hdone


In [15]:
pip -q install --upgrade labml

Collecting labml
  Using cached labml-0.5.3-py3-none-any.whl.metadata (7.1 kB)
Using cached labml-0.5.3-py3-none-any.whl (94 kB)
Installing collected packages: labml
  Attempting uninstall: labml
    Found existing installation: labml 0.4.168
    Uninstalling labml-0.4.168:
      Successfully uninstalled labml-0.4.168
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
labml-nn 0.4.137 requires labml==0.4.168, but you have labml 0.5.3 which is incompatible.[0m[31m
[0mSuccessfully installed labml-0.5.3


In [1]:
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForCausalLM

from labml import lab, monit, tracker
from labml.configs import BaseConfigs, option
from labml.utils.download import download_file
from labml_helpers.device import DeviceConfigs
from labml_nn.lora.gpt2 import GPTModel

In [2]:

class Trainer(BaseConfigs):
    device: torch.device = DeviceConfigs()
    layer_norm_epsilon: float = 1e-05
    d_model: int = 768
    n_layers: int = 6
    n_heads: int = 6
    n_positions: int = 512
    vocab_size: int = 256
    epochs: int = 4
    batch_size: int = 1
    learning_rate: float = 1e-4
    context_len: int = 128
    lora_r: int = 2
    text: TensorDataset = "tiny_shakespeare"

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model: GPTModel
    optimizer: torch.optim.Adam

    loss_func = torch.nn.CrossEntropyLoss()
    data_loader: DataLoader
    def _load_pretrained_weights(self):
        hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
        state_dict = hf_model.state_dict()
        mapping = {
            'transformer.wte.weight': 'token_embedding.weight',
            'transformer.wpe.weight': 'position_embedding.weight',
            'transformer.ln_f.weight': 'final_norm.weight',
            'transformer.ln_f.bias': 'final_norm.bias',
            'lm_head.weight': 'lm_head.weight'
        }

        for i in range(12):
                mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.attn_norm.weight'
                mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.attn_norm.bias'
                mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.qkv_projection.weight'
                mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.qkv_projection.bias'
                mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.output_projection.weight'
                mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.output_projection.bias'
                mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.ffn_norm.weight'
                mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.ffn_norm.bias'
                mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.linear_in.weight'
                mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.linear_in.bias'
                mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.linear_out.weight'
                mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.linear_out.bias'

        new_state_dict = {}
        for old_key, new_key in mapping.items():
            if old_key in state_dict:
                new_state_dict[new_key] = state_dict[old_key]

        convo_layers = ([f'blocks.{i}.ffn.linear_in.weight' for i in range(12)] +
                        [f'blocks.{i}.ffn.linear_out.weight' for i in range(12)] +
                        [f'blocks.{i}.attn.qkv_projection.weight' for i in range(12)] +
                        [f'blocks.{i}.attn.output_projection.weight' for i in range(12)])

        for layer in convo_layers:
            new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)

        missing_keys, unexpected_keys = self.model.load_state_dict(new_state_dict, strict=False)
        assert all('lora' in key for key in missing_keys)
        assert not unexpected_keys

    def initialize(self):
        self.model = GPTModel(
            layer_norm_epsilon=self.layer_norm_epsilon,
            d_model=self.d_model,
            n_layers=self.n_layers,
            n_heads=self.n_heads,
            n_positions=self.n_positions,
            vocab_size=self.vocab_size,
            r=self.lora_r,
        )
        self.model.to(self.device)
        self._load_pretrained_weights()
        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
        self.data_loader = DataLoader(self.text, batch_size=self.batch_size, shuffle=True)

        def run(self):
            for _ in monit.loop(self.epochs):
                for (inputs,) in monit.iterate('Train', self.data_loader):
                    inputs = inputs.to(self.device)
                    logits = self.model(inputs[:, :-1])
                    loss = self.loss_func(logits.reshape(-1, logits.shape[-1]), inputs[:, 1:].reshape(-1))
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    tracker.save({'loss': loss})
                    tracker.add_global_step()
                tracker.new_line()

@option(Trainer.text)
def tiny_shakespeare(c: Trainer):
    path = lab.get_data_path() / 'tiny_shakespeare.txt'
    if not path.exists():
        download_file("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", path)
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    tokens = c.tokenizer.encode(text)
    num_batches = len(tokens) // (c.batch_size * c.context_len)
    tokens = tokens[:num_batches * c.batch_size * c.context_len]
    input_ids = torch.tensor(tokens).view(-1, c.context_len)
    return TensorDataset(input_ids)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from labml_nn.lora.experiment import Trainer
from labml import experiment
experiment.create(name="lora_gpt2")
trainer = Trainer()
experiment.configs(trainer)
trainer.initialize()
with experiment.start():
    trainer.run()

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.06 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1.52 GiB is free. Process 117420 has 13.22 GiB memory in use. Of the allocated memory 12.96 GiB is allocated by PyTorch, and 144.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)