# Improving Language Understanding with Generative Pre-Training

Implementation of the original ['Improving Language Understanding with Generative Pre-Training'](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) paper in which GPT was introduced, from scratch.

## 1. Libraries

In [1]:
import os
import math
from typing import List, Dict, Generator, Union
from dataclasses import dataclass

# The dataset is taken from NLTK.
import nltk
import torch
import torchtext
import numpy as np
from torch import nn
from torch.optim import Adam
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchtext.vocab import build_vocab_from_iterator
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR


# Controlling the randomness in PyTorch and NumPy.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.backends.cudnn.benchmark = True
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

# Change this path based on your Google Drive.
PROJECT_PATH = "/content/drive/MyDrive/Notebooks/PyTorch/Papers/NLP/GPT"

## 2. Dataset
I am going to use a small dataset out of `nltk`'s *Project Gutenberg* texts.

In [2]:
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [3]:
corpus = nltk.corpus.gutenberg.words("melville-moby_dick.txt")

## 2.1. Vocabulary

Here I'll create a Vocab. All its keys will be words, since the Tokenizer will be Word Tokenizer, not a Subword Tokenizer as it was in the paper.

In [4]:
def iterate_corpus(tokens: List[str], seq_size: int):
    """Iterate through the corpus and yielding a batch of sequences.

    Args:
        tokens (List[str]): Word tokens.
        seq_size (int): Length of a sequence batch.

    Yields:
        Generator[List[str]]: Generated batch.
    """
    for i in range(0, len(tokens) - seq_size, seq_size):
        yield tokens[i:i + seq_size]


vocab = build_vocab_from_iterator(
    iterator=iterate_corpus(corpus, seq_size=1000),
    # Only one special token.
    specials=["[UNK]"]
)
vocab.set_default_index(vocab["[UNK]"])
print(f"Vocab size: {len(vocab)}")

torch.save(vocab, os.path.join(PROJECT_PATH, "vocab.pt"))

Vocab size: 19279


## 2.2. Tokenizer

In [5]:
class WordTokenizer:
    """Word tokenizer which uses a previously defined vocabulary.

    Args:
        vocab (torchtext.vocab.Vocab): PyTorch Vocab.
    """

    def __init__(self, vocab: torchtext.vocab.Vocab):
        self.vocab = vocab
        self.itos = vocab.get_itos()

    def __call__(self, tokens: List[str]) -> List[int]:
        """Convert the strings in `tokens` to IDs.

        Args:
            tokens (List[str]): List of word tokens.

        Returns:
            List[int]: List of token IDs.
        """
        return [
            self.vocab[token]
            for token in tokens
        ]

    def ids2tokens(self, ids: List[int]) -> List[str]:
        """Convert IDs to word tokens.

        Args:
            tokens (List[int]): List of token IDs.

        Returns:
            List[str]: List of word tokens.
        """
        return [
            self.itos[i]
            for i in ids
        ]

    def tokens2ids(self, tokens: List[str]) -> List[int]:
        """Convert the strings in `tokens` to IDs.

        Args:
            tokens (List[str]): List of word tokens.

        Returns:
            List[int]: List of token IDs.
        """
        return [
            self.vocab[token]
            for token in tokens
        ]


tokenizer = WordTokenizer(vocab=vocab)
token_ids = tokenizer(["what", "is", "the", "meaning", "of", "life"])
print("Token IDs:", token_ids)
print("Word tokens:", tokenizer.ids2tokens(token_ids))

Token IDs: [72, 17, 2, 1290, 4, 182]
Word tokens: ['what', 'is', 'the', 'meaning', 'of', 'life']


### 2.3. PyTorch Dataset
This is an Autoregressive dataset. Basically, the output is the input $t + 1$, where $t$ is the timestep. See the example below:

$Input_{t}$: `['I', 'want', 'to', 'be', 'a', 'responsible']`

$Output_{t}$: `['want', 'to', 'be', 'a', 'responsible', 'person']`

As you can see, the input and output are of the same length, the only difference is that the output is shifted to the right with one word.

Usually, for text generation, the only important token out of the output sequence is the last one (in our case `'person'`). It is appended to the input and then, iteratively, text generation is being made.

$Input_{t}$ = `['I', 'want', 'to', 'be', 'a', 'responsible']`

$Input_{t+1}$ = `['I', 'want', 'to', 'be', 'a', 'responsible', 'person']`

In [6]:
class AutoregressiveDataset(Dataset):
    """Autoregressive textual dataset.
    
    Args:
        corpus (List[str]): List of words.
        tokenizer (WordTokenizer): Word tokenizer, responsible for the bidirectional convertion
        of tokens to ids and vice versa.
        seq_len (int): Length of each sequence that's going to be passed to the model.
    """

    def __init__(self, corpus: List[str], tokenizer: WordTokenizer, seq_len: int):
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        # We are subtracting `self.seq_len` and 1 because otherwise
        # the final sample of the dataset would have input and output of
        # different lengths.
        return len(self.corpus) - self.seq_len - 1

    def __getitem__(self, idx):
        # Sequence of size `self.seq_len`.
        input_ids = self.tokenizer(self.corpus[idx:idx + self.seq_len])
        # Since the model is autoregressive, the expected output is just
        # one step ahead of the input, i.e. we only predict the next token.
        labels = self.tokenizer(self.corpus[idx + 1:idx + self.seq_len + 1])

        return {
            "input_ids": torch.LongTensor(input_ids),
            "labels": torch.LongTensor(labels)
        }


dataset = AutoregressiveDataset(corpus=corpus, tokenizer=tokenizer, seq_len=10)

sample_id = 26_000
print(f"Dataset size: {len(dataset)}")
print(f"Dataset sample format:\n{dataset[sample_id]}")
print(f"\nInput sequence:\n\t{tokenizer.ids2tokens(dataset[sample_id]['input_ids'])}")
print(f"Prediction sequence:\n\t{tokenizer.ids2tokens(dataset[sample_id]['labels'])}")
print("Here the predicted word is 'waters' with ID '369'.")

Dataset size: 260808
Dataset sample format:
{'input_ids': tensor([3282,    7,   32,   94, 3620,    7, 4197,  278,   60,    2]), 'labels': tensor([   7,   32,   94, 3620,    7, 4197,  278,   60,    2,  369])}

Input sequence:
	['Woe', 'to', 'him', 'who', 'seeks', 'to', 'pour', 'oil', 'upon', 'the']
Prediction sequence:
	['to', 'him', 'who', 'seeks', 'to', 'pour', 'oil', 'upon', 'the', 'waters']
Here the predicted word is 'waters' with ID '369'.


## 3. Model
Below is the implementation of the Causal Transformer architecture. It consists of only Decoder layers. They are stacked one after the other and then their output is projected to a certain dimension (denoted by the `out_size` parameter) using a Linear layer.

Scroll to the next code cell to take a look at the GPT configuration.

In [7]:
class CausalTransformer(nn.Module):
    """Causal Transformer is a Transformer that uses only the Decoder layer of the initial 'Attention Is All You Need' architecture.
    These types of models are usually used for Autoregressive/Causal objective.

    Args:
        d_model (int): Dimensional states.
        nheads (int): Number of attention heads.
        num_layers (int): Number of decoder layers.
        vocab_size (int): Vocabulary size, used for the Embedding layer.
        out_size (int): Output size, sinze the final Linear layer projects the Transformer output to a certain `output_size` dimension.
        padding_key (int): Padding key; Not mandatory for GPT but still this class can be used with padding as well.
    """

    def __init__(
        self, 
        d_model: int, 
        nheads: int,
        num_layers: int,
        vocab_size: int, 
        out_size: int, 
        seq_len: int,
        padding_key: int,
    ):
        super().__init__()

        self.padding_key = padding_key
        self.seq_len = seq_len

        self.decoder = TransformerDecoder(
            d_model,
            nheads,
            num_layers
        )

        # Embedding layer
        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model
        )

        # Positional encoding layer
        self.pos_enc = nn.Embedding(
            num_embeddings=seq_len,
            embedding_dim=d_model
        )

        # Dropout layer
        self.drop = nn.Dropout()

        self.proj = nn.Linear(d_model, out_size)

        # Initializing the weights of all layers.
        self.apply(self._init_weights)

    def forward(self, x: torch.LongTensor, y: torch.LongTensor = None) -> Union[float, torch.Tensor]:
        """Make an autoregressive prediction.

        Args:
            x (torch.LongTensor): Sequence of input indices.
            y (torch.LongTensor, optional): Sequence of target indices. Defaults to None. If `y` is passed, 
            the loss is directly calculated and returned. Otherwise, the logits are returned.
        """
        _, x_seq_len = x.shape

        # Generate a tensor of element positions based on sequence length.
        assert x_seq_len <= self.seq_len, f"Sequence length dimension of `x` should be greater than {self.seq_len}"
        pos = torch.arange(0, x_seq_len, dtype=torch.long, device=x.device).unsqueeze(0)

        # Create padding mask
        pad_mask = (x == self.padding_key)

        # Embedding and Positional Embedding
        embed_x = self.embed(x)
        pos_enc_x = self.pos_enc(pos)
        x = embed_x + pos_enc_x
        # Shape: (batch_size, seq_len, d_model)

        # Dropout after the initial encodings
        x = self.drop(x)

        x = self.decoder(x, pad_mask)
        # Output projection
        logits = self.proj(x)

        # When there is a passed target tensor, we calculate the loss here.
        if y is not None:
            loss = F.cross_entropy(
                logits.reshape(-1, logits.shape[-1]), 
                y.reshape(-1)
            )

            return loss

        return logits

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)


class TransformerDecoder(nn.Module):

    def __init__(
        self, 
        d_model: int, 
        nheads: int, 
        num_layers: int,
    ):
        super().__init__()

        self.decoder_layers = nn.ModuleList([
            TransformerDecoderLayer(d_model, nheads)
            for _ in range(num_layers)
        ])

    def forward(self, x, padding_mask):
        # Passing the input tensor through the Decoder layers
        for decoder in self.decoder_layers:
            x = decoder(x, padding_mask)

        return x


class TransformerDecoderLayer(nn.Module):
    """Transformer Decoder Layer, greatly inspired by these two papers:
    - [GPT Paper](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf)
    - [T-DMCA](https://arxiv.org/pdf/1801.10198.pdf)
    It has only one input and this input should consist of delimiters and 
    special tokens.
    """
    
    def __init__(
        self,
        d_model,
        nheads,
        # The value 0.1 comes from the original GPT paper.
        dropout=0.1
    ):
        super().__init__()

        assert d_model % nheads == 0, "'d_model' should be divisible by 'n_heads'!"

        # Parameter that will only be used for accessing the model device.
        # That is of need, because of the masks and the way they are passed
        # through the model!
        self.dummy_param = nn.Parameter(torch.empty(0))

        # Linear layers that consist of the weights of 
        # the queries, keys and values
        # The shapes are coming from the 'Attention Is All You Need' paper.
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)

        # Multiheaded Self Attention
        self.attention = nn.MultiheadAttention(
            d_model, nheads, batch_first=True,
            dropout=dropout
        )

        # Layer Norm 1
        self.norm1 = nn.LayerNorm(d_model)

        # Position-wise Feed-forward network
        self.ff = nn.Linear(d_model, 4 * d_model)
        self.ff_proj = nn.Linear(4 * d_model, d_model)
        self.gelu = GELU()

        # Layer Norm 2
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor, padding_mask: torch.Tensor) -> torch.Tensor:
        _, seq_len, _ = x.shape
        # Those there variables are (batch_size, seq_len and embedding_size).

        # Create padding and attention/future masks
        padding_mask = padding_mask.to(self.dummy_param.device)
        future_mask = self.get_future_mask(seq_len).to(self.dummy_param.device)

        # Generate queries, keys and values
        queries = self.Wq(x)
        keys = self.Wk(x)
        values = self.Wv(x)
        # Shapes: (batch_size, seq_len, d_model)

        # Self Attention with masks
        # Here the second output is None - it is supposed to be the attention
        # weights, but we are not using them.
        attn_out, _ = self.attention(
            queries, keys, values,
            key_padding_mask=padding_mask, attn_mask=future_mask,
            need_weights=False
        )
        # Shape: (batch_size, seq_len, d_model)

        # Layer Normalization, Position-wise Feed-Forward, Layer Normalization with
        # residual connections
        norm1_attn_out = self.norm1(attn_out + x)

        attn_out = self.ff(norm1_attn_out)
        attn_out = self.ff_proj(attn_out)
        attn_out = self.gelu(attn_out)

        attn_out = self.norm2(attn_out + norm1_attn_out)
        # Shape: (batch_size, seq_len, d_model)
        
        return attn_out

    def get_future_mask(self, seq_len: int):
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).to(torch.bool)
        return mask


class GELU(nn.Module):
    """Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415"""

    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

The model configuration is taken from the paper, see the citations below:
> We trained a $12$-layer decoder-only transformer with masked self-attention heads ($768$ dimensional states and $12$
attention heads). For the position-wise feed-forward networks, we used $3072$ dimensional inner states.
We used the **Adam** optimization scheme [27] with a max learning rate of $2.5$e-4.

> We train for $100$ epochs on minibatches of $64$ randomly sampled,  contiguous sequences of 512 tokens.

> We also employed a modified version of L2 regularization proposed in [37], with w = $0.01$ on all non bias or
gain weights.

Note, the inner dimensional state ($3072$ in the paper) is defined as $4 \times d_{model}$ in this implementation.

In [8]:
@dataclass
class GptConfig:
    """
    Parameters taken from p. 5 of the original GPT paper:
    https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf
    """
    d_model: int = 768
    nheads: int = 12
    num_layers: int = 12
    # The vocab size in the paper is different. I will use a smaller vocab, based
    # on the current dataset.
    vocab_size: int = 19_279
    out_size: int = 19_279
    seq_len: int = 64
    # This token is not used by GPT.
    pad_token: int = -100


class Gpt(nn.Module):
    """Generative Pre-Trained Transformer (GPT). Initially mentioned in this [paper](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf).

    Args:
        config (GptConfig): Configuration defining the GPT model architecture. You can try with different parameters, the default ones are coming from the first GPT paper.
    """

    def __init__(self, config=GptConfig()):
        super().__init__()
        self.model = CausalTransformer(
            d_model=config.d_model,
            nheads=config.nheads,
            num_layers=config.num_layers,
            vocab_size=config.vocab_size,
            out_size=config.out_size,
            seq_len=config.seq_len,
            padding_key=config.pad_token
        )

    def forward(self, x: torch.LongTensor, y: torch.LongTensor = None):
        if y is not None:
            return self.model(x, y)
        
        return self.model(x)

    def generate(self, input_tokens: List[str], tokenizer: WordTokenizer, max_length: int = 10):
        """Generate a sequence autoregressively - by adding one token at a time.
        Args:
            
        """
        token_ids = tokenizer(input_tokens)
        
        self.model.eval()

        with torch.no_grad():
            for _ in range(max_length):
                # Getting the last predicted ID.
                next_token_id = self.model(
                    torch.LongTensor(token_ids).unsqueeze(0)
                ).argmax(-1)[0][-1]
                token_ids.append(next_token_id.item())

        self.model.train()

        return tokenizer.ids2tokens(token_ids)


config = GptConfig(seq_len=10)
gpt = Gpt(config)
gpt.eval()

with torch.no_grad():
    x = torch.randint(low=0, high=1000, size=(2, 10))
    y_pred = gpt(x)
    print("Input shape:", x.shape)
    print("Prediction shape:", y_pred.shape)
    print()

gpt.train()



Input shape: torch.Size([2, 10])
Prediction shape: torch.Size([2, 10, 19279])



Gpt(
  (model): CausalTransformer(
    (decoder): TransformerDecoder(
      (decoder_layers): ModuleList(
        (0-11): 12 x TransformerDecoderLayer(
          (Wq): Linear(in_features=768, out_features=768, bias=True)
          (Wk): Linear(in_features=768, out_features=768, bias=True)
          (Wv): Linear(in_features=768, out_features=768, bias=True)
          (attention): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (ff): Linear(in_features=768, out_features=3072, bias=True)
          (ff_proj): Linear(in_features=3072, out_features=768, bias=True)
          (gelu): GELU()
          (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (embed): Embedding(19279, 768)
    (pos_enc): Embedding(10, 768)
    (drop): Dropout(p=0.5, inplace=False)
    (proj): Linear(in_features=76

## 4. Training session
This is a pretraining/fine-tuning session definition. It takes `TrainingArguments` with `epochs=100`, `batch_size=64`, `linear_steps=2000`, `l_rate=2.5e-4`, `weight_decay=1e-2`.

The model is trained using two schedulers - linear and cosine:
> The learning rate
was increased linearly from zero over the first $2000$ updates and annealed to $0$ using a cosine schedule.

> We train for 100 epochs on minibatches of 64 randomly sampled, contiguous sequences of 512 tokens

Instead of $100$ epochs, I chose $10$, since my dataset is quite smaller. The minibatch size is the same - $64$. The sequence length is $32$ because I am limited on VRAM and also it took less time.



In [9]:
@dataclass
class TrainingArguments:
    model_path: str = os.path.join(PROJECT_PATH, "model")
    # Those are the original GPT training hyperparameters. Below, when I start the training
    # session, I'll change them, since my dataset and environment is different.
    epochs: int = 100
    batch_size: int = 64
    linear_steps: int = 2000
    l_rate: int = 2.5e-4
    weight_decay: float = 1e-2
    sequence_length: int = 512
    grad_clip: float = 1.0


class TrainingSession:
    """Training session for a GPT. It uses two learning rate schedulers - linear and cosine.
    Also, it can work with or without an evaluation dataset.
    """

    def __init__(self):
        self.loss_func = nn.CrossEntropyLoss()

    def start(
        self, 
        model: nn.Module,
        train_args: TrainingArguments, 
        train_dataset: Dataset, 
        eval_dataset: Dataset = None,
        device: str = None,
        saving_step: int = 500,
        use_schedulers: bool = True
    ):
        """Start a training session.

        Args:
            model (nn.Module): Model you're going to train.
            train_args (TrainingArguments): Model training arguments.
            train_dataset (Dataset): Training dataset.
            eval_dataset (Dataset, optional): Evaluation dataset. If no dataset is passed there will be no model evaluation. Defaults to None.
            device (str, optional): Device on which the model will be trained/evaluated. If no device is passed `TrainingSession` dynamically chooses a device (CUDA if available, else CPU). Defaults to None.
            saving_step (int, optional): Saving the model each `saving_step` of the training.
            use_schedulers (bool, optional): Whether to use the Linear and Cosine schedulers or not. Defaults to True.

        Returns:
            nn.Module: The latest version of the trained model.
        """
        self.model = model
        # If a device is not passed explicitly, it is assumed that when CUDA is
        # available, we'll use it.
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        # Moving the model to `self.device`.
        self.model.to(self.device)
        
        self.use_schedulers = use_schedulers
        self.train_args = train_args
        self._unpack_train_args(
            # `train_size` is needed for the LR Schedulers.
            train_size=len(train_dataset)
        )

        self.saving_step = saving_step
        
        # Defining the DataLoaders.
        train_dl = DataLoader(
            dataset=train_dataset,
            batch_size=self.batch_size,
            shuffle=True
        )
        eval_dl = None
        if eval_dataset:
            eval_dl = DataLoader(
                dataset=eval_dataset,
                batch_size=self.batch_size,
                shuffle=True
            )

        # Progressbar of the epochs.
        epochs_pb = tqdm(range(self.epochs))
        
        for epoch in epochs_pb:
            train_ppl = self._train_epoch(train_dl)
            if eval_dl:
                eval_ppl = self._eval_epoch(eval_dl)

            if eval_ppl:
                epochs_pb.set_description(f"Epoch {epoch + 1}/{self.epochs}, Training Perplexity: {train_ppl:.2f}, Evaluation Perplexity: {eval_ppl:.2f}")
            else:
                epochs_pb.set_description(f"Epoch {epoch + 1}/{self.epochs}, Training Loss: {train_ppl:.2f}")

        return self.model

    def save_model(self, path: str):
        torch.save(self.model.state_dict(), path)

    def _train_epoch(self, dataloader: DataLoader):
        progressbar = tqdm(dataloader)

        for step, batch in enumerate(progressbar, start=1):
            # Moving the Tensors to `self.device`.
            x = batch["input_ids"].to(self.device)
            y = batch["labels"].to(self.device)

            self.optimizer.zero_grad()

            loss = self.model(x, y)
            loss.backward()
            
            if self.use_schedulers:
                # Updating the learning rate linearly the first `self.linear_steps` steps.
                if step <= self.linear_steps:
                    self._lin_scheduler.step()

                # Updating the learning rate for the remaining steps using a cosine
                # scheduler.
                if step > self.linear_steps:
                    self._cos_scheduler.step()

            # Clipping the gradients based on the gradient norm.
            torch.nn.utils.clip_grad_norm_(
                self.model.parameters(), 
                self.grad_clip
            )
            self.optimizer.step()

            perplexity = torch.exp(loss)

            progressbar.set_description(f"Step: {step}/{self.total_steps}, Loss: {loss.item():.4f}, Perplexity: {perplexity:.4f}")

            # Saving the model on each `self.saving_step` step.
            if step % self.saving_step == 0:
                self.save_model(self.model_path)

        return perplexity

    def _eval_epoch(self, dataloader: DataLoader):
        progressbar = tqdm(dataloader)

        self.model.eval()

        with torch.no_grad():
            for step, batch in enumerate(progressbar, start=1):
                # Moving the Tensors to `self.device`.
                x = batch["input_ids"].to(self.device)
                y = batch["labels"].to(self.device)

                y_pred = self.model(x)
                # y_pred shape: (batch_size, seq_len, embed_size)
                loss = self.loss_func(
                    y_pred.reshape(-1, y_pred.shape[-1]), 
                    y.reshape(-1)
                )
                perplexity = torch.exp(loss)

                if step % 10 == 0:
                    progressbar.set_description(f"Evaluation Loss: {loss.item():.4f}, Perplexity: {perplexity:.4f}")

        self.model.train()

        return perplexity

    def _unpack_train_args(self, train_size: int):
        self.model_path = self.train_args.model_path

        self.epochs = self.train_args.epochs
        self.batch_size = self.train_args.batch_size
        # Num. steps in which the Linear Scheduler will update the learning rate.
        self.linear_steps = self.train_args.linear_steps
        self.total_steps = train_size // self.batch_size

        self.optimizer = Adam(
            self.model.parameters(), 
            lr=self.train_args.l_rate, weight_decay=self.train_args.weight_decay
        )
        self.grad_clip = self.train_args.grad_clip

        if self.use_schedulers:
            # Increasing the learning rate from 0 to 2e-4 over the first 2000 steps.
            self._lin_scheduler = LinearLR(
                self.optimizer, 
                # Since `start_factor` cannot be 0, I chose a small number.
                start_factor=1e-5, 
                end_factor=1, 
                total_iters=self.linear_steps
            )
            # Annealing the learning rate from 2e-4 to 0.
            self._cos_scheduler = CosineAnnealingLR(
                self.optimizer, 
                T_max=self.total_steps - self.linear_steps
            )


TRAIN_SIZE = 0.9
# You can take a subset of the dataset here, if you want to test the training session
# in a quick manner.
train_corpus = corpus[:3000]

train_args = TrainingArguments(
    # I am changing the original hyperparameters, since my environment is different.
    epochs=10,
    l_rate=2.5e-4,
    linear_steps=500,
    weight_decay=1e-2,
    sequence_length=32
)

dataset = AutoregressiveDataset(
    corpus=train_corpus, 
    tokenizer=tokenizer, 
    seq_len=train_args.sequence_length
)
train_dataset, eval_dataset = random_split(dataset, lengths=[TRAIN_SIZE, 1 - TRAIN_SIZE])
print("Training dataset size:", len(train_dataset))
print("Evaluation dataset size:", len(eval_dataset))

config = GptConfig(seq_len=train_args.sequence_length)
gpt = Gpt(config)

session = TrainingSession()
model = session.start(
    model=gpt,
    train_args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Saving the model on every 500th step.
    saving_step=500,
    # You can disable the two schedulers from here.
    use_schedulers=False
)

Training dataset size: 2671
Evaluation dataset size: 296


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

## 5. Inference

In [10]:
# Moving the model weights to the CPU for inference.
gpt.to("cpu")

# Defining an input sequence.
input_sequence = ["what",]

# Generating a sequence.
generated_sequence = gpt.generate(
    input_sequence, 
    tokenizer=tokenizer, 
    max_length=10
)
print(f"Input sequence: {input_sequence}")
print(f"Generated sequence: {generated_sequence}")

Input sequence: ['what']
Generated sequence: ['what', 'thing', 'on', 'the', 'sea', '."', '--', 'SIR', 'T', '.', '"']
