In [1]:
import math
import copy

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os

In [2]:
!pip install datasets

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len):
        super().__init__()
        # Modified version from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
        # max_len determines how far the position can have an effect on a token (window)

        # Info
        self.dropout = nn.Dropout(dropout_p)

        # Encoding - From formula
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, 4, 5
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model) # 1000^(2i/dim_model)

        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)

        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)

        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)

    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        # Residual connection + pos encoding
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])

In [4]:
class Transformer(nn.Module):
    """
    Model from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    # Constructor
    def __init__(
        self,
        num_tokens,
        dim_model,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        dropout_p,
    ):
        super().__init__()

        # INFO
        self.model_type = "Transformer"
        self.dim_model = dim_model

        # LAYERS
        self.positional_encoder = PositionalEncoding(
            dim_model=dim_model, dropout_p=dropout_p, max_len=5000
        )
        self.embedding = nn.Embedding(num_tokens, dim_model)
        self.transformer = nn.Transformer(
            d_model=dim_model,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dropout=dropout_p,
        )
        self.out = nn.Linear(dim_model, num_tokens)

    def forward(self, src, tgt, tgt_mask=None, src_pad_mask=None, tgt_pad_mask=None):
        # Src size must be (batch_size, src sequence length)
        # Tgt size must be (batch_size, tgt sequence length)

        # Embedding + positional encoding - Out size = (batch_size, sequence length, dim_model)
        src = self.embedding(src) * math.sqrt(self.dim_model)
        tgt = self.embedding(tgt) * math.sqrt(self.dim_model)
        src = self.positional_encoder(src)
        tgt = self.positional_encoder(tgt)

        # We could use the parameter batch_first=True, but our KDL version doesn't support it yet, so we permute
        # to obtain size (sequence length, batch_size, dim_model),
        src = src.permute(1,0,2)
        tgt = tgt.permute(1,0,2)

        # Transformer blocks - Out size = (sequence length, batch_size, num_tokens)
        transformer_out = self.transformer(src, tgt, tgt_mask=tgt_mask, src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=tgt_pad_mask)
        out = self.out(transformer_out)

        return out

    def get_tgt_mask(self, size) -> torch.tensor:
        # Generates a squeare matrix where the each row allows one word more to be seen
        mask = torch.tril(torch.ones(size, size) == 1) # Lower triangular matrix
        mask = mask.float()
        mask = mask.masked_fill(mask == 0, float('-inf')) # Convert zeros to -inf
        mask = mask.masked_fill(mask == 1, float(0.0)) # Convert ones to 0

        # EX for size=5:
        # [[0., -inf, -inf, -inf, -inf],
        #  [0.,   0., -inf, -inf, -inf],
        #  [0.,   0.,   0., -inf, -inf],
        #  [0.,   0.,   0.,   0., -inf],
        #  [0.,   0.,   0.,   0.,   0.]]

        return mask

    def create_pad_mask(self, matrix: torch.tensor, pad_token: int) -> torch.tensor:
        # If matrix = [1,2,3,0,0,0] where pad_token=0, the result mask is
        # [False, False, False, True, True, True]
        return (matrix == pad_token)

In [5]:
# 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
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
from datasets import load_dataset

# dataset = load_dataset("mateiaassAI/MEID", split=['train[:5%]', 'train[5%:7%]', 'train[7%:10%]'])
dataset = load_dataset("mateiaassAI/MEID", split=['train[:75%]', 'train[50%:55%]', 'train[80%:81%]'])
train_dataset = dataset[0]
test_dataset = dataset[1]
valid_dataset = dataset[2]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
from tokenizers import ByteLevelBPETokenizer
import pandas as pd

dataset_tok = load_dataset("mateiaassAI/MEID")
dataset_tok = dataset_tok["train"]
dataset_tok = dataset_tok['right']

tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer.train_from_iterator(dataset_tok, vocab_size=100000, min_frequency=1, show_progress=True,special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

src_pad_idx = tokenizer.token_to_id('<pad>')
trg_pad_idx = src_pad_idx
trg_sos_idx = tokenizer.token_to_id('<s>')
trg_eos_idx = tokenizer.token_to_id('</s>')

enc_voc_size = tokenizer.get_vocab_size()
dec_voc_size = enc_voc_size

print(trg_sos_idx, trg_eos_idx, src_pad_idx, enc_voc_size)

0 2 1 100000


In [9]:
import random

def generate_data(dataframe, tokenizer):
    data = []
    for example in dataframe:
      input_text = example["wrong"]
      target_text = example["right"]

      input_ids = np.array(tokenizer.encode(input_text).ids)
      target_ids = np.array(tokenizer.encode(target_text).ids)

      SOS_token = np.array([trg_sos_idx])
      EOS_token = np.array([trg_eos_idx])
      X = np.concatenate((SOS_token, input_ids, EOS_token))
      y = np.concatenate((SOS_token, target_ids, EOS_token))

      data.append([X.tolist(), y.tolist()])

    np.random.shuffle(data)
    return data

In [10]:
train_data = generate_data(train_dataset, tokenizer)
val_data = generate_data(valid_dataset, tokenizer)

In [11]:
df = pd.DataFrame(train_data)

# Check for NaN values
nan_values = df.isnull().sum().sum()


if nan_values == 0:
    print("No NaN values found in the dataset.")
else:
    print(f"Total NaN values found: {nan_values}")

No NaN values found in the dataset.


In [12]:
def is_numeric(val):
    try:
        pd.to_numeric(val)
        return True
    except (ValueError, TypeError):
        return False

# Find non-numeric elements in each column
non_numeric_elements = {}
for col in df.columns:
    non_numeric_elements[col] = df[~df[col].apply(is_numeric)][col]

# Print non-numeric elements
for col, values in non_numeric_elements.items():
    if not values.empty:
        print(f"Non-numeric elements in column '{col}':")
        print(values)

In [13]:
def batchify_data(data, batch_size=16, padding=False, padding_token=-1):
    batches = []
    for idx in range(0, len(data), batch_size):
        # We make sure we dont get the last bit if its not batch_size size
        if idx + batch_size < len(data):
            # Here you would need to get the max length of the batch,
            # and normalize the length with the PAD token.
            if padding:
                max_batch_length = 0

                # Get longest sentence in batch
                for seq in data[idx : idx + batch_size]:
                    if len(seq[0]) > max_batch_length:
                        max_batch_length = len(seq[0])
                    if len(seq[1]) > max_batch_length:
                        max_batch_length = len(seq[1])

                # Append X padding tokens until it reaches the max length
                for seq_idx in range(batch_size):
                    remaining_length = max_batch_length - len(data[idx + seq_idx][0])
                    data[idx + seq_idx][0] += [padding_token] * remaining_length
                    # padding_array = np.array([padding_token] * remaining_length)
                    # data[idx + seq_idx][0] = np.concatenate((data[idx + seq_idx][0], padding_array))

                    remaining_length = max_batch_length - len(data[idx + seq_idx][1])
                    # padding_array = np.array([padding_token] * remaining_length)
                    data[idx + seq_idx][1] += [padding_token] * remaining_length
                    # data[idx + seq_idx][1] = np.concatenate((data[idx + seq_idx][1], padding_array))

            # batches.append(data[idx : idx + batch_size])
            batches.append(np.array(data[idx : idx + batch_size]).astype(np.int64))
    print(f"{len(batches)} batches of size {batch_size}")

    return batches

In [14]:
train_dataloader = batchify_data(train_data, batch_size=2, padding = True, padding_token = src_pad_idx)
val_dataloader = batchify_data(val_data, batch_size=2, padding = True, padding_token = src_pad_idx)

75490 batches of size 2
1006 batches of size 2


In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Transformer(
    num_tokens=enc_voc_size, dim_model=512, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, dropout_p=0.1
).to(device)
# opt = torch.optim.SGD(model.parameters(), lr=0.01)
# loss_fn = nn.CrossEntropyLoss()



In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform_(m.weight.data)

print(f'The model has {count_parameters(model):,} trainable parameters')
model.apply(initialize_weights)

The model has 146,640,544 trainable parameters


Transformer(
  (positional_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (embedding): Embedding(100000, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): Tran

In [17]:
from torch.optim import Adam
from torch.optim import lr_scheduler
from torch.optim import AdamW

init_lr = 1e-6
factor = 0.9
adam_eps = 5e-9
patience = 10
# warmup = 100
clip = 1.0
weight_decay = 5e-4

# optimizer = Adam(params=model.parameters(),
#                  lr=init_lr,
#                  weight_decay=weight_decay,
#                  eps=adam_eps)

# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
#                                                  verbose=True,
#                                                  factor=factor,
#                                                  patience=patience)

# loss_fn = nn.CrossEntropyLoss()

loss_fn = nn.CrossEntropyLoss(ignore_index=src_pad_idx)
optimizer = AdamW(model.parameters(), lr=0.001, betas=(0.9, 0.99), weight_decay = 0.1)

# optimizer = optim.Adam(model.parameters(), lr=init_lr, weight_decay=0.01)
# optimizer = torch.optim.SGD(model.parameters(), 1e-02, weight_decay=0.01)
# scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
# scheduler = optim.lr_scheduler.CyclicLR(optimizer,base_lr=init_lr,max_lr=0.001,mode='triangular2', cycle_momentum=True)


In [18]:
import time

def train_loop(model, opt, loss_fn, dataloader):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """

    model.train()
    total_loss = 0
    start_time = time.time()

    for step, batch in enumerate(dataloader):
        # batch = np.array(batch)
        # print(batch[0])
        X, y = batch[:, 0], batch[:, 1]
        X, y = torch.tensor(X).to(device), torch.tensor(y).to(device)

        # Now we shift the tgt by one so with the <SOS> we predict the token at pos 1
        y_input = y[:,:-1]
        y_expected = y[:,1:]

        # Get mask to mask out the next words
        sequence_length = y_input.size(1)
        tgt_mask = model.get_tgt_mask(sequence_length).to(device)

        # Standard training except we pass in y_input and tgt_mask
        pred = model(X, y_input, tgt_mask)

        # Permute pred to have batch size first again
        pred = pred.permute(1, 2, 0)
        # print(pred[0,:,:], y_expected[0])
        # break
        loss = loss_fn(pred, y_expected)

        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        opt.step()

        total_loss += loss.detach().item()
        # scheduler.step()
        # if torch.isnan(loss).any():
        #       print("NaN loss detected! Debugging information:")
        #       print("Input Data (X):", X)
        #       print("Expected Output (y_expected):", y_expected)
        #       print("Expected Output (y_input):", y_input)
        #       print("Model Predictions (pred):", pred)
        #       # print("Gradients:", [param.grad for param in model.parameters() if param.grad is not None])

        # Print learning rate
        learning_rate = opt.param_groups[0]['lr']
        if (step + 1) % 10 == 0:  # Adjust frequency as needed
            elapsed_time = time.time() - start_time
            avg_loss = total_loss / step
            print(f"Step [{step+1}/{len(dataloader)}], Loss: {loss.item():.4f}, Time: {elapsed_time:.2f} seconds, Average Loss: {avg_loss:.4f}, Learning Rate: {learning_rate}")
            start_time = time.time()

    return total_loss / len(dataloader)

In [19]:
from sklearn.metrics import accuracy_score, f1_score

def validation_loop(model, loss_fn, dataloader):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """

    model.eval()
    total_loss = 0
    start_time = time.time()

    with torch.no_grad():
        for step, batch in enumerate(dataloader):
            X, y = batch[:, 0], batch[:, 1]
            X, y = torch.tensor(X, dtype=torch.long, device=device), torch.tensor(y, dtype=torch.long, device=device)

            # Now we shift the tgt by one so with the <SOS> we predict the token at pos 1
            y_input = y[:,:-1]
            y_expected = y[:,1:]

            # Get mask to mask out the next words
            sequence_length = y_input.size(1)
            tgt_mask = model.get_tgt_mask(sequence_length).to(device)

            # Standard training except we pass in y_input and src_mask
            pred = model(X, y_input, tgt_mask)

            # Permute pred to have batch size first again
            pred = pred.permute(1, 2, 0)
            loss = loss_fn(pred, y_expected)
            total_loss += loss.detach().item()

            # if torch.isnan(loss).any():
            #   print("NaN loss detected! Debugging information:")
            #   print("Input Data (X):", X)
            #   print("Expected Output (y_expected):", y_expected)
            #   print("Expected Output (y_input):", y_input)
            #   print("Model Predictions (pred):", pred)
            #   print("Gradients:", [param.grad for param in model.parameters() if param.grad is not None])

            #   # Additional information
            #   print("Model Parameters:")
            #   for name, param in model.named_parameters():
            #       print(name, param.data)

            #   print("Optimizer State:")
            #   for param_group in opt.param_groups:
            #       print("Learning Rate:", param_group['lr'])

            #   # Add more debugging information as needed

            #   print("Loss Function Parameters:")
            #   for name, param in loss_fn.named_parameters():
            #       print(name, param.data)
            #   break

            if (step + 1) % 10 == 0:  # Adjust frequency as needed
              elapsed_time = time.time() - start_time
              print(f"Step [{step+1}/{len(dataloader)}], Loss: {loss.item():.4f}, Time: {elapsed_time:.2f} seconds")
              start_time = time.time()

    return total_loss / len(dataloader)

In [20]:
def fit(model, opt, loss_fn, train_dataloader, val_dataloader, epochs):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """

    # Used for plotting later on
    train_loss_list, validation_loss_list = [], []

    print("Training and validating model")
    for epoch in range(epochs):
        print("-"*25, f"Epoch {epoch + 1}","-"*25)

        train_loss = train_loop(model, opt, loss_fn, train_dataloader)
        # break
        train_loss_list += [train_loss]

        validation_loss = validation_loop(model, loss_fn, val_dataloader)
        validation_loss_list += [validation_loss]

        print(f"Training loss: {train_loss:.4f}")
        print(f"Validation loss: {validation_loss:.4f}")
        print()

    return train_loss_list, validation_loss_list

In [None]:
train_loss_list, validation_loss_list = fit(model, optimizer, loss_fn, train_dataloader, val_dataloader, 1)

Training and validating model
------------------------- Epoch 1 -------------------------
Step [10/75490], Loss: 9.6452, Time: 2.46 seconds, Average Loss: 12.2522, Learning Rate: 0.001
Step [20/75490], Loss: 8.9242, Time: 1.74 seconds, Average Loss: 10.6204, Learning Rate: 0.001
Step [30/75490], Loss: 7.8461, Time: 1.72 seconds, Average Loss: 10.0493, Learning Rate: 0.001
Step [40/75490], Loss: 8.5602, Time: 2.21 seconds, Average Loss: 9.6949, Learning Rate: 0.001
Step [50/75490], Loss: 8.3979, Time: 2.25 seconds, Average Loss: 9.5142, Learning Rate: 0.001
Step [60/75490], Loss: 7.8953, Time: 1.38 seconds, Average Loss: 9.3395, Learning Rate: 0.001
Step [70/75490], Loss: 8.7136, Time: 1.84 seconds, Average Loss: 9.1862, Learning Rate: 0.001
Step [80/75490], Loss: 8.6256, Time: 1.27 seconds, Average Loss: 9.0994, Learning Rate: 0.001
Step [90/75490], Loss: 9.1433, Time: 1.62 seconds, Average Loss: 9.0049, Learning Rate: 0.001
Step [100/75490], Loss: 9.1641, Time: 1.42 seconds, Average L

In [None]:
model_name = 'transformer_model.pth'
model_path = '/content/drive/MyDrive/transformer/'
os.makedirs(model_path, exist_ok=True)
os.chdir(model_path)

torch.save(model.state_dict(), model_name)

In [None]:
model_saved = Transformer(
    num_tokens=enc_voc_size, dim_model=512, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, dropout_p=0.1
).to(device)

model_location = os.path.join(model_path, model_name)
model_saved.load_state_dict(torch.load(model_location, map_location=device))

In [None]:
def predict(model, input_sequence, max_length=512, SOS_token=trg_sos_idx, EOS_token=trg_eos_idx):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    model.eval()

    y_input = torch.tensor([[SOS_token]], dtype=torch.long, device=device)

    num_tokens = len(input_sequence[0])

    for _ in range(max_length):
        # Get source mask
        tgt_mask = model.get_tgt_mask(y_input.size(1)).to(device)

        pred = model(input_sequence, y_input, tgt_mask)

        next_item = pred.topk(1)[1].view(-1)[-1].item() # num with highest probability
        next_item = torch.tensor([[next_item]], device=device)

        # Concatenate previous input with predicted best word
        y_input = torch.cat((y_input, next_item), dim=1)

        # Stop if model predicts end of sentence
        if next_item.view(-1).item() == EOS_token:
            break

    return y_input.view(-1).tolist()

In [None]:
def text_to_encode(text, tokenizer):
  input_ids = np.array(tokenizer.encode(text).ids)

  SOS_token = np.array([trg_sos_idx])
  EOS_token = np.array([trg_eos_idx])
  #print(SOS_token, EOS_token)
  X = np.concatenate((SOS_token, input_ids, EOS_token))
  print(X)
  return X

In [None]:
examples = [
    torch.tensor([text_to_encode("dosarul sa stricta.", tokenizer)], dtype=torch.long, device=device),
    # torch.tensor([[2, 1, 1, 1, 1, 1, 1, 1, 1, 3]], dtype=torch.long, device=device),
    # torch.tensor([[2, 1, 0, 1, 0, 1, 0, 1, 0, 3]], dtype=torch.long, device=device),
    # torch.tensor([[2, 0, 1, 0, 1, 0, 1, 0, 1, 3]], dtype=torch.long, device=device),
    # torch.tensor([[2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 3]], dtype=torch.long, device=device),
    # torch.tensor([[2, 0, 1, 3]], dtype=torch.long, device=device)
]

for idx, example in enumerate(examples):
    result = predict(model_saved, example)
    print(f"Example {idx}")
    print(f"Input: {example.view(-1).tolist()[1:-1]}")
    print(f"Continuation: {result}")
    print(f"Text-OUTPUT:", tokenizer.decode(result[1:-1]))
    print()