## Kiến trúc Transformer

In [1]:
# Import các thư viện cần thiết

import torch # type: ignore
import torch.nn as nn # type: ignore

seed = 1
torch.manual_seed(seed)

import os # type: ignore
import numpy as np # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
import re # type: ignore
import nltk # type: ignore

nltk.download('stopwords') # type: ignore
from nltk.corpus import stopwords # type: ignore
from nltk.stem.porter import PorterStemmer # type: ignore

from torch.utils.data import Dataset, DataLoader # type: ignore
from sklearn.model_selection import train_test_split # type: ignore

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# 1. Input Embedding, Positional Encoding
class TokenAndPositionalEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_length, device='cpu'):
        super().__init__()
        self.device = device
        self.word_emb = nn.Embedding(
                                    num_embeddings = vocab_size,
                                    embedding_dim = embed_dim
                                    )
        self.pos_emb = nn.Embedding(
                                    num_embeddings = max_length,
                                    embedding_dim = embed_dim
                                    )
    def forward(self, x):
        N, seq_len = x.size()
        positions = torch.arange(0, seq_len).expand(N, seq_len).to(self.device)
        output1 = self.word_emb(x)
        output2 = self.pos_emb(positions)
        output = output1 + output2
        return output


In [3]:
# 2. Encoder
class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
                                        embed_dim = embed_dim,
                                        num_heads = num_heads,
                                        batch_first = True
                                    )
        self.ffn = nn.Sequential(
                                nn.Linear(in_features=embed_dim, out_features=ff_dim, bias=True),
                                nn.ReLU(),
                                nn.Linear(in_features=ff_dim, out_features=embed_dim, bias=True)
                                )
        self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.dropout_1 = nn.Dropout(p=dropout)
        self.dropout_2 = nn.Dropout(p=dropout)

    def forward(self, query, key, value):
        att_output, _ = self.attn(query=query, key=key, value=value)
        att_output = self.dropout_1(att_output)
        out_1 = self.layernorm_1(query + att_output)
        ffn_output = self.ffn(out_1)
        ffn_output = self.dropout_2(ffn_output)
        out_2 = self.layernorm_2(out_1 + ffn_output)
        return out_2

class TransformerEncoder(nn.Module):
    def __init__(self,
                 src_vocab_size,
                 embed_dim,
                 max_length,
                 num_layers,
                 num_heads,
                 ff_dim,
                 dropout=0.1,
                 device='cpu'):
        super().__init__()
        self.embedding = TokenAndPositionalEmbedding(src_vocab_size,
                                                     embed_dim,
                                                     max_length,
                                                     device)
        self.layers = nn.ModuleList([TransformerEncoderBlock(
                            embed_dim, num_heads, ff_dim, dropout) for i in range(num_layers)])

    def forward(self, x):
        output = self.embedding(x)
        for layer in self.layers:
            output = layer(output, output, output)
        return output

In [4]:
# 3. Decoder
class TransformerDecoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
                                        embed_dim = embed_dim,
                                        num_heads = num_heads,
                                        batch_first = True
                                        )
        self.cross_attn = nn.MultiheadAttention(
                                                embed_dim = embed_dim,
                                                num_heads = num_heads,
                                                batch_first = True
                                                )
        self.ffn = nn.Sequential(
                                nn.Linear(in_features=embed_dim, out_features=ff_dim, bias=True),
                                nn.ReLU(),
                                nn.Linear(in_features=ff_dim, out_features=embed_dim, bias=True)
                                )
        self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_3 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.dropout_1 = nn.Dropout(p=dropout)
        self.dropout_2 = nn.Dropout(p=dropout)
        self.dropout_3 = nn.Dropout(p=dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        att_output, _ = self.attn(x, x, x, attn_mask=tgt_mask)
        att_output = self.dropout_1(att_output)
        out_1 = self.layernorm_1(x + att_output)

        attn_output, _ = self.cross_attn(out_1, enc_output, enc_output, attn_mask=src_mask)
        attn_output = self.dropout_2(attn_output)
        out_2 = self.layernorm_2(out_1 + attn_output)

        ffn_output = self.ffn(out_2)
        ffn_output = self.dropout_3(ffn_output)
        out_3 = self.layernorm_3(out_2 + ffn_output)
        return out_3

class TransformerDecoder(nn.Module):
    def __init__(self,
                 tgt_vocab_size,
                 embed_dim,
                 max_length,
                 num_layers,
                 num_heads,
                 ff_dim,
                 dropout=0.1,
                 device='cpu'):
        super().__init__()
        self.embedding = TokenAndPositionalEmbedding(tgt_vocab_size,
                                                     embed_dim,
                                                     max_length,
                                                     device)
        self.layers = nn.ModuleList([TransformerDecoderBlock(
                            embed_dim, num_heads, ff_dim, dropout) for i in range(num_layers)])


    def forward(self, x, enc_output, src_mask, tgt_mask):
        output = self.embedding(x)
        for layer in self.layers:
            output = layer(output, enc_output, src_mask, tgt_mask)
        return output

In [5]:
# 4. Transformer
class Transformer(nn.Module):
    def __init__(self,
                 src_vocab_size,
                 tgt_vocab_size,
                 embed_dim,
                 max_length,
                 num_layers,
                 num_heads,
                 ff_dim,
                 dropout=0.1,
                 device='cpu'):
        super().__init__()
        self.device = device
        self.encoder = TransformerEncoder(src_vocab_size,
                                          embed_dim,
                                          max_length,
                                          num_layers,
                                          num_heads,
                                          ff_dim,
                                          dropout=dropout,
                                          device=device
                                          )
        self.decoder = TransformerDecoder(tgt_vocab_size,
                                          embed_dim,
                                          max_length,
                                          num_layers,
                                          num_heads,
                                          ff_dim,
                                          dropout=dropout,
                                          device=device
                                          )
        self.fc = nn.Linear(embed_dim, tgt_vocab_size)

    def generate_mask(self, src, tgt):
        src_seq_len = src.shape[1]
        tgt_seq_len = tgt.shape[1]

        src_mask = torch.zeros((src_seq_len, src_seq_len), device=self.device).type(torch.bool)
        tgt_mask = torch.triu(torch.zeros((tgt_seq_len, tgt_seq_len), device=self.device)==1).transpose(0, 1)
        tgt_mask = tgt_mask.float().masked_fill(tgt_mask == 0, float('-inf')).masked_fill(tgt_mask == 1, float(0.0))

        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        enc_output = self.encoder(src)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.fc(dec_output)
        return output

In [6]:
# 5. Thử nghiệm
batch_size = 128
src_vocab_size = 1000
tgt_vocab_size = 2000
embed_dim = 200
max_length = 100
num_layers = 2
num_headers = 4
ff_dim = 256

model = Transformer(src_vocab_size,
                    tgt_vocab_size,
                    embed_dim,
                    max_length,
                    num_layers,
                    num_headers,
                    ff_dim)
src = torch.randint(
                    high=2,
                    size=(batch_size, max_length),
                    dtype=torch.int64
                    )
tgt = torch.randint(
                    high=2,
                    size=(batch_size, max_length),
                    dtype=torch.int64
                    )

prediction = model(src, tgt)
prediction.shape #batch_size x max)_length x tgt_vocab_size

torch.Size([128, 100, 2000])

## Text Classification

In [7]:
# 1. Load Dataset
%pip install -q -U transformers datasets accelerate evaluate

from datasets import load_dataset # type: ignore

ds = load_dataset('thainq107/ntc-scv')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/570 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.8M [00:00<?, ?B/s]

valid-00000-of-00001.parquet:   0%|          | 0.00/6.35M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/6.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
# 2. Preprocessing
from transformers import AutoTokenizer # type: ignore

model_name = "distilbert-base-uncased" # bert-base-uncased

tokenizer = AutoTokenizer.from_pretrained(
                                        model_name,
                                        use_fast = True
                                        )
max_seq_length = 100
max_seq_length = min(max_seq_length, tokenizer.model_max_length)

def preprocess_fuctions(examples):
    # Tokenize the texts
    result = tokenizer(
                        examples['preprocessed_sentence'],
                        padding = 'max_length',
                        max_length = max_seq_length,
                        truncation = True
                        )
    result['label'] = examples['label']
    return result

# Running the preprocessing pipeline on all the datasets
processed_dataset = ds.map(
                            preprocess_fuctions,
                            batched = True,
                            desc='Running tokenizer on dataset'
                            )

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Running tokenizer on dataset:   0%|          | 0/30000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [13]:
# 3. Modeling
from string import digits
from transformers import AutoConfig, AutoModelForSequenceClassification # type: ignore

num_labels = 2

config = AutoConfig.from_pretrained(
                                    model_name,
                                    num_labels = num_labels,
                                    finetuning_task = 'text-classification'
                                    )
model = AutoModelForSequenceClassification.from_pretrained(
                                                            model_name,
                                                            config = config
                                                            )

# 4. Metric
import numpy as np # type: ignore
import evaluate # type: ignore

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(digits, axis=-1)
    result = metric.compute(predictions=predictions, references=labels)
    return result

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [14]:
# 5. Trainer
from transformers import TrainingArguments, Trainer # type: ignore

training_args = TrainingArguments(
                                    output_dir = 'save_model',
                                    learning_rate=2e-5,
                                    per_device_train_batch_size=128,
                                    per_device_eval_batch_size=128,
                                    num_train_epochs =10,
                                    eval_strategy='epoch',
                                    save_strategy='epoch',
                                    load_best_model_at_end=True
                                    )
trainer = Trainer(
                    model = model,
                    args = training_args,
                    train_dataset = processed_dataset['train'],
                    eval_dataset = processed_dataset['test'],
                    tokenizer = tokenizer,
                    compute_metrics = compute_metrics
                    )

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 6


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:


Abort: 

In [None]:
# 6. Trainer
# tran epoch
import time

def train_epoch(model, optimizer, criterion, train_dataloader, device, epoch=0, log_interval=50):
    model.train()
    total_acc, total_count = 0, 0
    losses = []
    start_time = time.time()

    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        predictions = model(inputs)

        # compute loss
        loss = criterion(predictions, labels)
        losses.append(loss.item())

        # backward
        loss.backward()
        optimizer.step()
        total_acc += (predictions.argmax(1) == labels).sum().item()
        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()

    epoch_acc = total_acc / total_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_loss, epoch_acc

# evaluate
def evaluate_epoch(model, criterion, dataloader, device):
    model.eval()
    total_acc, total_count = 0, 0
    losses = []
    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            predictions = model(inputs)

            loss = criterion(predictions, labels)
            losses.append(loss.item())

            total_acc += (predictions.argmax(1) == labels).sum().item()
            total_count += labels.size(0)

    epoch_acc = total_acc / total_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_loss, epoch_acc

# train
def train(model, model_name, save_model, optimizer, criterion, train_dataloader, val_dataloader, num_epochs, device):
    train_accs, train_losses = [], []
    eval_accs, eval_losses = [], []
    best_loss_eval = 100
    times = []
    for epoch in range(1, num_epochs + 1):
        epoch_start_time = time.time()
        # Training
        train_acc, train_loss = train_epoch(model, optimizer, criterion, train_dataloader, device, epoch)
        train_accs.append(train_acc)
        train_losses.append(train_loss)

        # Evaluation
        eval_acc, eval_loss = evaluate_epoch(model, criterion, val_dataloader, device)
        eval_accs.append(eval_acc)
        eval_losses.append(eval_loss)

        # Save best model
        if eval_loss < best_loss_eval:
            torch.save(model.state_dict(), save_model + f'/{model_name}.pt')

        times.append(time.time() - epoch_start_time)

        # Print loss, acc end epoch
        print("-" * 59)
        print(
            "| End of epoch {:3d} | time: {:5.2f}s | Train Accuracy {:8.3f} | Train Loss {:8.3f}"
            "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f}".format(
                epoch, times.time() - epoch_start_time, train_acc, train_loss, eval_loss, eval_acc
            )
        )
        print("-" *59)

    # Load beset model
    model.load_state_dict(torch.load(save_model + f'/{model_name}.pt'))
    model.eval()
    metrics = {
            'train_accuracy' : train_accs,
            'train_loss' : train_losses,
            'valid_accuracy' : eval_accs,
            'valid_loss' : eval_losses,
            'time' : times
            }
    return model, metrics

# Report
import matplotlib.pyplot as plt # type: ignore

def plot_result(num_epochs, train_accs, eval_accs, train_losses, eval_losses):
    epochs = list(range(num_epochs))
    fig, axs = plt.subplots(nrows = 1, ncols = 2, figsize = (12.6))
    axs[0].plot(epochs, train_accs, label = 'Training')
    axs[0].plot(epochs, eval_accs, label = 'Evaluation')
    axs[1].plot(epochs, train_losses, label = 'Training')
    axs[1].plot(epochs, eval_losses, label = 'Evaluation')
    axs[0].set_xlabel('Epochs')
    axs[1].set_xlabel('Epochs')
    axs[0].set_ytitle('Accuracy')
    axs[1].set_ytitle('Loss')
    plt.legend()
    plt.show()

In [None]:
# 6. Modeling
class TransformerEncoderCls(nn.Module):
    def __init__(self,
                 vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim,
                 dropout=0.1,
                 device='cpu'):
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size,
                                          embed_dim,
                                          max_length,
                                          num_layers,
                                          num_heads,
                                          ff_dim,
                                          dropout,
                                          device)
        self.pooling = nn.AvgPool1d(kernel_size=max_length)
        self.fc1 = nn.Linear(in_features=embed_dim, out_features=20)
        self.fc2 = nn.Linear(in_features=20, out_features=2)
        self.drouout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        output = self.encoder(x)
        output = self.pooling(output.permute(0, 2, 1)).squeeze()
        output = self.dropout(output)
        output = self.fc1(output)
        output = self.drouout(output)
        output = self.fc2(output)
        return output

# 7. Training
import torch.optim as optim # type: ignore

vocab_size = 10000
max_length = 100
embed_dim = 200
num_layers = 2
num_heads = 4
ff_dim = 128
dropout = 0.1

model = TransformerEncoderCls(vocab_size,
                              max_length,
                              num_layers,
                              embed_dim,
                              num_heads,
                              ff_dim,
                              dropout
                            )
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TransformerEncoderCls(vocab_size,
                              max_length,
                              num_layers,
                              embed_dim,
                              num_heads,
                              ff_dim,
                              dropout,
                              device
                            )
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00005, weight_decay=0.0001)

num_epochs = 50
save_model = './model'
os.makedirs(save_model, exist_ok=True)
model_name = 'model'
