In [21]:
import os
import re
import string
import time

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator

from transformer import TransformerEncoder

In [1]:
# Load the "thainq107/ntc-scv" dataset via Hugging Face `datasets`.
# The bare `ds` expression displays the DatasetDict (splits, columns, row counts).
ds = load_dataset("thainq107/ntc-scv")
ds

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 570/570 [00:00<00:00, 1.97kB/s]
Downloading data: 100%|██████████| 18.8M/18.8M [00:02<00:00, 9.37MB/s]
Downloading data: 100%|██████████| 6.35M/6.35M [00:01<00:00, 5.69MB/s]
Downloading data: 100%|██████████| 6.35M/6.35M [00:01<00:00, 5.54MB/s]
Generating train split: 100%|██████████| 30000/30000 [00:00<00:00, 700611.47 examples/s]
Generating valid split: 100%|██████████| 10000/10000 [00:00<00:00, 1010091.51 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 1063815.15 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
})

# Data preparation

## Preprocessing

In [None]:
def preprocess_text(text):
    """Normalize a raw sentence before tokenization.

    The steps run in this order: remove URLs, remove HTML tags, delete ASCII
    punctuation and digits, delete emoji/pictographs, collapse whitespace and
    lowercase.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased string with single spaces between words
        (an empty string if nothing is left).
    """
    # remove URLs: "http://...", "https://..." and bare "www...." links.
    # Quotes and angle brackets end the match so that a URL inside an HTML
    # attribute does not swallow the rest of the tag and its text.
    url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")
    text = url_pattern.sub(" ", text)
    # remove HTML Tags: <>
    html_pattern = re.compile(r"<[^<>]+>")
    text = html_pattern.sub(" ", text)
    # remove puncs and digits (deleted outright, not replaced by a space)
    text = text.translate(str.maketrans("", "", string.punctuation + string.digits))
    # remove emoji
    # NOTE: the "\U000024C2-\U0001F251" range is very wide and also covers
    # non-emoji code points between U+24C2 and U+1F251; it is kept unchanged
    # so existing behavior is preserved.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U0001F1F2-\U0001F1F4"  # Macau flag
        "\U0001F1E6-\U0001F1FF"  # flags
        "\U0001F600-\U0001F64F"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U0001F1F2"
        "\U0001F1F4"
        "\U0001F620"
        "\u200d"
        "\u2640-\u2642"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub(r"", text)
    # normalize whitespace
    text = " ".join(text.split())
    # lowercasing
    text = text.lower()
    return text

## Representation

In [7]:
def yield_tokens(sentences, tokenizer):
    """Lazily yield the tokenizer's output for each sentence, in order.

    Args:
        sentences: Iterable of sentences to tokenize.
        tokenizer: Callable applied to each sentence.

    Yields:
        Whatever `tokenizer` returns for each sentence.
    """
    yield from map(tokenizer, sentences)


# word-based tokenizer
# NOTE(review): "basic_english" is torchtext's English-oriented tokenizer; presumably
# it is adequate here because the input column is already preprocessed — confirm
# that it splits the Vietnamese text as intended.
tokenizer = get_tokenizer("basic_english")

In [9]:
# build vocabulary from the training split only
# NOTE(review): per the torchtext docs, `max_tokens` is expected to include the
# special tokens — confirm against the installed version.
vocab_size = 10000
vocabulary = build_vocab_from_iterator(
    yield_tokens(ds["train"]["preprocessed_sentence"], tokenizer),
    max_tokens=vocab_size,
    specials=["<pad>", "<unk>"],
)
# Tokens missing from the vocabulary are mapped to the "<unk>" index.
vocabulary.set_default_index(vocabulary["<unk>"])
# Show the first ten entries; "<pad>" is at index 0 and "<unk>" at index 1.
print(vocabulary.get_itos()[:10])

['<pad>', '<unk>', 'ăn', 'mình', 'có', 'là', 'không', 'quán', 'thì', 'và']


In [12]:
# encode each row as (token ids, label) so it can be wrapped as a torchtext dataset
def prepare_dataset(df):
    """Lazily encode every row of a dataset split.

    Uses the module-level `tokenizer` and `vocabulary`.

    Args:
        df: Iterable of rows exposing the "preprocessed_sentence" and
            "label" keys.

    Yields:
        Tuples of (encoded sentence, label), one per row.
    """
    yield from (
        (vocabulary(tokenizer(record["preprocessed_sentence"])), record["label"])
        for record in df
    )


# Encode each split, then wrap the generator with `to_map_style_dataset` so the
# result supports indexing and `len()` (both are used below and by the DataLoaders).
train_dataset = prepare_dataset(ds["train"])
train_dataset = to_map_style_dataset(train_dataset)

valid_dataset = prepare_dataset(ds["valid"])
valid_dataset = to_map_style_dataset(valid_dataset)

test_dataset = prepare_dataset(ds["test"])
test_dataset = to_map_style_dataset(test_dataset)

# Sanity check: first (token ids, label) example of each split.
print(train_dataset[0])
print(valid_dataset[0])
print(test_dataset[0])

# print the number of examples in each split
print(len(train_dataset))
print(len(valid_dataset))
print(len(test_dataset))

([351, 111, 112, 529, 124, 228, 196, 53, 159, 43, 103, 256, 46, 2, 11, 31, 52, 723, 32, 491, 991, 533, 32, 491, 220, 1415, 9, 731, 897, 185, 130, 836, 57, 88, 4, 14, 3183, 251, 59], 1)
([5, 7, 322, 221, 256, 21, 28, 116, 84, 689, 584, 3, 950, 102, 221, 254, 9, 221, 25, 581, 258, 53, 13, 11, 21, 59, 59, 255, 72, 65, 324, 1331, 737, 768, 49, 371, 340, 13, 86, 244, 30, 3, 9, 317, 58, 96, 5, 59, 300, 270, 650, 4214, 297, 1201, 81, 756, 701, 441, 180, 17, 2, 88, 5, 1576, 215, 33, 401, 359, 677, 439, 1555, 3, 6, 62, 181, 167, 72, 221, 5, 300, 11, 9, 300, 47, 66, 17, 192, 29, 81, 1071, 43, 246, 8, 52, 7, 13, 110, 56, 394, 167, 2, 359, 17, 38, 728, 42, 162, 235, 90, 1690, 116, 235, 69, 6, 28, 2, 200, 3, 90, 141, 87, 68, 2, 33, 312, 301, 17, 189, 2, 273, 7, 2545, 484, 269, 795, 4, 1331, 109, 1976, 10, 28, 490, 9, 703, 210, 701, 104, 68, 104, 457, 95, 1331, 221, 57], 1)
([584, 183, 3, 131, 984, 87, 598, 576, 9, 357, 61, 1008, 36, 365, 192, 1811, 23, 81, 41, 108, 113, 16, 176, 133, 290, 20, 84, 8

## Data loader

In [14]:
seq_length = 100


def collate_batch(batch):
    """Turn a list of (token ids, label) pairs into two tensors.

    Every sentence is cut to at most `seq_length` ids and right-padded with 0
    so that all rows have exactly `seq_length` entries.

    Args:
        batch: Sequence of (list of token ids, label) pairs.

    Returns:
        Tuple (encoded_sentences, labels): a tensor with one fixed-length row
        per sentence and a tensor with one entry per label.
    """

    def _fit(token_ids):
        # Truncate first; the padding is empty when the sentence is long enough.
        clipped = token_ids[:seq_length]
        return clipped + [0] * (seq_length - len(clipped))

    sentences, labels = zip(*batch)
    padded_rows = [_fit(token_ids) for token_ids in sentences]
    return torch.tensor(padded_rows), torch.tensor(labels)


# Batch every split with `collate_batch`; only the training split is shuffled.
batch_size = 128
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)

valid_dataloader = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)

test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)

# Model training

## Modeling

In [16]:
class TransformerEncoderCls(nn.Module):
    """Sentence classifier: Transformer encoder, average pooling and a 2-layer MLP.

    Args:
        vocab_size: Passed to `TransformerEncoder` as its vocabulary size.
        max_length: Passed to `TransformerEncoder` and used as the kernel size
            of the average pooling.
        num_layers: Passed to `TransformerEncoder`.
        embed_dim: Passed to `TransformerEncoder`; also the input width of the
            first linear layer.
        num_heads: Passed to `TransformerEncoder`.
        ff_dim: Passed to `TransformerEncoder`.
        dropout: Dropout probability used by the encoder and the classifier head.
        device: Passed to `TransformerEncoder`.
    """

    def __init__(
        self,
        vocab_size,
        max_length,
        num_layers,
        embed_dim,
        num_heads,
        ff_dim,
        dropout=0.1,
        device="cpu",
    ):
        super().__init__()
        self.encoder = TransformerEncoder(
            vocab_size,
            embed_dim,
            max_length,
            num_layers,
            num_heads,
            ff_dim,
            dropout,
            device,
        )
        self.pooling = nn.AvgPool1d(kernel_size=max_length)
        self.fc1 = nn.Linear(in_features=embed_dim, out_features=20)
        self.fc2 = nn.Linear(in_features=20, out_features=2)
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute class logits for a batch of encoded sentences.

        Args:
            x: Batch of token ids fed to the encoder.
               NOTE(review): presumably (batch, max_length) — confirm against
               `TransformerEncoder`.

        Returns:
            Tensor of shape (batch, 2) with unnormalized class scores.
        """
        output = self.encoder(x)
        # Move the sequence axis last so AvgPool1d averages over it, then drop
        # only that pooled axis. A bare squeeze() would also drop the batch
        # axis when the batch holds a single sample.
        output = self.pooling(output.permute(0, 2, 1)).squeeze(-1)
        output = self.dropout(output)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.fc2(output)
        return output

## Trainer

In [18]:
def train_epoch(
    model, optimizer, criterion, train_dataloader, device, epoch=0, log_interval=50
):
    """Train `model` for one pass over `train_dataloader`.

    Args:
        model: Module to optimize; switched to training mode.
        optimizer: Optimizer whose gradients are zeroed and stepped per batch.
        criterion: Loss called as `criterion(predictions, labels)`.
        train_dataloader: Sized iterable of (inputs, labels) batches.
        device: Device the batches are moved to.
        epoch: Epoch number shown in the progress log.
        log_interval: A progress line is printed every `log_interval` batches.

    Returns:
        Tuple (epoch_acc, epoch_loss): accuracy over every sample seen in the
        epoch and the mean of the per-batch losses. Both are 0.0 when the
        dataloader yields no batches.
    """
    model.train()
    # Two sets of counters: the epoch-wide totals are returned, while the
    # windowed ones only feed the progress log and are reset after each print.
    # Sharing a single set made the returned accuracy cover just the batches
    # after the last log line, and divide by zero when the epoch ended on one.
    epoch_correct, epoch_count = 0, 0
    window_correct, window_count = 0, 0
    losses = []
    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        predictions = model(inputs)
        # compute loss
        loss = criterion(predictions, labels)
        losses.append(loss.item())
        # backward
        loss.backward()
        optimizer.step()
        batch_correct = (predictions.argmax(1) == labels).sum().item()
        batch_count = labels.size(0)
        epoch_correct += batch_correct
        epoch_count += batch_count
        window_correct += batch_correct
        window_count += batch_count
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader), window_correct / window_count
                )
            )
            window_correct, window_count = 0, 0
    epoch_acc = epoch_correct / epoch_count if epoch_count else 0.0
    epoch_loss = sum(losses) / len(losses) if losses else 0.0
    return epoch_acc, epoch_loss


def evaluate_epoch(model, criterion, valid_dataloader, device):
    """Evaluate `model` on every batch of `valid_dataloader` without gradients.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        criterion: Loss called as `criterion(predictions, labels)`.
        valid_dataloader: Iterable of (inputs, labels) batches.
        device: Device the batches are moved to.

    Returns:
        Tuple (epoch_acc, epoch_loss): the fraction of correctly classified
        samples and the mean of the per-batch losses.
    """
    model.eval()
    correct, seen = 0, 0
    batch_losses = []
    with torch.no_grad():
        for inputs, labels in valid_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            batch_losses.append(criterion(logits, labels).item())
            correct += (logits.argmax(1) == labels).sum().item()
            seen += labels.size(0)

    return correct / seen, sum(batch_losses) / len(batch_losses)


def train(
    model,
    model_name,
    save_model,
    optimizer,
    criterion,
    train_dataloader,
    valid_dataloader,
    num_epochs,
    device,
):
    """Run the full training loop and restore the best checkpoint.

    After every epoch the model is evaluated on `valid_dataloader`; whenever
    the validation loss improves, the weights are saved to
    `<save_model>/<model_name>.pt`.

    Args:
        model: Module to train.
        model_name: Checkpoint file name without the ".pt" extension.
        save_model: Directory for the checkpoint; created if it is missing.
        optimizer: Optimizer passed to `train_epoch`.
        criterion: Loss passed to `train_epoch` and `evaluate_epoch`.
        train_dataloader: Batches used for training.
        valid_dataloader: Batches used for validation.
        num_epochs: Number of epochs to run.
        device: Device used for the batches and for loading the checkpoint.

    Returns:
        Tuple (model, metrics). `model` is in evaluation mode and holds the
        best saved weights (if any checkpoint was written during this call).
        `metrics` maps "train_accuracy", "train_loss", "valid_accuracy",
        "valid_loss" and "time" to per-epoch lists.
    """
    train_accs, train_losses = [], []
    eval_accs, eval_losses = [], []
    times = []
    # Start from +inf so the first finite validation loss always counts as an
    # improvement, whatever its scale.
    best_loss_eval = float("inf")
    checkpoint_path = save_model + f"/{model_name}.pt"
    # torch.save does not create missing parent directories.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_saved = False
    for epoch in range(1, num_epochs + 1):
        epoch_start_time = time.time()
        # Training
        train_acc, train_loss = train_epoch(
            model, optimizer, criterion, train_dataloader, device, epoch
        )
        train_accs.append(train_acc)
        train_losses.append(train_loss)
        # Evaluation
        eval_acc, eval_loss = evaluate_epoch(model, criterion, valid_dataloader, device)
        eval_accs.append(eval_acc)
        eval_losses.append(eval_loss)
        # Save best model
        if eval_loss < best_loss_eval:
            torch.save(model.state_dict(), checkpoint_path)
            best_loss_eval = eval_loss
            checkpoint_saved = True
        # Measure once so the logged time and the recorded time agree.
        epoch_time = time.time() - epoch_start_time
        times.append(epoch_time)
        # Print loss, acc end epoch
        print("-" * 59)
        print(
            "| End of epoch {:3d} | Time: {:5.2f}s | Train Accuracy {:8.3f} | Train Loss {:8.3f} "
            "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f} ".format(
                epoch,
                epoch_time,
                train_acc,
                train_loss,
                eval_acc,
                eval_loss,
            )
        )
        print("-" * 59)

    # Load best model. Skipped when nothing was saved in this call (no epochs,
    # or the validation loss was never a finite improvement), so a missing or
    # stale checkpoint file is never loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    metrics = {
        "train_accuracy": train_accs,
        "train_loss": train_losses,
        "valid_accuracy": eval_accs,
        "valid_loss": eval_losses,
        "time": times,
    }
    return model, metrics

In [20]:
def plot_result(num_epochs, train_accs, eval_accs, train_losses, eval_losses):
    """Plot training vs. evaluation accuracy (left) and loss (right) per epoch.

    Args:
        num_epochs: Number of epochs; the x axis is `range(num_epochs)`.
        train_accs: Per-epoch training accuracies.
        eval_accs: Per-epoch evaluation accuracies.
        train_losses: Per-epoch training losses.
        eval_losses: Per-epoch evaluation losses.
    """
    epochs = list(range(num_epochs))
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    panels = (
        (axs[0], "Accuracy", train_accs, eval_accs),
        (axs[1], "Loss", train_losses, eval_losses),
    )
    for ax, ylabel, train_values, eval_values in panels:
        ax.plot(epochs, train_values, label="Training")
        ax.plot(epochs, eval_values, label="Evaluation")
        ax.set_xlabel("Epochs")
        ax.set_ylabel(ylabel)
        # A single plt.legend() would only annotate the current (last) axes,
        # so each panel gets its own legend.
        ax.legend()

## Training

In [22]:
# Hyper-parameters, model, loss and optimizer.
vocab_size = 10000  # same value as the cap used when building the vocabulary
max_length = 100  # same value as `seq_length` in the collate step; also sizes the model's pooling kernel
embed_dim = 200
num_layers = 2
num_heads = 4
ff_dim = 128
dropout = 0.1
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerEncoderCls(
    vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim, dropout, device
)
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00005)

In [None]:
# Train for `num_epochs` epochs; the best weights (lowest validation loss) are
# written to "<save_model>/<model_name>.pt" and reloaded into `model` at the end.
num_epochs = 50
save_model = "./model"
model_name = "model"
model, metrics = train(
    model,
    model_name,
    save_model,
    optimizer,
    criterion,
    train_dataloader,
    valid_dataloader,
    num_epochs,
    device,
)

In [None]:
# Plot the per-epoch accuracy and loss curves collected during training.
plot_result(
    num_epochs,
    metrics["train_accuracy"],
    metrics["valid_accuracy"],
    metrics["train_loss"],
    metrics["valid_loss"],
)