# Install required packages

In [1]:
%%capture
!pip install pytorch-lightning
!pip install torchmetrics
!pip install transformers
!pip install datasets

# Import required packages

In [10]:
import os
import zipfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.request import urlretrieve

import pandas as pd
from tqdm import tqdm

import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torchmetrics
from datasets import load_dataset
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding)

# For repoducibility
pl.utilities.seed.seed_everything(seed=2401, workers=True)

Global seed set to 2401


2401

# Define dataset, dataloader class and utility functions

In [4]:
class TqdmUpTo(tqdm):
    """From https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py"""

    def update_to(self, blocks=1, bsize=1, tsize=None):
        """
        Parameters
        ----------
        blocks: int, optional
            Number of blocks transferred so far [default: 1].
        bsize: int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize: int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            self.total = tsize  # pylint: disable=attribute-defined-outside-init
        self.update(blocks * bsize - self.n)  # will also set self.n = b * bsize


def download_url(url, filename, directory='.'):
    """Download a file from url to filename, with a progress bar."""
    if not os.path.exists(directory):
        os.makedirs(directory)
    path = os.path.join(directory, filename)

    with TqdmUpTo(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t:
        urlretrieve(url, path, reporthook=t.update_to, data=None)  # nosec
    return  path

def _load_data_from(data_dir: Union[str, Path]):
    fnames = ['sentiments.txt', 'sents.txt', 'topics.txt']
    sentiments = []
    sents = []
    topics = []
    for name in fnames:
        with open(f"{data_dir}/{name}", 'r') as f:
            if name == "sentiments.txt":
                sentiments = [int(line.strip()) for line in f.readlines()]
            elif name == "sents.txt":
                sents = [line.strip() for line in f.readlines()]        
            else:
                topics = [int(line.strip()) for line in f.readlines()]
    return sents, sentiments, topics

def _save_to_csv(file_path: Union[str, Path], data):
    sents, sentiments, topics = data
    df = pd.DataFrame({
        "sents": sents,
        "labels": sentiments,
        "topics": topics
    })
    df.to_csv(file_path, index=False)
    return file_path

In [5]:
DS_URL = "https://drive.google.com/uc?export=download&id=1zg7cbRF2nFuJ2Q-AB63xlKuwEX3dTBsx"

class UIT_VSFC(pl.LightningDataModule):
    """
    The Twitter dataset is ndwritten character digits derived from the NIST Special Database 19
    """
    def __init__(self, tokenizer, opts: Dict[str, Any]):
        super().__init__()
        self.tokenizer = tokenizer
        self.batch_size = opts['batch_size']
        self.num_workers = opts['num_workers']
        self.on_gpu = opts['on_gpu']
        self.data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        self.dataset = None
        self.mapping = {"negative": 0, "neutral": 1, "positive": 2}
        self.inverse_mapping = {v: k for k, v in enumerate(self.mapping)}
        
    def prepare_data(self, *args, **kwargs) -> None:
        data_dir = 'download/UIT_VSFC'
        data_path = 'download/UIT_VSFC.zip'
        if not os.path.exists(data_path):
            # Download the data
            data_path = download_url(DS_URL, "UIT_VSFC.zip", "download")
        if not os.path.exists(data_dir):
            # Unzip file
            with zipfile.ZipFile(data_path, 'r') as zip_ref:
                zip_ref.extractall(data_dir)
        
        # Load and save data to csv
        for path in ["train.csv", "dev.csv", "test.csv"]:
            data = _load_data_from("download/UIT_VSFC/train")
            if path == "train.csv":
                self.train_path = _save_to_csv(path, data)
            elif path == "dev.csv":
                self.dev_path = _save_to_csv(path, data)
            else:
                self.test_path = _save_to_csv(path, data)

        

    def setup(self, stage: str = None) -> None:
        def encode(sample):
            return self.tokenizer(sample['sents'], truncation=True)
        
        raw_datasets = load_dataset('csv', data_files={'train': self.train_path, 'dev': self.dev_path,
                                                 'test': self.test_path})

        self.dataset = raw_datasets.map(encode, batched=True)
        self.dataset = self.dataset.remove_columns(
            ['sents', 'topics']
        )
        self.dataset.set_format("torch") # Set the format of the datasets so they return PyTorch tensors instead of lists.


    def train_dataloader(self):
        return DataLoader(
            self.dataset['train'],
            shuffle=True,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.on_gpu,
            collate_fn=self.data_collator
        ) 

    def val_dataloader(self):
        return DataLoader(
            self.dataset['dev'],
            shuffle=False,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.on_gpu,
            collate_fn=self.data_collator
        ) 

    def test_dataloader(self):
        return DataLoader(
            self.dataset['test'],
            shuffle=False,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.on_gpu,
            collate_fn=self.data_collator
        ) 

    def __repr__(self):
        basic = f"Twitter Dataset\nNum classes: {len(self.mapping)}\nMapping: {self.mapping}\n"
        if self.dataset is None:
            return basic

        batch = next(iter(self.train_dataloader()))
        data = (
            f"Train/val/test sizes: {len(self.dataset['train'])}, {len(self.dataset['dev'])}, {len(self.dataset['test'])}\n"
            f"Input_ids shape: {batch['input_ids'].shape}"
        )
        return basic + data

# Implementation

> Like other neural networks, Transformer models can’t process raw text directly, so the first step of our pipeline is to convert the text inputs into numbers that the model can make sense of. To do this we use a tokenizer, which will be responsible for:
>
>* Splitting the input into words, subwords, or symbols (like punctuation) that are called tokens
>* Mapping each token to an integer
>* Adding additional inputs that may be useful to the model
>
>All this preprocessing needs to be done in exactly the same way as when the model was pretrained. To do this, we use the **AutoTokenizer** class.


In [6]:
class PhoBERT(pl.LightningModule):
    def __init__(self, lr):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=3)
        self.lr = lr

        # Define metrics
        self.val_acc = torchmetrics.Accuracy()
        self.val_f1 = torchmetrics.F1(num_classes=3, average='macro')
        self.test_acc = torchmetrics.Accuracy()
        self.test_f1 = torchmetrics.F1(num_classes=3, average='macro')
    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)
    
    def training_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss = outputs.loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss, logits = outputs.loss, outputs.logits
        sentiments = batch['labels']
        scores = F.softmax(logits, dim=-1)
        self.val_acc(scores, sentiments)
        self.val_f1(scores, sentiments)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_acc', self.val_acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True, logger=True)

    def test_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        logits = outputs.logits
        sentiments = batch['labels']
        scores = F.softmax(logits, dim=-1)
        self.test_acc(scores, sentiments)
        self.test_f1(scores, sentiments)
        self.log('test_acc', self.test_acc, on_step=False, on_epoch=True, logger=True)
        self.log('test_f1', self.test_f1, on_step=False, on_epoch=True, logger=True)

# Training

In [14]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
options = {
    "on_gpu": True,
    "batch_size": 8,
    "num_workers": 4
}
datamodule = UIT_VSFC(tokenizer, options)
print(datamodule)

# hyper-parameters
lr = 3e-5
max_epochs = 10
model = PhoBERT(lr)

checkpoint_callback = ModelCheckpoint(
    monitor='val_acc', # save the model with the best validation accuracy
    dirpath='checkpoints',
    mode='max',
)
trainer = pl.Trainer(gpus=1, max_epochs=max_epochs, callbacks=[checkpoint_callback], deterministic=True)
# trainer = pl.Trainer(fast_dev_run=True) #Debug 
# trainer = pl.Trainer(overfit_batches=0.1, max_epochs=max_epochs) #Debug
trainer.fit(model, datamodule)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Twitter Dataset
Num classes: 3
Mapping: {'negative': 0, 'neutral': 1, 'positive': 2}



Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-d36cdb030230eb7f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-d36cdb030230eb7f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                             | Params
--------------------------------------------------------------
0 | model    | RobertaForSequenceClassification | 135 M 
1 | val_acc  | Accuracy                         | 0     
2 | val_f1   | F1                               | 0     
3 | test_acc | Accuracy                         | 0     
4 | test_f1  | F1                               | 0     
--------------------------------------------------------------
135 M     Trainable params
0         Non-trainable params
135 M     Total params
540.002   Total estimated model params size (MB)





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 2401




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [16]:
trainer.test()

Using custom data configuration default-d36cdb030230eb7f
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-d36cdb030230eb7f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-d36cdb030230eb7f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-fa403c440ce3f3fe.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-d36cdb030230eb7f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-be019c8cdfccbcab.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-d36cdb030230eb7f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-340093b5e395cf0e.arrow
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9940486550331116, 'test_f1': 0.9854863286018372}
--------------------------------------------------------------------------------


[{'test_acc': 0.9940486550331116, 'test_f1': 0.9854863286018372}]

# Discussion

# Lessons

Besides easy access to dataset, the huggingface dataset libray has the following features:

* Thrive on large datasets: Datasets naturally frees the user from RAM limitation. All dataset are memory-mapped using anefficient zero serialization cost backend (Apache Arrow)

* Smart caching: never wait for your data to process several times.

* Lightweight and fast with a transparent and pythonic API (multi-processing/caching/memory-mapping).

* Built-in interoperability with NumPy, pandas, PyTorch, Tensorflow 2, and JAX.
