In [1]:
import torch
import torch.nn as nn

from tqdm import tqdm
from sklearn.model_selection import train_test_split

from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEED = 28091997
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
data = pd.read_csv("./data/processed/train.csv")
data

Unnamed: 0,question1,question2,label
0,Trong tình huống nào thì người bệnh u máu gan ...,Người bệnh u máu gan được cấy ghép gan khi nào?,1
1,Trong tình huống nào thì người bệnh u máu gan ...,Khi nào người bệnh u máu gan sẽ được phẫu thuật?,1
2,Triệu chứng lâm sàng của bệnh u máu gan là gì ...,Các triệu chứng của bệnh u máu gan?,1
3,Triệu chứng lâm sàng của bệnh u máu gan là gì ...,Dấu hiệu của u máu gan?,1
4,Rượu vang trắng có liên quan tới ung thư da ác...,Rượu vang trắng có liên quan tới ung thư da ác...,1
...,...,...,...
128,Bổ sung vitamin E sau sinh như thế nào là hiệu...,Dấu hiệu của u máu gan?,0
129,Khi nào người bệnh u máu gan sẽ được phẫu thuật?,Mẹ bầu cần tránh ăn những loại đồ ăn nào?,0
130,Ung thư vòm họng giai đoạn cuối biểu hiện như ...,Thận yếu ở phụ nữ biểu hiện như thế nào?,0
131,Những loại đồ ăn nào bà bầu không nên ăn?,Lời khuyên từ chuyên gia khi tắm cho trẻ?,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   question1  133 non-null    object
 1   question2  133 non-null    object
 2   label      133 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 3.2+ KB


In [5]:
PHOBERT_VERSION = "vinai/phobert-base"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_VERSION)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [7]:
train_df, val_df = train_test_split(data, test_size = 0.2)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [8]:
MAX_LEN = 40

In [9]:
class PhoBertDataset:
    def __init__(self, first_questions, second_questions, targets, tokenizer):
        self.first_questions = first_questions
        self.second_questions = second_questions
        self.targets = targets
        self.tokenizer = tokenizer
        self.length = len(first_questions)

    def __len__(self):
        return self.length

    def __getitem__(self, item):
        first_questions = str(self.first_questions[item])
        second_questions = str(self.second_questions[item])

        # remove extra white spaces from questions
        first_questions = " ".join(first_questions.split())
        second_questions = " ".join(second_questions.split())

        inputs = self.tokenizer.encode_plus(
            first_questions,
            second_questions,
            add_special_tokens=True,
            padding='max_length',
            max_length=2*MAX_LEN+3, # max length of 2 questions and 3 spectial tokens
            truncation=True
        )

        # return targets 0, when using data set in testing and targets are none
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(int(self.targets[item]), dtype=torch.long) if self.targets is not None else 0
        }

In [10]:
# creates dataset and returns dataloader of it
def get_data_loader(df, targets, batch_size, shuffle, tokenizer):
    dataset = PhoBertDataset(
        first_questions=df["question1"].values,
        second_questions=df["question2"].values,
        targets=targets,
        tokenizer=tokenizer
    )

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle=shuffle
    )

    return data_loader

In [11]:
BATCH_SIZE = 1

In [12]:
# create data loaders of training and validation data
train_data_loader = get_data_loader(
    df=train_df,
    targets=train_df["label"].values,
    batch_size=BATCH_SIZE,
    shuffle=True,
    tokenizer=tokenizer
)

val_data_loader = get_data_loader(
    df=val_df,
    targets=val_df["label"].values,
    batch_size=4*BATCH_SIZE,
    shuffle=True,
    tokenizer=tokenizer
)


# Model

In [13]:
class PhoBertModel(nn.Module):
    def __init__(self, phobert_path):
        super(PhoBertModel, self).__init__()
        self.phobert_path = phobert_path
        self.phobert = AutoModel.from_pretrained(self.phobert_path)
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, pooled = self.phobert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        #add dropout to prevent overfitting
        pooled = self.dropout(pooled)
        return self.out(pooled)

In [14]:
model = PhoBertModel(PHOBERT_VERSION).to(device)

# Training

In [15]:
# Loss function = binary cross entropy loss
# using sigmoid to put probabilities in [0,1] interval
def loss_fn(outputs, targets):
    outputs = torch.squeeze(outputs)
    return nn.BCELoss()(nn.Sigmoid()(outputs), targets)

In [16]:
def calculate_perplexity(data_loader, model, device):
    model.eval()
    # tells Pytorch not to store values of intermediate computations for backward pass because we not gonna need gradients
    with torch.no_grad():
        total_loss = 0
        for batch in data_loader:
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            total_loss += loss_fn(outputs, targets).item()

    model.train()

    return np.exp(total_loss/len(data_loader)) 

In [17]:
def train_loop(epochs, train_data_loader, val_data_loader, model, optimizer, device, scheduler=None):
    it = 1
    total_loss = 0
    curr_perplexity = None
    perplexity = None
    
    model.train()
    for epoch in range(epochs):
        print('Epoch: ', epoch + 1)
        for batch in train_data_loader:
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            optimizer.zero_grad()
            
            # do forward pass, will save intermediate computations of the graph for later backprop use.
            outputs = model(ids, mask=mask, token_type_ids=token_type_ids)
            
            loss = loss_fn(outputs, targets)
            total_loss += loss.item()
            
            # running backprop.
            loss.backward()
            
            # doing gradient descent step.
            optimizer.step()
            
            # we are logging current loss/perplexity in every 100 iteration
            if it % 5 == 0:
                
                # computing validation set perplexity in every 500 iteration.
                if it % 20 == 0:
                    curr_perplexity = calculate_perplexity(val_data_loader, model, device)
                    
                    if scheduler is not None:
                        scheduler.step()

                    # making checkpoint of best model weights.
                    if not perplexity or curr_perplexity < perplexity:
                        torch.save(model.state_dict(), 'saved_model')
                        perplexity = curr_perplexity

                print('| Iter', it, '| Avg Train Loss', total_loss / 100, '| Dev Perplexity', curr_perplexity)
                total_loss = 0

            it += 1
   

In [18]:
def run(model, train_df, device, train_data_loader, val_data_loader):
    EPOCHS = 1000
    
    lr = 3e-5
    num_training_steps = int(len(train_data_loader) * EPOCHS)
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )
    
    
    train_loop(EPOCHS, train_data_loader, val_data_loader,  model, optimizer, device, scheduler)

In [19]:
run(model, train_df, device, train_data_loader, val_data_loader)

ev Perplexity 2.020630507526752
| Iter 104860 | Avg Train Loss 0.035775671601295474 | Dev Perplexity 2.0211818834941426
| Iter 104865 | Avg Train Loss 0.035271175503730774 | Dev Perplexity 2.0211818834941426
| Iter 104870 | Avg Train Loss 0.035245587229728696 | Dev Perplexity 2.0211818834941426
| Iter 104875 | Avg Train Loss 0.03442310273647308 | Dev Perplexity 2.0211818834941426
| Iter 104880 | Avg Train Loss 0.03566416323184967 | Dev Perplexity 2.008531035512192
| Iter 104885 | Avg Train Loss 0.03363487601280212 | Dev Perplexity 2.008531035512192
| Iter 104890 | Avg Train Loss 0.0348351639509201 | Dev Perplexity 2.008531035512192
| Iter 104895 | Avg Train Loss 0.03455220997333527 | Dev Perplexity 2.008531035512192
| Iter 104900 | Avg Train Loss 0.03473040997982025 | Dev Perplexity 2.0063283766815374
| Iter 104905 | Avg Train Loss 0.03393917739391327 | Dev Perplexity 2.0063283766815374
| Iter 104910 | Avg Train Loss 0.03392826735973358 | Dev Perplexity 2.0063283766815374
| Iter 104915