### 1. Import package

#### Connect Google Drive & Install package

In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import os
# base_dir = "/content/drive/My Drive/Colab Notebooks/Bert_sa"
base_dir = "."

os.chdir(base_dir)

In [2]:
!pip install vncorenlp

Collecting vncorenlp
  Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: vncorenlp
  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645932 sha256=0e556f6158898d8bd93c4ef97d70ce06362ff5aa644fc2358594dbbb54980c3f
  Stored in directory: /root/.cache/pip/wheels/5d/d9/b3/41f6c6b1ab758561fd4aab55dc0480b9d7a131c6aaa573a3fa
Successfully built vncorenlp
Installing collected packages: vncorenlp
Successfully installed vncorenlp-1.0.3


In [8]:
# Download VnCoreNLP-1.1.1.jar & its word segmentation component (i.e. RDRSegmenter)
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

--2024-04-11 02:32:19--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2024-04-11 02:32:21 (45.0 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2024-04-11 02:32:21--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Saving to: ‘vi-vo

#### Import package

In [3]:
import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel, AutoTokenizer, AdamW

2024-04-11 09:45:06.359030: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-11 09:45:06.634365: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP(base_dir + "/vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

In [23]:
hidden_size = 768
num_classes = 2
learning_rate = 1e-5
batch_size = 32
num_epochs = 3
dropout = 0.15
MAX_LEN = 256

In [6]:
phobert = AutoModel.from_pretrained('vinai/phobert-base-v2')
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2')

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 2. Build model

In [9]:
class Attention(nn.Module):
    def __init__(self, hidden_size, dropout=0.5) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.weight = nn.Parameter(torch.Tensor(hidden_size, 1))
        self.dropout = nn.Dropout(dropout)
        nn.init.xavier_uniform_(self.weight)

    def forward(self, inputs):
        print(inputs.size(), self.weight.size())
        attention_scores = torch.matmul(inputs, self.weight).squeeze(-1)
        attention_weights = torch.softmax(attention_scores, dim=-1)
        dropout_output = self.dropout(attention_weights)
        weighted_sum = torch.matmul(inputs.transpose(1, 2), dropout_output.unsqueeze(-1)).squeeze(-1)
        return weighted_sum

In [10]:
class PhoBertBiLSTMAttentionModel(nn.Module):
    def __init__(self, hidden_size, num_classes, dropout=0.1) -> None:
        super().__init__()
        self.phobert = AutoModel.from_pretrained('vinai/phobert-base-v2')
        self.bilstm = nn.LSTM(
                                bidirectional=True,
                                input_size=hidden_size,
                                hidden_size=hidden_size//2,
                                batch_first=True
                            )
        self.attention = Attention(hidden_size, dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        phobert_output = self.phobert(input_ids=input_ids, attention_mask=attention_mask)[0]
        lstm_output, _ = self.bilstm(phobert_output)
        attention_output = self.attention(lstm_output)
        dropout_output = self.dropout(attention_output)
        logits = self.fc(dropout_output)
        return logits

### 3. Load Dataset

In [11]:
train_path = base_dir + '/data/train.crash'
test_path = base_dir + '/data/test.crash'

In [12]:
def load_data(path, has_label=False):
    ids, sentences, labels = [], [], []
    with open(path, 'r') as f_r:
        data = f_r.read().strip()

        if has_label:
            data = re.findall('train_[\s\S]+?\"\n[01]\n\n', data)
        else:
            data = re.findall('test_[\s\S]+?\"\n[01]\n\n', data)
        for sample in data:
            splits = sample.strip().split('\n')

            id = splits[0]
            label = int(splits[-1])
            text = ' '.join(splits[1:-1])[1:-1]
            text = rdrsegmenter.tokenize(text)
            text = ' '.join([' '.join(x) for x in text])

            ids.append(id)
            sentences.append(text)
            if has_label:
                labels.append(label)
    if has_label:
        return ids, sentences, labels

    return ids, sentences


In [13]:
train_id, train_sentences, train_labels = load_data(train_path, has_label=True)
# test_id, test_sentences = load_data(test_path)

In [14]:
def _tokenizer(dataset, tokenizer, MAX_LEN):
    encoding = tokenizer(dataset, truncation=True, padding=True, return_tensors='pt')

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    def _pad_sequences(data):
        return pad_sequences(data, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

    ids = torch.tensor(_pad_sequences(input_ids))
    masks = torch.tensor(_pad_sequences(attention_mask))

    return ids, masks


In [15]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_sentences, train_labels, test_size=0.15, random_state=42)

train_ids, train_mask = _tokenizer(train_sentences, tokenizer, MAX_LEN)
val_ids, val_mask = _tokenizer(val_sentences, tokenizer, MAX_LEN)

train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
train_dataset = TensorDataset(train_ids, train_mask, train_labels)
val_dataset = TensorDataset(val_ids, val_mask, val_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### 4. Training

In [19]:
def train_epoch(data_loader, model, optimizer, device):
    running_loss = 0
    total_preds = 0
    correct_preds = 0
    pbar = tqdm(data_loader)
    for batch in pbar:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss(logits, labels)
        pred = torch.argmax(logits, dim=1)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        correct_preds += (pred == labels).sum().item()
        total_preds += len(labels)
        pbar.set_description(f"acc {correct_preds / total_preds:.4f}")

    return running_loss


In [20]:
def test_epoch(data_loader, model, optimizer, device):
  val_loss = 0.0
  correct_preds = 0
  total_preds = 0
  with torch.no_grad():
    pbar = tqdm(data_loader)
    for batch in pbar:
      input_ids, attention_mask, labels = batch
      input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
      optimizer.zero_grad()
      logits = model(input_ids, attention_mask)
      loss = nn.CrossEntropyLoss(logits, labels)
      val_loss += loss.item()
      _, predicted = torch.max(logits, 1)
      correct_preds += (predicted == labels).sum().item()
      total_preds += labels.size(0)
      pbar.set_description(f"test acc {correct_preds / total_preds:.4f}")



  return val_loss, correct_preds, total_preds

In [21]:
def predict(sequences, MAX_LEN, model, device):
  seq_tok = tokenizer(sequences)
  train_ids = torch.tensor(pad_sequences(seq_tok['input_ids'], maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post"))
  train_mask = torch.tensor(pad_sequences(seq_tok['attention_mask'], maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post"))
  with torch.no_grad():
    output = model(train_ids.to(device), train_mask.to(device))
    _, predicted = torch.max(output, 1)

  return predicted

#### Define model

In [22]:
model = PhoBertBiLSTMAttentionModel(hidden_size, num_classes, dropout).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
for epoch in range(num_epochs):
    model.train()
    running_loss = train_epoch(train_loader, model, optimizer, device)

    model.eval()
    val_loss, correct_preds, total_preds = test_epoch(val_loader, model, optimizer, device)

    avg_train_loss = running_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct_preds / total_preds * 100
    print(f"Epoch {epoch + 1}/{num_epochs}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

  0%|          | 0/427 [00:25<?, ?it/s]

torch.Size([32, 256, 768]) torch.Size([768, 1])





RuntimeError: Boolean value of Tensor with more than one value is ambiguous

### 5. Save & load model

In [None]:
output_path = 'model.pth'
torch.save(model.state_dict(), base_dir + output_path)

In [None]:
hidden_size = 768
num_classes = 2
learning_rate = 1e-5
batch_size = 32
num_epochs = 5
dropout = 0.1
MAX_LEN = 256

In [None]:
model = PhoBertBiLSTMAttentionModel(hidden_size, num_classes, dropout).to(device)
model_path = base_dir + '/phoBertmodel.h5'
model.load_state_dict(torch.load(model_path))
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PhoBertBiLSTMAttentionModel(
  (phobert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

### 6. Testing

In [None]:
predicted = predict(["Sản phẩm này tệ quá", "áo có mùi hôi"], MAX_LEN, model, device)
print(predicted)

torch.Size([2, 256, 768]) torch.Size([768, 1])
tensor([1, 1], device='cuda:0')
