<a href="https://colab.research.google.com/github/minhvn1433/Deep-learning-project/blob/main/phobert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###### **Set Up**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from gensim.utils import simple_preprocess
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

In [None]:
def seed_everything(seed_value):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed_value: Integer seed applied to NumPy's global generator, to
            PyTorch's CPU generator and, when CUDA is available, to the CUDA
            generators as well.
    """
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # Benchmark mode lets cuDNN time several algorithms and pick one per
        # run, which works against the deterministic flag set above, so it
        # must be switched off for reproducible results.
        torch.backends.cudnn.benchmark = False


seed_everything(86)

In [None]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

if not has_cuda:
    print("No GPU available, using the CPU instead.")
else:
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print("Device name:", torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla T4


###### **Load Data**

In [None]:
# Read the three splits. The validation split is merged into the training
# data, which is then divided into 5 stratified folds for cross-validation.
train_df = pd.read_excel("train.xlsx")
valid_df = pd.read_excel("valid.xlsx")
test_df = pd.read_excel("test.xlsx")

train_df = pd.concat([train_df, valid_df], ignore_index=True)

# Create the fold column up front with an integer dtype. Assigning through
# .loc into a column that does not exist yet makes pandas fill it with NaN
# first, which turns the fold ids into floats (0.0, 1.0, ...).
train_df["kfold"] = -1
skf = StratifiedKFold(n_splits=5)
for fold, (train_index, valid_index) in enumerate(
    skf.split(X=train_df, y=train_df.Emotion)
):
    # ignore_index=True above gives a 0..n-1 index, so the positional indices
    # returned by split() are also valid labels for .loc.
    train_df.loc[valid_index, "kfold"] = fold

# Every row must have landed in exactly one validation fold.
assert (train_df["kfold"] >= 0).all(), "some rows were not assigned a fold"

train_df.sample(5)

Unnamed: 0,Emotion,Sentence,kfold
3706,Enjoyment,dễ thương ghê ! làm mình nhớ các cậu bạn cùng ...,2.0
3913,Enjoyment,đáng yêuc vãi anh ơi,3.0
234,Surprise,bất ngờ chưa =))),0.0
658,Surprise,cha con y chang nhau 😂 1 khuôn đúc ra,0.0
821,Enjoyment,lợi ích của việc giữ vững quan điểm 😄,0.0


###### **DataLoader**

In [None]:
# Load the tokenizer that ships with the "vinai/phobert-base" checkpoint.
# use_fast=False asks for the non-fast tokenizer implementation. This single
# notebook-level instance is shared by the datasets and by infer().
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [None]:
class SentimentDataset(Dataset):
    """Dataset that turns data frame rows into tokenized model inputs.

    Each item is built from one row of ``df``, reading its ``"Sentence"`` and
    ``"Emotion"`` columns.
    """

    def __init__(self, df, tokenizer, max_length):
        """Keep the data frame, the tokenizer and the fixed sequence length."""
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the data frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Build the model inputs for the row at position ``index``.

        Returns:
            dict with the preprocessed ``"sentence"``, the flattened
            ``"input_ids"`` and ``"attention_mask"`` tensors, and the integer
            label wrapped in a ``"targets"`` tensor.
        """
        text, class_id = self.get_input_data(self.df.iloc[index])

        # Pad/truncate every sentence to exactly max_length tokens.
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors="pt",
        )

        item = {"sentence": text}
        for key in ("input_ids", "attention_mask"):
            # flatten() yields 1-D tensors (presumably dropping the leading
            # batch axis added by return_tensors="pt" -- confirm).
            item[key] = encoded[key].flatten()
        item["targets"] = torch.tensor(class_id)
        return item

    def labelencoder(self, label):
        """Return the class id of an emotion name; any unlisted value maps to 6."""
        known_emotions = (
            "Enjoyment",
            "Disgust",
            "Sadness",
            "Anger",
            "Surprise",
            "Fear",
        )
        for class_id, name in enumerate(known_emotions):
            if label == name:
                return class_id
        return 6

    def get_input_data(self, row):
        """Return ``(sentence, label)`` for one row.

        The sentence is passed through ``simple_preprocess`` and its tokens are
        re-joined with single spaces; the emotion name is converted to its
        class id with ``labelencoder``.
        """
        tokens = simple_preprocess(row["Sentence"])
        return " ".join(tokens), self.labelencoder(row["Emotion"])

In [None]:
def prepare_loaders(df, fold, max_length=120, batch_size=32, num_workers=2):
    """Build the training and validation loaders for one cross-validation fold.

    Args:
        df: Data frame with a ``kfold`` column holding each row's fold id.
        fold: Fold id held out for validation; rows of every other fold are
            used for training.
        max_length: Token length every sentence is padded/truncated to.
        batch_size: Number of samples per batch in both loaders.
        num_workers: Worker processes used by each loader.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = SentimentDataset(df_train, tokenizer, max_length=max_length)
    valid_dataset = SentimentDataset(df_valid, tokenizer, max_length=max_length)

    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
    # Validation only aggregates metrics over the whole split, so shuffling
    # adds nothing; keep a fixed order.
    valid_loader = DataLoader(
        valid_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    return train_loader, valid_loader

###### **Train & Eval Function**

In [None]:
def train(model, criterion, optimizer, scheduler, loader):
    """Run one training epoch and print the mean batch loss and the accuracy.

    The learning-rate scheduler is stepped once per batch, right after the
    optimizer. Batches are moved to the notebook-level ``device``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler stepped after every batch.
        loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` entries.
    """
    model.train()
    loss_sum = 0
    num_correct = 0

    for batch in loader:
        ids, mask, labels = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "targets")
        )

        optimizer.zero_grad()
        logits = model(ids, mask)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        loss_sum += batch_loss.item()
        num_correct += (logits.argmax(1) == labels).sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    mean_loss = loss_sum / len(loader)
    accuracy = num_correct / len(loader.dataset)
    print(f"loss: {mean_loss:.4f}   accuracy: {accuracy:.4f}   ", end="")

In [None]:
@torch.no_grad()
def eval(model, criterion, loader):
    """Evaluate the model on a loader, print the metrics and return the accuracy.

    NOTE: this definition shadows Python's built-in ``eval`` inside the
    notebook. Batches are moved to the notebook-level ``device``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(outputs, targets)``.
        loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` entries.

    Returns:
        Fraction of correctly classified samples in ``loader.dataset``.
    """
    model.eval()
    loss_sum = 0
    num_correct = 0

    for batch in loader:
        ids, mask, labels = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "targets")
        )

        logits = model(ids, mask)
        loss_sum += criterion(logits, labels).item()
        num_correct += (logits.argmax(1) == labels).sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    mean_loss = loss_sum / len(loader)
    accuracy = num_correct / len(loader.dataset)
    print(f"val_loss: {mean_loss:.4f}   val_accuracy: {accuracy:.4f}")

    return accuracy

###### **Model**

In [None]:
class SentimentClassifier(nn.Module):
    """PhoBERT encoder followed by dropout and a linear classification layer."""

    def __init__(self, num_classes):
        """Load the pretrained encoder and create the classification head.

        Args:
            num_classes: Number of output classes of the linear layer.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class scores for a batch of tokenized sentences.

        Only the second element of the encoder's tuple output is used
        (presumably the pooled sentence representation -- confirm against the
        encoder's documentation).
        """
        _sequence_output, pooled = self.bert(
            input_ids=input_ids, attention_mask=attention_mask, return_dict=False
        )
        return self.fc(self.dropout(pooled))

###### **Training**

In [None]:
num_epochs = 6
best_accs = []  # best validation accuracy reached in each fold

for fold in range(skf.n_splits):
    print(f"Fold {fold + 1}")
    train_loader, valid_loader = prepare_loaders(train_df, fold)

    # Every fold starts from a fresh model, optimizer and learning-rate schedule.
    model = SentimentClassifier(num_classes=7).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=5e-5)
    # The scheduler is stepped once per batch, so it spans batches * epochs.
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    checkpoint_path = f"phobert_fold_{fold + 1}.pth"
    best_acc = 0
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}   ", end="")

        train(model, criterion, optimizer, scheduler, train_loader)
        val_acc = eval(model, criterion, valid_loader)

        # Keep only the weights of the best epoch seen so far in this fold.
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)
    best_accs.append(best_acc)

avg_acc = torch.tensor(best_accs).mean().item()
print(f"Average accuracy: {avg_acc:.4f}")

Fold 1


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch 1/6   loss: 1.5743   accuracy: 0.4036   val_loss: 1.2432   val_accuracy: 0.5525
Epoch 2/6   loss: 1.1379   accuracy: 0.5913   val_loss: 1.1278   val_accuracy: 0.5886
Epoch 3/6   loss: 0.8616   accuracy: 0.6960   val_loss: 1.1108   val_accuracy: 0.5974
Epoch 4/6   loss: 0.6238   accuracy: 0.7836   val_loss: 1.1086   val_accuracy: 0.6271
Epoch 5/6   loss: 0.4400   accuracy: 0.8616   val_loss: 1.1469   val_accuracy: 0.6295
Epoch 6/6   loss: 0.3294   accuracy: 0.9011   val_loss: 1.1854   val_accuracy: 0.6287
Fold 2
Epoch 1/6   loss: 1.4904   accuracy: 0.4391   val_loss: 1.1810   val_accuracy: 0.5750
Epoch 2/6   loss: 1.0346   accuracy: 0.6282   val_loss: 1.1001   val_accuracy: 0.6030
Epoch 3/6   loss: 0.7270   accuracy: 0.7465   val_loss: 1.1393   val_accuracy: 0.6014
Epoch 4/6   loss: 0.5071   accuracy: 0.8324   val_loss: 1.1989   val_accuracy: 0.6030
Epoch 5/6   loss: 0.3490   accuracy: 0.8939   val_loss: 1.2909   val_accuracy: 0.6038
Epoch 6/6   loss: 0.2517   accuracy: 0.9310   v

###### **Testing**

In [None]:
models = []


def test(loader):
    """Evaluate the ensemble of per-fold checkpoints and print a classification report.

    The checkpoint ``phobert_fold_{k}.pth`` of every fold is loaded into the
    notebook-level ``models`` list (which ``infer`` reads as well). For each
    batch the outputs of all models are averaged and the highest-scoring class
    is taken as the prediction.

    Args:
        loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` entries.

    Raises:
        RuntimeError: If ``loader`` yields no batches.
    """
    # Rebuild the ensemble in place: clearing first keeps repeated calls from
    # piling up duplicate copies of every fold model in memory.
    models.clear()
    for fold in range(skf.n_splits):
        model = SentimentClassifier(num_classes=7).to(device)
        # map_location lets checkpoints saved on a GPU load on a CPU-only host.
        state_dict = torch.load(f"phobert_fold_{fold + 1}.pth", map_location=device)
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)

    predicts = []
    true_labels = []

    with torch.no_grad():
        for data in loader:
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)

            total_outs = torch.stack(
                [model(input_ids, attention_mask) for model in models]
            )
            _, pred = torch.max(total_outs.mean(0), 1)

            # Collect whole batches on the CPU instead of one 0-d tensor per
            # sample; the targets are only needed for the report.
            predicts.append(pred.cpu())
            true_labels.append(data["targets"].cpu())

    if not predicts:
        raise RuntimeError("test() received a loader that produced no batches.")

    predicts = torch.cat(predicts).numpy()
    true_labels = torch.cat(true_labels).numpy()
    print(classification_report(true_labels, predicts, digits=4))

In [None]:
# Tokenize the test set with the same max_length (120) that prepare_loaders
# uses for training/validation, so long sentences are truncated the same way
# at test time. Evaluation does not need shuffling.
test_dataset = SentimentDataset(test_df, tokenizer, max_length=120)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)
test(test_loader)

              precision    recall  f1-score   support

           0     0.6910    0.6373    0.6631       193
           1     0.5899    0.6212    0.6052       132
           2     0.6667    0.6897    0.6780       116
           3     0.5926    0.4000    0.4776        40
           4     0.6571    0.6216    0.6389        37
           5     0.7000    0.7609    0.7292        46
           6     0.5556    0.6202    0.5861       129

    accuracy                         0.6335       693
   macro avg     0.6361    0.6215    0.6254       693
weighted avg     0.6356    0.6335    0.6326       693



###### **Inference**

In [None]:
emotions = ["Enjoyment", "Disgust", "Sadness", "Anger", "Surprise", "Fear", "Other"]


def infer(sentence, max_length=120):
    """Print the emotion the loaded ensemble predicts for one sentence.

    The sentence is preprocessed and tokenized like the dataset items, the
    outputs of every model in the notebook-level ``models`` list are averaged,
    and the name of the highest-scoring class is printed.

    Args:
        sentence: Raw input text.
        max_length: Token length the sentence is padded/truncated to.

    Raises:
        RuntimeError: If ``models`` is empty (``test()`` fills it).
    """
    if not models:
        raise RuntimeError(
            "No models loaded: run test() first to load the fold checkpoints."
        )

    sentence = " ".join(simple_preprocess(sentence))

    encoding = tokenizer.encode_plus(
        sentence,
        truncation=True,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors="pt",
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Inference only: skip building the autograd graph for every forward pass.
    with torch.no_grad():
        total_outs = torch.stack(
            [model(input_ids, attention_mask) for model in models]
        )
    _, pred = torch.max(total_outs.mean(0), 1)

    # pred holds a single class id; convert it to a plain int for list indexing.
    print(emotions[pred.item()])

In [None]:
# One hand-written example sentence per line; each prediction is printed in order.
sample_sentences = [
    "Uồi lại được học bổng thích thế!",
    "Mấy điều ông ấy nói làm tao ghê tởm.",
    "Mình lại tạch môn rồi buồn ghê :(",
    "Đừng có mà động vào người tao. Tao đang hơi bực.",
    "Không thể tin được, sao có thể được 5 điểm tất cả các môn.",
    "Lúc thầy ấy giận cả lớp trông thật đáng sợ.",
]
for sample in sample_sentences:
    infer(sample)

Surprise
Disgust
Sadness
Anger
Surprise
Fear
