<a href="https://colab.research.google.com/github/minhvn1433/Deep-learning-project/blob/main/gru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###### **Set Up**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from gensim.utils import simple_preprocess
from transformers import get_linear_schedule_with_warmup

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

In [None]:
def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True


seed_everything(86)

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


###### **Load Data**

In [None]:
train_df = pd.read_excel("train.xlsx")
valid_df = pd.read_excel("valid.xlsx")
test_df = pd.read_excel("test.xlsx")

train_df = pd.concat([train_df, valid_df], ignore_index=True)
skf = StratifiedKFold(n_splits=5)
for fold, (train_index, valid_index) in enumerate(
    skf.split(X=train_df, y=train_df.Emotion)
):
    train_df.loc[valid_index, "kfold"] = fold

train_df.sample(5)

Unnamed: 0,Emotion,Sentence,kfold
3706,Enjoyment,dễ thương ghê ! làm mình nhớ các cậu bạn cùng ...,2.0
3913,Enjoyment,đáng yêuc vãi anh ơi,3.0
234,Surprise,bất ngờ chưa =))),0.0
658,Surprise,cha con y chang nhau 😂 1 khuôn đúc ra,0.0
821,Enjoyment,lợi ích của việc giữ vững quan điểm 😄,0.0


###### **DataLoader**

In [None]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df.Sentence)

In [None]:
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        sentence, label = self.get_input_data(row)

        sequence = self.tokenizer.texts_to_sequences([sentence])
        padded = pad_sequences(
            sequence, maxlen=self.max_length, padding="post", truncating="post"
        )

        return {
            "sentence": sentence,
            "inputs": torch.tensor(padded).flatten(),
            "targets": torch.tensor(label),
        }

    def labelencoder(self, label):
        if label == "Enjoyment":
            return 0
        elif label == "Disgust":
            return 1
        elif label == "Sadness":
            return 2
        elif label == "Anger":
            return 3
        elif label == "Surprise":
            return 4
        elif label == "Fear":
            return 5
        else:
            return 6

    def get_input_data(self, row):
        sentence = row["Sentence"]
        sentence = " ".join(simple_preprocess(sentence))
        label = self.labelencoder(row["Emotion"])

        return sentence, label

In [None]:
def prepare_loaders(df, fold, tokenizer):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = SentimentDataset(df_train, tokenizer, max_length=120)
    valid_dataset = SentimentDataset(df_valid, tokenizer, max_length=120)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=True, num_workers=2)

    return train_loader, valid_loader

###### **Train & Eval Function**

In [None]:
def train(model, criterion, optimizer, scheduler, loader):
    model.train()
    size = len(loader.dataset)
    num_batches = len(loader)
    train_loss, correct = 0, 0

    for data in loader:
        inputs = data["inputs"].to(device)
        targets = data["targets"].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        correct += (outputs.argmax(1) == targets).sum().item()

    train_loss /= num_batches
    correct /= size
    print(f"loss: {train_loss:.4f}   accuracy: {correct:.4f}   ", end="")

In [None]:
def eval(model, criterion, loader):
    model.eval()
    size = len(loader.dataset)
    num_batches = len(loader)
    val_loss, correct = 0, 0

    with torch.no_grad():
        for data in loader:
            inputs = data["inputs"].to(device)
            targets = data["targets"].to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            val_loss += loss.item()
            correct += (outputs.argmax(1) == targets).sum().item()

    val_loss /= num_batches
    correct /= size
    print(f"val_loss: {val_loss:.4f}   val_accuracy: {correct:.4f}")

    return correct

###### **Model**

In [None]:
class SentimentClassifier(nn.Module):
    def __init__(
        self, vocab_size, embedding_dim, max_length, gru_dim, dense_dim, num_classes
    ):
        super(SentimentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, gru_dim, bidirectional=True)
        self.fc1 = nn.Linear(gru_dim * 2, dense_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(dense_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, x = self.gru(x.permute(1, 0, 2))
        x = torch.cat((x[-2, :, :], x[-1, :, :]), dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

###### **Training**

In [None]:
num_epochs = 30
best_accs = []

for fold in range(skf.n_splits):
    print(f"Fold {fold + 1}")
    train_loader, valid_loader = prepare_loaders(train_df, fold, tokenizer)

    model = SentimentClassifier(
        vocab_size=5000,
        embedding_dim=64,
        max_length=120,
        gru_dim=45,
        dense_dim=30,
        num_classes=7,
    ).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=9.3e-4)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs
    )

    best_acc = 0
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}   ", end="")

        train(model, criterion, optimizer, scheduler, train_loader)
        val_acc = eval(model, criterion, valid_loader)

        if val_acc > best_acc:
            torch.save(model.state_dict(), f"gru_fold_{fold + 1}.pth")
            best_acc = val_acc
    best_accs.append(best_acc)

avg_acc = torch.tensor(best_accs).mean().item()
print(f"Average accuracy: {avg_acc:.4f}")

Fold 1
Epoch 1/30   loss: 1.7746   accuracy: 0.2807   val_loss: 1.7479   val_accuracy: 0.2831
Epoch 2/30   loss: 1.6822   accuracy: 0.3295   val_loss: 1.6570   val_accuracy: 0.3472
Epoch 3/30   loss: 1.5298   accuracy: 0.4057   val_loss: 1.5817   val_accuracy: 0.3865
Epoch 4/30   loss: 1.3863   accuracy: 0.4692   val_loss: 1.5418   val_accuracy: 0.4090
Epoch 5/30   loss: 1.2681   accuracy: 0.5266   val_loss: 1.5615   val_accuracy: 0.4194
Epoch 6/30   loss: 1.1597   accuracy: 0.5677   val_loss: 1.6063   val_accuracy: 0.4210
Epoch 7/30   loss: 1.0675   accuracy: 0.6132   val_loss: 1.6127   val_accuracy: 0.4362
Epoch 8/30   loss: 0.9732   accuracy: 0.6537   val_loss: 1.6406   val_accuracy: 0.4483
Epoch 9/30   loss: 0.8970   accuracy: 0.6828   val_loss: 1.6992   val_accuracy: 0.4523
Epoch 10/30   loss: 0.8182   accuracy: 0.7187   val_loss: 1.7512   val_accuracy: 0.4539
Epoch 11/30   loss: 0.7449   accuracy: 0.7417   val_loss: 1.8136   val_accuracy: 0.4507
Epoch 12/30   loss: 0.6772   accur

###### **Testing**

In [None]:
models = []


def test(loader):
    for fold in range(skf.n_splits):
        model = SentimentClassifier(
            vocab_size=5000,
            embedding_dim=64,
            max_length=120,
            gru_dim=45,
            dense_dim=30,
            num_classes=7,
        ).to(device)
        model.load_state_dict(torch.load(f"gru_fold_{fold + 1}.pth"))
        model.eval()
        models.append(model)

    predicts = []
    true_labels = []

    with torch.no_grad():
        for data in loader:
            inputs = data["inputs"].to(device)
            targets = data["targets"].to(device)

            total_outs = []
            for model in models:
                outputs = model(inputs)
                total_outs.append(outputs)

            total_outs = torch.stack(total_outs)
            _, pred = torch.max(total_outs.mean(0), 1)
            predicts.extend(pred)
            true_labels.extend(targets)

    predicts = torch.stack(predicts).cpu()
    true_labels = torch.stack(true_labels).cpu()
    print(classification_report(true_labels, predicts, digits=4))

In [None]:
test_dataset = SentimentDataset(test_df, tokenizer, max_length=120)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=2)
test(test_loader)

              precision    recall  f1-score   support

           0     0.6049    0.6425    0.6231       193
           1     0.4897    0.5379    0.5126       132
           2     0.5664    0.5517    0.5590       116
           3     0.6429    0.2250    0.3333        40
           4     0.3889    0.1892    0.2545        37
           5     0.7308    0.4130    0.5278        46
           6     0.3605    0.4806    0.4120       129

    accuracy                         0.5137       693
   macro avg     0.5406    0.4343    0.4603       693
weighted avg     0.5300    0.5137    0.5093       693



###### **Inference**

In [None]:
emotions = ["Enjoyment", "Disgust", "Sadness", "Anger", "Surprise", "Fear", "Other"]


def infer(sentence, max_length=120):
    sentence = " ".join(simple_preprocess(sentence))

    sequence = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(
        sequence, maxlen=max_length, padding="post", truncating="post"
    )

    inputs = torch.tensor(padded).to(device)

    total_outs = []
    for model in models:
        outputs = model(inputs)
        total_outs.append(outputs)

    total_outs = torch.stack(total_outs)
    _, pred = torch.max(total_outs.mean(0), 1)

    print(emotions[pred])

In [None]:
infer("Uồi lại được học bổng thích thế!")
infer("Mấy điều ông ấy nói làm tao ghê tởm.")
infer("Mình lại tạch môn rồi buồn ghê :(")
infer("Đừng có mà động vào người tao. Tao đang hơi bực.")
infer("Không thể tin được, sao có thể được 5 điểm tất cả các môn.")
infer("Lúc thầy ấy giận cả lớp trông thật đáng sợ.")

Enjoyment
Disgust
Sadness
Disgust
Disgust
Disgust
