In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm



In [2]:
phobert = AutoModel.from_pretrained("vinai/phobert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
data = pd.read_csv('/workspace/nlplab/nmquy/pytorch/full_train.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,RevId,UserId,Comment,image_urls,Rating
0,0,3839333,10106093.0,"Xôi dẻo, đồ ăn đậm vị. Hộp xôi được lót lá trô...",['https://images.foody.vn/res/g97/966781/s800/...,1.0
1,1,2824877,786914.0,Gọi ship 1 xuất cari gà bánh naan và 3 miếng g...,['https://images.foody.vn/res/g69/688413/s800/...,0.0
2,2,9816702,22467889.0,"Thời tiết lạnh như này, cả nhà rủ nhau đến leg...",['https://images.foody.vn/res/g72/715078/s800/...,1.0
3,3,2684585,1889449.0,Em có đọc review thấy mng bảo trà sữa nướng đề...,['https://images.foody.vn/res/g90/895545/s800/...,0.0
4,4,2737987,8839942.0,"Đồ ăn rất ngon, nhà hàng cũng rất đẹp, tất cả ...",['https://images.foody.vn/res/g4/30186/s800/fo...,1.0


In [5]:
len(data)

9073

In [6]:
data = data.dropna()

In [7]:
len(data)

9070

In [8]:
data_review = data['Comment'].tolist()
data_label = data['Rating'].astype(int).tolist()

In [9]:
data_review[0]

'Xôi dẻo, đồ ăn đậm vị. Hộp xôi được lót lá trông rất thích'

In [10]:
data_label[0]

1

In [11]:
len(data_review)

9070

In [12]:
len(data_label)

9070

In [13]:
len(tokenizer.get_vocab())

64001

In [14]:
combined_data = list(zip(data_review, data_label))

In [15]:
random.shuffle(combined_data)

In [16]:
train_data, label = zip(*combined_data)

In [17]:
import re

In [18]:
from underthesea import word_tokenize, text_normalize

In [19]:
def clean_data(sent):
    vietnamese_characters = "a-zA-Z0-9ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂẾưăạảấầẩẫậắằẳẵặẹẻẽềềểếỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳýỵỷỹ"
    sent = sent.lower()
    sent = sent.strip()
    sent = re.sub('[+]', ' ', sent)
    sent = re.sub(" \\[a-z]" , ' ', sent)
    sent = re.sub('[^' + vietnamese_characters + ']' , ' ', sent)
    sent = re.sub("([A-Za-z]+[0-9]+)|([0-9]+[A-Za-z]+)", " ",sent)
    sent = re.sub("[0-9]{3,}", " ", sent)
    sent = re.sub(r'(\D)\1{2,}', r'\1',sent)
    sent = sent.strip()

    sent_temp = []
    for x in sent.split(" "):
        if len(x) > 1:
            sent_temp.append(x)
    sent = " ".join(sent_temp)
    sent = text_normalize(sent)

    return sent




In [20]:
def preprocess(sentences, max_len=None):
    preprocess_sent = []

    for sent in sentences:
        text = word_tokenize(sent, format="text")
        new_sent = text
        if max_len is not None:
            words = text.split(" ")
            if len(words) > max_len:
                words = words[0:max_len]
            new_sent = " ".join(words)
        preprocess_sent.append(new_sent)

    return preprocess_sent


In [21]:
def encode(preprocess_sent, max_len=128):
    encode_sents = []
    masks = []

    for sent in preprocess_sent:
        sent_info = tokenizer.encode_plus(
            sent,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='np'
        )
        encode_sent = sent_info['input_ids'].flatten()
        mask = sent_info['attention_mask'].flatten()

        encode_sents.append(encode_sent)
        masks.append(mask)

    return encode_sents, masks

In [22]:
data_train = train_data[0:7000]
label_train = label[0:7000]
data_valid = train_data[7001:]
label_valid = label[7001:]

In [23]:
print(data_train[0])
print(label[0])

Mình vừa đặt ship đồ về nhà ăn, mặc dù thời gian chờ now ship lâu vãi chưởng ))) nhưng chất lượng đồ ăn thì toẹt vờiii, kimchi cũng ngon nứaaaa, sẽ ủng hộ dại
0


In [24]:
train_data_clean = [clean_data(sent) for sent in data_train]
data_valid = [clean_data(sent) for sent in data_valid]

In [25]:
train_data_preprocess = preprocess(train_data_clean)
valid_data_preprocess = preprocess(data_valid)

In [26]:
train_ids, train_mask = encode(train_data_preprocess, max_len=256)
valid_ids, valid_mask = encode(valid_data_preprocess, max_len=256)


In [47]:
batch_size = 8

In [48]:
train_input = torch.tensor(train_ids)
valid_input = torch.tensor(valid_ids)

train_label = torch.tensor(label_train)
valid_label = torch.tensor(label_valid)

train_mask = torch.tensor(train_mask)
valid_mask = torch.tensor(valid_mask)

  train_mask = torch.tensor(train_mask)
  valid_mask = torch.tensor(valid_mask)


In [49]:
train_data = TensorDataset(train_input, train_mask, train_label)
train_sample = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sample, batch_size=batch_size)


In [50]:
valid_data = TensorDataset(valid_input, valid_mask, valid_label)
valid_sample = RandomSampler(valid_data)
valid_loader = DataLoader(valid_data, sampler=valid_sample, batch_size=batch_size)


In [51]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [52]:
class Model_Sentiment(nn.Module):
    def __init__(self, output_size=2, dropout=0.1):
        super(Model_Sentiment, self).__init__()
        self.output_size = output_size
        self.dropout = dropout
        self.bert =  AutoModel.from_pretrained("vinai/phobert-base", num_labels=2, output_hidden_states=True)
        self.fc2 = nn.Linear(4*768, self.output_size, bias=True)
        nn.init.normal_(self.fc2.weight, std=0.02)
        nn.init.normal_(self.fc2.bias, 0)
    
    def forward(self, inputs, attention_mask):
        outputs = self.bert(input_ids=inputs, attention_mask=attention_mask)
        x = torch.cat((outputs[2][-1][:,0, ...],outputs[2][-2][:,0, ...], outputs[2][-3][:,0, ...], outputs[2][-4][:,0, ...]),-1)
        x = self.fc2(x)
        out = x
        return out
        


In [90]:
model_sentiment = Model_Sentiment()

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [104]:
count = 0
for param in model_sentiment.parameters():
    if param.requires_grad == True:
        count += 1

print(count)


201


In [105]:
cnt = 0
for param in model_sentiment.parameters():
    if cnt < 100:
        param.requires_grad = False
    cnt += 1


In [106]:
count = 0
for param in model_sentiment.parameters():
    if param.requires_grad == True:
        count += 1

print(count)

101


In [107]:
max_epochs = 10
lr = 1e-4
weight_decay = 0.01
optimizer_parameter = filter(lambda p : p.requires_grad, model_sentiment.parameters())


In [108]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(
    optimizer_parameter,
    lr = lr,
    weight_decay=weight_decay
)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * max_epochs
)

In [109]:
def flatten_accuracy(pred, label):
    pred_flat = np.argmax(pred, axis = 1).flatten()
    label_flat = label.flatten()
    return np.sum(pred_flat == label_flat) / len(label_flat)

In [110]:
model_sentiment.to(device=device)

Model_Sentiment(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

In [111]:
for batch in tqdm(train_loader):
    input, input_mask, label = batch
    print("input = ",input)
    print("input_mask = ",input_mask)
    print("label = ",label)
    break

  0%|          | 0/875 [00:00<?, ?it/s]

input =  tensor([[    0,    68,  3310,  ...,     1,     1,     1],
        [    0,  1340,   136,  ...,     1,     1,     1],
        [    0,   654,   898,  ...,     1,     1,     1],
        ...,
        [    0,  1953,    10,  ...,     1,     1,     1],
        [    0,  1338,   734,  ...,     1,     1,     1],
        [    0,    68, 49833,  ...,     1,     1,     1]])
input_mask =  tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
label =  tensor([1, 0, 1, 1, 1, 1, 1, 1])





In [112]:
for batch in tqdm(valid_loader):
    input, input_mask, label = batch
    print("input = ",input)
    print("input_mask = ",input_mask)
    print("label = ",label)
    break

  0%|          | 0/259 [00:00<?, ?it/s]

input =  tensor([[    0,   946, 11941,  ...,     1,     1,     1],
        [    0, 23257,  1325,  ...,     1,     1,     1],
        [    0,   946,   170,  ...,     1,     1,     1],
        ...,
        [    0,   320,    89,  ...,     1,     1,     1],
        [    0,   244,  5516,  ...,     1,     1,     1],
        [    0,    68,   320,  ...,     1,     1,     1]])
input_mask =  tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
label =  tensor([1, 1, 1, 1, 1, 0, 1, 0])





In [113]:
def train_epochs(model, criterion, optimizer, scheduler, train_loader, device):
    total_loss = 0
    total_acc = 0
    total = 0
    temp = 0
    model.train()

    for batch in tqdm(train_loader):
        input, input_mask, label = batch
        input = input.to(device=device)
        input_mask = input_mask.to(device=device)
        label = label.to(device=device)

        optimizer.zero_grad()

        output = model(input, input_mask)

        loss = criterion(output, label)
        loss.backward()

        nn.utils.clip_grad_norm_(optimizer_parameter, max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        total += len(label)

        logits = output.detach().cpu().numpy()
        label = label.to('cpu').numpy()
        acc = flatten_accuracy(logits, label)
        total_acc += acc

        temp += 1

    return total_loss/ total , total_acc/ temp



In [114]:
device

device(type='cuda')

In [115]:
def valid_epochs(model, criterion, valid_loader, device):
    total_loss = 0
    accuracy = 0
    temp = 0
    total = 0
    model.eval()

    for batch in tqdm(valid_loader):
        input, input_mask, label = batch
        input = input.to(device=device)
        input_mask = input_mask.to(device=device)
        label = label.to(device=device)

        with torch.no_grad():
            logit = model(input, input_mask)
            loss = criterion(logit, label)

            logit = logit.detach().cpu().numpy()
            label = label.to('cpu').numpy()

            acc = flatten_accuracy(logit, label)
            accuracy += acc
            total_loss += loss.item()
            total += len(label)
            temp += 1
    return total_loss / total, accuracy/ temp


In [116]:
epochs = 5
n_epochs = 0
train_loss = []
train_acc = []
valid_loss = []
valid_acc = []

for _ in range(epochs):
    loss, acc = train_epochs(model = model_sentiment, criterion=criterion, optimizer=optimizer, scheduler=lr_scheduler, train_loader=train_loader, device=device)
    val_loss, val_acc = valid_epochs(model = model_sentiment, criterion=criterion, valid_loader=valid_loader, device=device)
    train_loss.append(loss)
    train_acc.append(acc)
    valid_acc.append(val_acc)
    valid_loss.append(val_loss)

    print(f'Epochs {_} : Train accuracy = {acc},  Loss = {loss}')
    print(f'Epochs {_} : Valid accuracy = {val_acc},  Loss = {val_loss}')


100%|██████████| 875/875 [01:59<00:00,  7.33it/s]
100%|██████████| 259/259 [00:18<00:00, 14.31it/s]


Epochs 0 : Train accuracy = 0.7871428571428571,  Loss = 0.06522452805723462
Epochs 0 : Valid accuracy = 0.78996138996139,  Loss = 0.0650539448175988


100%|██████████| 875/875 [02:00<00:00,  7.27it/s]
100%|██████████| 259/259 [00:19<00:00, 13.16it/s]


Epochs 1 : Train accuracy = 0.7871428571428571,  Loss = 0.06500148065601076
Epochs 1 : Valid accuracy = 0.7905405405405406,  Loss = 0.06516893101435689


100%|██████████| 875/875 [02:00<00:00,  7.23it/s]
100%|██████████| 259/259 [00:19<00:00, 13.16it/s]


Epochs 2 : Train accuracy = 0.7871428571428571,  Loss = 0.0650347018284457
Epochs 2 : Valid accuracy = 0.7905405405405406,  Loss = 0.06490124481896262


100%|██████████| 875/875 [02:01<00:00,  7.21it/s]
100%|██████████| 259/259 [00:19<00:00, 13.16it/s]


Epochs 3 : Train accuracy = 0.787,  Loss = 0.06503199277605329
Epochs 3 : Valid accuracy = 0.7902509652509653,  Loss = 0.06440309995093053


100%|██████████| 875/875 [02:01<00:00,  7.18it/s]
100%|██████████| 259/259 [00:19<00:00, 13.13it/s]

Epochs 4 : Train accuracy = 0.7871428571428571,  Loss = 0.06487974854239395
Epochs 4 : Valid accuracy = 0.7902509652509653,  Loss = 0.06495863707702824



