In [None]:
# Version pins noted for reference only (commented out; the installs below are unpinned):
#   transformers==4.0.0
#   sentencepiece==0.1.91
#   pytorch_lightning==0.8.1
!pip install pytorch_lightning
!pip install transformers
!pip install sentencepiece



In [None]:
import random
from torch.utils.data import Dataset

# One row per sentiment: (tag, sentiment word, opinion word used in targets).
_SENTIMENT_TABLE = (
    ('POS', 'positive', 'great'),
    ('NEG', 'negative', 'bad'),
    ('NEU', 'neutral', 'ok'),
)

# Sentiment tag -> sentiment word.
senttag2word = {tag: word for tag, word, _ in _SENTIMENT_TABLE}
# Sentiment tag -> opinion word written into the generated target sentence.
senttag2opinion = {tag: opinion for tag, _, opinion in _SENTIMENT_TABLE}
# Sentiment word -> opinion word written into the generated target sentence.
sentword2opinion = {word: opinion for _, word, opinion in _SENTIMENT_TABLE}

# Aspect category names.
aspect_cate_list = [
    'Character - Outlook',
    'Character - Voice',
    'Character - Overall',
    'Show',
    'Song',
    'Examiner',
    'Others',
]

# Main library imports

In [None]:
import argparse
import os
import logging
import time
import pickle
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning import seed_everything

from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer
# from transformers import BertTokenizer, EncoderDecoderModel
from transformers import MBartForConditionalGeneration, MBartTokenizer
from transformers import AutoModelForCausalLM
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [None]:
# Module-level logger; LoggingCallback writes its metric reports through it.
logger = logging.getLogger(__name__)

In [None]:
def read_line_examples_from_file(data_path, silence):
    """
    Read examples from a text file whose non-empty lines are ``sent####labels``.

    Args:
        data_path: Path of the UTF-8 text file to read.
        silence: When truthy, the total number of examples is printed.
            NOTE(review): the name suggests the opposite meaning; the existing
            behaviour is kept so current callers see no change.

    Returns:
        ``(sents, labels)`` where ``sents`` is a list of whitespace-split token
        lists and ``labels`` holds the parsed Python literal of each line.

    Raises:
        ValueError: if a line does not contain exactly one ``####`` separator
            or its label part is not a plain Python literal.
        SyntaxError: if the label part is not parseable Python at all.
    """
    import ast  # local import: only needed for safe literal parsing

    sents, labels = [], []
    with open(data_path, 'r', encoding='UTF-8') as fp:
        for line in fp:
            line = line.strip()
            if line == '':
                continue
            words, tuples = line.split('####')
            sents.append(words.split())
            # literal_eval accepts only Python literals, so a data file cannot
            # run arbitrary code the way eval() would allow.
            labels.append(ast.literal_eval(tuples))
    if silence:
        print(f"Total examples = {len(sents)}")
    return sents, labels

# Build paraphrase target sentences

In [None]:
def get_para_aste_targets(sents, labels):
    """
    Build one paraphrase target string per example for the ASTE task.

    Every triplet in ``labels[i]`` is indexed as (aspect positions, opinion
    positions, sentiment tag); the positions index into the token list
    ``sents[i]``. A triplet is rendered as
    "It is <opinion word> because <aspect> is <opinion>" and the sentences of
    one example are joined with ' [SSEP] '.

    Returns:
        A list with one target string per entry of ``labels``.
    """
    def span_text(sent_idx, positions):
        # A single position is looked up directly; otherwise the tokens from
        # the first to the last listed position (inclusive) are joined.
        if len(positions) == 1:
            return sents[sent_idx][positions[0]]
        first, last = positions[0], positions[-1]
        return ' '.join(sents[sent_idx][first:last + 1])

    targets = []
    for idx, triplets in enumerate(labels):
        pieces = []
        for tri in triplets:
            aspect = span_text(idx, tri[0])
            opinion = span_text(idx, tri[1])
            polarity = senttag2opinion[tri[2]]  # e.g. 'POS' -> 'great'
            pieces.append(f"It is {polarity} because {aspect} is {opinion}")
        targets.append(' [SSEP] '.join(pieces))
    return targets


In [None]:
def get_para_tasd_targets(sents, labels):
    """
    Build one paraphrase target string per example for the TASD task.

    Every triplet is unpacked as (aspect term, aspect category, sentiment
    word) and rendered as
    "<category> is <opinion word> because <term> is <opinion word>"; an aspect
    term equal to 'NULL' is written as 'it'. The sentences of one example are
    joined with ' [SSEP] '. ``sents`` is accepted but not used.

    Returns:
        A list with one target string per entry of ``labels``.
    """
    def render(triplet):
        aspect_term, aspect_cat, polarity = triplet
        opinion_word = sentword2opinion[polarity]  # e.g. 'positive' -> 'great'
        subject = 'it' if aspect_term == 'NULL' else aspect_term
        return f"{aspect_cat} is {opinion_word} because {subject} is {opinion_word}"

    return [' [SSEP] '.join(render(triplet) for triplet in example)
            for example in labels]

In [None]:
def get_para_asqp_targets(sents, labels):
    """
    Build one paraphrase target string per example for the ASQP task.

    Every quad is unpacked as (aspect term, aspect category, sentiment word,
    opinion term) and rendered as
    "<category> is <opinion word> because <term> is <opinion term>"; an aspect
    term equal to 'NULL' (implicit aspect) is written as 'it'. The sentences
    of one example are joined with ' [SSEP] '. ``sents`` is accepted but not
    used.

    Returns:
        A list with one target string per entry of ``labels``.
    """
    targets = []
    for quads in labels:
        rendered = []
        for aspect_term, aspect_cat, polarity, opinion_term in quads:
            opinion_word = sentword2opinion[polarity]  # e.g. 'positive' -> 'great'
            subject = 'it' if aspect_term == 'NULL' else aspect_term
            rendered.append(
                f"{aspect_cat} is {opinion_word} because {subject} is {opinion_term}"
            )
        targets.append(' [SSEP] '.join(rendered))
    return targets

# Transform raw data into model inputs and targets

In [None]:
def get_transformed_io(data_path, data_dir, task='asqp'):
    """
    Read a data file and build the model inputs and targets for a task.

    Args:
        data_path: Path of the ``sent####labels`` file to read.
        data_dir: Accepted for call-site compatibility; not used here.
        task: Which target format to build: 'aste', 'tasd' or 'asqp'.
            Defaults to 'asqp', the value that used to be hard-coded.

    Returns:
        ``(inputs, targets)``: ``inputs`` is a list of token lists (copies of
        the sentences that were read) and ``targets`` is a list of strings.

    Raises:
        NotImplementedError: if ``task`` is not one of the supported names.
    """
    # Reject an unknown task before doing any file I/O.
    if task not in ('aste', 'tasd', 'asqp'):
        raise NotImplementedError(f"Unsupported task: {task!r}")

    sents, labels = read_line_examples_from_file(data_path, False)

    # The input is just the raw sentence; copy so callers cannot alias `sents`.
    inputs = [s.copy() for s in sents]

    if task == 'aste':
        targets = get_para_aste_targets(sents, labels)
    elif task == 'tasd':
        targets = get_para_tasd_targets(sents, labels)
    else:
        targets = get_para_asqp_targets(sents, labels)

    return inputs, targets

In [None]:
class ABSADataset(Dataset):
    """
    Dataset that tokenizes every (sentence, target) pair of
    ``{data_dir}/{data_type}.txt`` once, at construction time.

    Items are dicts with the keys ``source_ids``, ``source_mask``,
    ``target_ids`` and ``target_mask``.
    """

    def __init__(self, tokenizer, data_dir, data_type, max_len=128):
        """
        Args:
            tokenizer: Object providing ``batch_encode_plus``.
            data_dir: Directory that contains the split files.
            data_type: Split name; the file read is ``{data_dir}/{data_type}.txt``.
            max_len: Length every input and target is padded/truncated to.
        """
        self.data_path = f'{data_dir}/{data_type}.txt'
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.data_dir = data_dir

        # Tokenizer outputs, one entry per example.
        self.inputs = []
        self.targets = []

        self._build_examples()

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the id and mask tensors of example ``index`` without the batch axis."""
        source = self.inputs[index]
        target = self.targets[index]

        # Every example is encoded as a one-element batch, so the leading axis
        # is the batch axis. squeeze(0) removes only that axis; a bare
        # squeeze() would also collapse the sequence axis when max_len == 1.
        return {"source_ids": source["input_ids"].squeeze(0),
                "source_mask": source["attention_mask"].squeeze(0),
                "target_ids": target["input_ids"].squeeze(0),
                "target_mask": target["attention_mask"].squeeze(0)}

    def _encode(self, text):
        """Tokenize one string as a one-element batch padded/truncated to ``max_len``."""
        return self.tokenizer.batch_encode_plus(
            [text], max_length=self.max_len, padding="max_length",
            truncation=True, return_tensors="pt"
        )

    def _build_examples(self):
        """Read the split file and tokenize every input sentence and target string."""
        inputs, targets = get_transformed_io(self.data_path, self.data_dir)

        for tokens, target in zip(inputs, targets):
            # Inputs arrive as token lists; targets are already strings.
            self.inputs.append(self._encode(' '.join(tokens)))
            self.targets.append(self._encode(target))

# Main

In [None]:
# Directory holding the data files; get_dataset passes it to ABSADataset, which
# reads '<data_path>/<split>.txt'.
data_path = '/kaggle/input/ds200-data/Final_Data'

In [None]:
# Read the token lists of Data.txt; the labels are discarded. The falsy second
# argument means the example count is not printed.
sents, _ = read_line_examples_from_file(data_path + '/Data.txt', 0)

In [None]:
# Length every tokenized input and target is padded/truncated to.
max_len = 128


def get_dataset(tokenizer, type_path):
    """Create the ABSADataset for the split named ``type_path`` under ``data_path``."""
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        data_dir=data_path,
        data_type=type_path,
        max_len=max_len,
    )
    return ABSADataset(**dataset_kwargs)

In [None]:
import os

from huggingface_hub.hf_api import HfFolder

# Never commit an access token to a notebook: read it from the environment.
# The token that used to be hard-coded here must be treated as leaked and
# revoked in the Hugging Face account settings.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    HfFolder.save_token(_hf_token)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login.")

In [None]:
# Tokenizer used throughout the notebook. The commented-out lines are
# alternative checkpoints; only the last assignment is active.
# tokenizer = T5Tokenizer.from_pretrained('VietAI/vit5-base')
# tokenizer = AutoTokenizer.from_pretrained('vinai/bartpho-word')
# tokenizer = AutoTokenizer.from_pretrained('Viet-Mistral/Vistral-7B-Chat')
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

In [None]:
# Build the 'train' split once so that a single example can be inspected below.
print(f"Here is an example (from the train set):")
dataset = ABSADataset(tokenizer=tokenizer, data_dir=data_path,
                      data_type='train', max_len=max_len)

Here is an example (from the train set):


In [None]:
data_sample = dataset[2]  # fixed example at index 2 (not randomly drawn)
# Decode the ids back to text; special tokens (padding included) are skipped.
print('Input :', tokenizer.decode(data_sample['source_ids'], skip_special_tokens=True))
print('Output:', tokenizer.decode(data_sample['target_ids'], skip_special_tokens=True))

Input : Mong VIE cho bài này vào zing mp3
Output: Others is ok because it is NULL


# Create model and trainer

In [None]:
class T5FineTuner(pl.LightningModule):
    """
    Lightning wrapper that fine-tunes a pre-trained seq2seq model.

    Optimization is manual: ``training_step`` runs the backward pass and the
    optimizer step itself. The train/validation dataloaders are built by the
    module through the notebook-level ``get_dataset`` helper.
    """

    def __init__(self, tfm_model, tokenizer):
        """
        Args:
            tfm_model: Pre-trained model to fine-tune; it is called with
                ``input_ids``, ``attention_mask`` and ``labels`` keywords.
            tokenizer: Tokenizer whose ``pad_token_id`` marks padding in the
                target ids.
        """
        super().__init__()
        self.model = tfm_model
        self.tokenizer = tokenizer
        # training_step drives backward() and optimizer.step() itself.
        self.automatic_optimization = False
        # Per-batch validation losses of the validation run in progress.
        self.validation_step_outputs = []

    def is_logger(self):
        """Tell LoggingCallback that this module wants its metrics reported."""
        return True

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None,
                decoder_attention_mask=None, labels=None):
        """Delegate to the wrapped model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        """Run one forward pass on ``batch`` and return the first model output (the loss)."""
        # Work on a copy: overwriting pad ids in place would corrupt the
        # caller's batch["target_ids"].
        lm_labels = batch["target_ids"].clone()
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        """Compute the loss and perform one manual optimization step."""
        loss = self._step(batch)

        # Manual optimization: backward, step, then reset the gradients.
        self.manual_backward(loss)
        optimizer = self.optimizers()
        optimizer.step()
        optimizer.zero_grad()
        # The scheduler created in train_dataloader is not stepped here, so
        # the learning rate is not updated by it.

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def on_training_epoch_end(self, outputs):
        """
        Average the training losses found in ``outputs``.

        NOTE(review): confirm that the installed Lightning version calls a hook
        with this name and signature; otherwise this method never runs.
        """
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss, remember it and log it as ``val_loss``."""
        loss = self._step(batch)
        self.validation_step_outputs.append(loss)
        self.log("val_loss", loss)
        return loss

    def on_validation_epoch_end(self):
        """Average the collected validation losses and reset the collection."""
        if not self.validation_step_outputs:
            # torch.stack raises on an empty list; nothing to aggregate.
            return None
        avg_loss = torch.stack(self.validation_step_outputs).mean()
        # Drop the stored losses so the next validation run starts empty;
        # otherwise the list grows every epoch and the average mixes epochs.
        self.validation_step_outputs.clear()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        """Create the AdamW optimizer (lr 3e-4) and keep a reference in ``self.opt``."""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        # Parameters are split by name into two groups; both groups currently
        # use weight_decay 0.0.
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)
        # train_dataloader reads self.opt to build the scheduler.
        self.opt = optimizer
        return [optimizer]

    def get_tqdm_dict(self):
        """Return the loss and learning rate formatted for a progress bar."""
        tqdm_dict = {"loss": "{:.4f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
        return tqdm_dict

    def train_dataloader(self):
        """Build the shuffled training loader and a linear schedule stored in ``self.lr_scheduler``."""
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train")
        dataloader = DataLoader(train_dataset, batch_size=16,
                                drop_last=True, shuffle=True, num_workers=4)
        # Step count used for the schedule: batches per epoch times 30.
        t_total = (
            (len(dataloader.dataset) // (16 * max(1, 0)))
            // 1
            * float(30)
        )
        # Requires configure_optimizers to have set self.opt already.
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=0.0, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the validation loader from the 'dev' split."""
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="dev")
        return DataLoader(val_dataset, batch_size=16, num_workers=4)

# Callbacks

In [None]:
class LoggingCallback(pl.Callback):
    """Callback that writes the trainer's callback metrics to the module logger."""

    @staticmethod
    def _log_metrics(trainer, pl_module):
        """Log every callback metric except the 'log' and 'progress_bar' entries."""
        # Return early when the module opts out; reading `metrics` without
        # this guard raised NameError because it was never assigned.
        if not pl_module.is_logger():
            return
        metrics = trainer.callback_metrics
        for key in sorted(metrics):
            if key not in ["log", "progress_bar"]:
                logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_validation_end(self, trainer, pl_module):
        """Report the metrics available at the end of a validation run."""
        logger.info("***** Validation results *****")
        self._log_metrics(trainer, pl_module)

    def on_test_end(self, trainer, pl_module):
        """Report the metrics available at the end of a test run."""
        logger.info("***** Test results *****")
        # Metrics are only logged; nothing is written to a results file.
        self._log_metrics(trainer, pl_module)

In [None]:
import re

sentiment_word_list = ['positive', 'negative', 'neutral']
# Opinion word found in a generated sentence -> sentiment word.
opinion2word = {'great': 'positive', 'bad': 'negative', 'ok': 'neutral'}
opinion2word_under_o2m = {'good': 'positive', 'great': 'positive', 'best': 'positive',
                          'bad': 'negative', 'okay': 'neutral', 'ok': 'neutral', 'average': 'neutral'}
numopinion2word = {'SP1': 'positive', 'SP2': 'negative', 'SP3': 'neutral'}


def _split_exactly_two(text, separator):
    """Split ``text`` on ``separator``; return the two parts, or None if the count differs."""
    parts = text.split(separator)
    return parts if len(parts) == 2 else None


def _parse_aste_sentence(sentence):
    """Parse "It is <opinion word> because <a> is <b>" into (a, b, sentiment)."""
    failed = ('', '', '')
    halves = _split_exactly_two(sentence, ' because ')
    if halves is None:
        return failed
    # Characters from index 6 on are looked up as the opinion word.
    polarity = opinion2word.get(halves[0][6:], 'nope')
    pair = _split_exactly_two(halves[1], ' is ')
    if pair is None:
        return failed
    return (pair[0], pair[1], polarity)


def _parse_tasd_sentence(sentence):
    """Parse "<ac> is <sp> because <at> is <sp>" into (ac, at, sentiment)."""
    failed = ('', '', '')
    halves = _split_exactly_two(sentence, ' because ')
    if halves is None:
        return failed
    category_part = _split_exactly_two(halves[0], ' is ')
    if category_part is None:
        return failed
    term_part = _split_exactly_two(halves[1], ' is ')
    if term_part is None:
        return failed

    category, category_polarity = category_part
    term, term_polarity = term_part
    category_polarity = opinion2word.get(category_polarity, 'nope')
    term_polarity = opinion2word.get(term_polarity, 'nope')
    if category_polarity != term_polarity:
        print(f'Sentiment polairty of AC({category_polarity}) and AT({term_polarity}) is inconsistent!')

    # 'it' stands for an implicit aspect term.
    if term.lower() == 'it':
        term = 'NULL'
    return (category, term, category_polarity)


def _parse_asqp_sentence(sentence):
    """Parse "<ac> is <sp> because <at> is <ot>" into (ac, at, sentiment, ot)."""
    failed = ('', '', '', '')
    halves = _split_exactly_two(sentence, ' because ')
    if halves is None:
        return failed
    category_part = _split_exactly_two(halves[0], ' is ')
    if category_part is None:
        return failed
    category, polarity = category_part
    polarity = opinion2word.get(polarity, 'nope')
    term_part = _split_exactly_two(halves[1], ' is ')
    if term_part is None:
        return failed
    term, opinion_term = term_part

    # 'it' stands for an implicit aspect term.
    if term.lower() == 'it':
        term = 'NULL'
    return (category, term, polarity, opinion_term)


def extract_spans_para(task, seq, seq_type):
    """
    Turn a paraphrase sequence back into a list of tuples.

    ``seq`` is split on '[SSEP]' and each stripped piece is parsed according
    to ``task`` ('aste', 'tasd' or 'asqp'). A piece that does not match the
    expected pattern yields a tuple of empty strings; an opinion word missing
    from ``opinion2word`` is reported as 'nope'. ``seq_type`` is accepted but
    not used.

    Raises:
        NotImplementedError: if ``task`` is not one of the three known names.
    """
    sentences = [piece.strip() for piece in seq.split('[SSEP]')]
    if task == 'aste':
        parse = _parse_aste_sentence
    elif task == 'tasd':
        parse = _parse_tasd_sentence
    elif task == 'asqp':
        parse = _parse_asqp_sentence
    else:
        raise NotImplementedError
    return [parse(sentence) for sentence in sentences]


def compute_f1_scores(pred_pt, gold_pt):
    """
    Compute precision, recall and F1 from already-parsed predictions and golds.

    ``pred_pt[i]`` and ``gold_pt[i]`` are the tuple lists of example ``i``.
    A predicted tuple counts as a hit when it occurs in the gold list of the
    same example. The three totals are printed before the scores are returned.

    Returns:
        A dict with the keys 'precision', 'recall' and 'f1'.
    """
    hits = total_gold = total_pred = 0

    for idx, predicted in enumerate(pred_pt):
        expected = gold_pt[idx]
        total_gold += len(expected)
        total_pred += len(predicted)
        hits += sum(1 for item in predicted if item in expected)

    print(f"number of gold spans: {total_gold}, predicted spans: {total_pred}, hit: {hits}")

    # Each ratio falls back to 0 when its denominator is empty.
    if total_pred != 0:
        precision = float(hits) / float(total_pred)
    else:
        precision = 0
    if total_gold != 0:
        recall = float(hits) / float(total_gold)
    else:
        recall = 0
    if precision != 0 or recall != 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    return {'precision': precision, 'recall': recall, 'f1': f1}


def compute_scores(pred_seqs, gold_seqs, sents):
    """
    Parse predicted and gold sequences as ASQP quads and score them.

    Both sequence lists must have the same length. The scores are printed as
    well as returned. ``sents`` is accepted but not used.

    Returns:
        ``(scores, all_labels, all_preds)``: the score dict from
        ``compute_f1_scores`` plus the parsed gold and predicted quad lists.
    """
    assert len(pred_seqs) == len(gold_seqs)

    all_labels, all_preds = [], []
    for gold_seq, pred_seq in zip(gold_seqs, pred_seqs):
        all_labels.append(extract_spans_para('asqp', gold_seq, 'gold'))
        all_preds.append(extract_spans_para('asqp', pred_seq, 'pred'))

    print("\nResults:")
    scores = compute_f1_scores(all_preds, all_labels)
    print(scores)

    return scores, all_labels, all_preds

In [None]:
def evaluate(data_loader, model, sents, device=None, max_length=128):
    """
    Generate predictions for every batch and score them against the targets.

    Args:
        data_loader: Iterable of batches with the keys ``source_ids``,
            ``source_mask`` and ``target_ids``.
        model: Fine-tuner wrapper; its ``.model`` attribute is moved to the
            device, put in eval mode and used for generation.
        sents: Passed through to ``compute_scores``.
        device: Device to run generation on (string or ``torch.device``).
            Defaults to CPU, the previously hard-coded device.
        max_length: ``max_length`` passed to ``generate``; defaults to the
            previously hard-coded 128.

    Returns:
        ``(results, outputs)``: ``results`` is a dict with the keys 'scores',
        'labels' and 'preds'; ``outputs`` is the list of decoded generations.
    """
    device = torch.device('cpu') if device is None else torch.device(device)
    seq2seq = model.model
    seq2seq.to(device)
    seq2seq.eval()

    # Prefer the tokenizer attached to the wrapper so the function does not
    # depend on a notebook-level global; fall back to the global otherwise.
    decode_tokenizer = getattr(model, "tokenizer", None)
    if decode_tokenizer is None:
        decode_tokenizer = tokenizer

    outputs, targets = [], []

    # No gradients are needed while generating.
    with torch.no_grad():
        for batch in tqdm(data_loader):
            # The inputs have to live on the same device as the model.
            outs = seq2seq.generate(input_ids=batch['source_ids'].to(device),
                                    attention_mask=batch['source_mask'].to(device),
                                    max_length=max_length)

            outputs.extend(
                decode_tokenizer.decode(ids, skip_special_tokens=True) for ids in outs
            )
            targets.extend(
                decode_tokenizer.decode(ids, skip_special_tokens=True)
                for ids in batch["target_ids"]
            )

    scores, all_labels, all_preds = compute_scores(outputs, targets, sents)
    results = {'scores': scores, 'labels': all_labels, 'preds': all_preds}

    return results, outputs


# load model

# train

In [None]:
# Pre-trained checkpoint to fine-tune. The commented-out lines are alternative
# checkpoints; only the last assignment is active.
# tfm_model = T5ForConditionalGeneration.from_pretrained('VietAI/vit5-base')
# bp_model = MBartForConditionalGeneration.from_pretrained('vinai/bartpho-word')
# vm_model = AutoModelForCausalLM.from_pretrained('Viet-Mistral/Vistral-7B-Chat')
gg_model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Wrap the loaded checkpoint and the tokenizer in the Lightning module.
#model = T5FineTuner(tfm_model, tokenizer)
model = T5FineTuner(gg_model, tokenizer)

In [None]:
# NOTE(review): only referenced by the commented-out `default_root_dir` option
# in the trainer settings below; no active code in view reads it.
output_dir = '/kaggle/working/results'

In [None]:
# Keyword arguments for pl.Trainer.
train_params = dict(
    #default_root_dir=output_dir,
    accumulate_grad_batches=1,
    #gpus=0,
    #gradient_clip_val=1.0,
    max_epochs=20,
    # Early stopping watches the "val_loss" value logged in validation_step.
    callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
)

# train_params = dict(
#     #default_root_dir=output_dir,
#     accumulate_grad_batches=1,
#     accelerator='gpu',
#     devices=1,
#     #gradient_clip_val=1.0,
#     max_epochs=20,
#     callbacks=[LoggingCallback()],
# )

In [None]:
# No dataloaders are passed to fit(): the module defines train_dataloader()
# and val_dataloader() itself.
trainer = pl.Trainer(**train_params)
trainer.fit(model)

2024-07-12 14:34:59.984222: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 14:34:59.984324: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 14:35:00.095735: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Evaluate the model object as it is after training, on the 'test' split.
print("\n****** Conduct Evaluating with the last state ******")


sents, _ = read_line_examples_from_file(data_path + '/test.txt', 0)

print()
test_dataset = ABSADataset(tokenizer, data_dir=data_path,
                               data_type='test', max_len=max_len)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=4)

# `scores` is the results dict returned by evaluate (keys 'scores', 'labels',
# 'preds'); `sent_pred` is the list of decoded generated sentences.
scores, sent_pred = evaluate(test_loader, model, sents)

'''
# write to file
log_file_path = f"results_log/{args.dataset}.txt"
local_time = time.asctime(time.localtime(time.time()))

exp_settings = f"Datset={args.dataset}; Train bs={args.train_batch_size}, num_epochs = {args.num_train_epochs}"
exp_results = f"F1 = {scores['f1']:.4f}"

log_str = f'============================================================\n'
log_str += f"{local_time}\n{exp_settings}\n{exp_results}\n\n"
'''


****** Conduct Evaluating with the last state ******



100%|██████████| 16/16 [06:51<00:00, 25.74s/it]


Results:
number of gold spans: 613, predicted spans: 577, hit: 161
{'precision': 0.27902946273830154, 'recall': 0.2626427406199021, 'f1': 0.27058823529411763}







In [None]:
type(scores['preds'])

list

In [None]:
scores['preds'][0]

[('Others', 'NULL', 'neutral', 'NULL')]

In [None]:
scores['preds'][0][0][0]

'Others'

In [None]:
# First field of the FIRST predicted tuple of each example (further tuples are
# ignored); it is '' when that tuple could not be parsed.
aspect_pred = [i[0][0] for i in scores['preds']]

In [None]:
# First field of the FIRST gold tuple of each example (further tuples are
# ignored).
aspect_label = [i[0][0] for i in scores['labels']]

In [None]:
from sklearn.metrics import classification_report

print(classification_report(aspect_label, aspect_pred))

                     precision    recall  f1-score   support

                          0.00      0.00      0.00         0
Character - Outlook       0.67      0.17      0.27        12
Character - Overall       0.70      0.71      0.71       154
  Character - Voice       0.61      0.53      0.57        32
           Examiner       0.78      0.50      0.61        14
             Others       0.64      0.72      0.68       140
               Show       0.75      0.58      0.66        72
               Song       0.65      0.76      0.70        76

           accuracy                           0.67       500
          macro avg       0.60      0.50      0.52       500
       weighted avg       0.68      0.67      0.67       500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Third field (the sentiment) of the FIRST predicted tuple of each example;
# it is '' when that tuple could not be parsed.
sentinment_pred = [i[0][2] for i in scores['preds']]

In [None]:
# Third field (the sentiment) of the FIRST gold tuple of each example.
sentinment_label = [i[0][2] for i in scores['labels']]

In [None]:
print(classification_report(sentinment_label, sentinment_pred))

              precision    recall  f1-score   support

                   0.00      0.00      0.00         0
    negative       0.62      0.42      0.50        73
     neutral       0.70      0.73      0.71       171
    positive       0.78      0.81      0.79       256

    accuracy                           0.73       500
   macro avg       0.52      0.49      0.50       500
weighted avg       0.73      0.73      0.72       500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
scores['labels'][3]

[('Song', 'NULL', 'positive', 'ko chan')]

In [None]:
scores['preds'][3]

[('Others', 'NULL', 'neutral', 'NULL')]

In [None]:
de = test_dataset[27]  # a random data sample
print('Input :', tokenizer.decode(de['source_ids'], skip_special_tokens=True))
print('Output:', tokenizer.decode(de['target_ids'], skip_special_tokens=True))

Input : Bài này hay qá điiiii có cái nhìn khác hẳn về bray luôn😮😮😮
Output: Examiner is great because bray is cái nhìn khác [SSEP] Song is great because Bài is hay


In [None]:
sent_pred

['Others is ok because it is NULL',
 'Character - Voice is ok because ICD is kể câu chuyện của anh ấy bằng rap [SSEP] Song is great because lời rap is g',
 'Others is ok because it is NULL',
 'Others is ok because it is NULL',
 'Character - Overall is great because Kiều is Đỉnh',
 'Character - Overall is great because Gừng is hợp với bray',
 'Character - Overall is bad because HIEUTHUHAI is mấấu chứ thi đầu là mấy ông sau chả còn gì',
 'Others is ok because it is NULL',
 'Show is great because Rap Viet sang KOR is hay quá',
 'Character - Overall is great because Pháo is hức chờ đón [SSEP] Character - Overall is great because rica is thất',
 'Song is great because it is Hay',
 'Show is bad because King of rap is thua xa ráp việt',
 'Character - Overall is great because Nhật Hoàng is Thích',
 'Character - Voice is great because Chị Umie is quá đỉnh',
 'Others is ok because it is NULL',
 'Others is ok because it is NULL',
 'Character - Overall is great because Rhyder is mãi đỉnh',
 'Chara

In [None]:
import pickle
import torch
import io

In [None]:
# Serialise the whole `model` object with pickle.
# NOTE(review): unpickling needs the T5FineTuner class to be defined first, and
# pickle files must only be loaded from trusted sources. Saving just the
# weights may be the safer option -- TODO confirm what downstream code expects.
with open('/kaggle/working/byt5.pkl', 'wb') as file:
    pickle.dump(model, file)