In [1]:
import accelerate
import sys
import argparse
from unittest.util import _MAX_LENGTH
import pandas as pd
import transformers
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer, 
    AutoModel, 
    AutoConfig,
    BertModel,
    MODEL_MAPPING,
    CONFIG_MAPPING
)
import logging   
from unidecode import unidecode
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from sklearn.metrics import f1_score, accuracy_score
import random
import copy
from tqdm import tqdm as tqdm
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from prettytable import PrettyTable
from accelerate import Accelerator
from transformers.utils.versions import require_version
from datasets import load_metric
import accelerate
import utils
import joblib
from datetime import datetime
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# TPU endpoint configuration, set before the Accelerator is created in the next cell.
# NOTE(review): the address/port are hard-coded for a local service — adjust per environment.
os.environ['XRT_TPU_CONFIG'] = "localservice;0;localhost:51011"

In [3]:
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# The Accelerator exposes the process state and the device used by every cell below.
accelerator = Accelerator()
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state)
# Only the local main process logs at INFO; every other process is limited to errors.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
# The original had a stray line-continuation backslash after the colon here; it only
# parsed because the branch body was a single statement. Use a regular block instead.
if accelerator.is_local_main_process:
    transformers.utils.logging.set_verbosity_info()
else:
    transformers.utils.logging.set_verbosity_error()

accelerator.wait_for_everyone()

device = accelerator.device

2022-08-22 01:22:15.613672: E tensorflow/core/framework/op_kernel.cc:1623] OpKernel ('op: "TPURoundRobin" device_type: "CPU"') for unknown op: TPURoundRobin
08/22/2022 01:22:28 - INFO - __main__ - Distributed environment: TPU
Num processes: 1
Process index: 0
Local process index: 0
Device: xla:1
Mixed precision type: no

2022-08-22 01:22:15.613724: E tensorflow/core/framework/op_kernel.cc:1623] OpKernel ('op: "TpuHandleToProtoKey" device_type: "CPU"') for unknown op: TpuHandleToProtoKey


In [12]:
# Load the two fine-tuned checkpoints from the local `model_0/` directory and move them
# to the accelerator device.
model_classification = AutoModelForSequenceClassification.from_pretrained('model_0/model_classification').to(device)
model_regression = AutoModelForSequenceClassification.from_pretrained('model_0/model_regression').to(device)
# NOTE(review): the tokenizer comes from the base 'xlm-roberta-base' name rather than the
# checkpoint directories — confirm it matches the one used for fine-tuning.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

loading configuration file model_0/model_classification/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "model_0/model_classification",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    

In [13]:
# Evaluation settings shared by the cells below.
max_model_length = 512           # handed to the tokenizer as max_length
per_device_eval_batch_size = 64  # batch size of the evaluation DataLoader
test_file = 'Data/test_1.z'      # alternative input: 'Data/clean-public.z'

In [14]:
# Load the serialized evaluation samples and report how many there are.
# NOTE(review): joblib.load unpickles the file, so it must only be used on trusted files.
data = joblib.load(test_file)
print(len(data))

1000


In [9]:
# data[0]

In [15]:
def GenericDataLoader(data, batch_size, max_model_length):
    """Tokenize `data` and wrap it in a DataLoader that keeps the input order.

    Uses the module-level `tokenizer`.

    Args:
        data: iterable of samples; `sample[0]` is passed to the tokenizer and
            `sample[1]` is used as the label (presumably a fixed-length list
            of numeric scores — confirm against the dataset).
        batch_size: batch size given to the DataLoader.
        max_model_length: length every sample is padded/truncated to.

    Returns:
        A DataLoader over a TensorDataset of (input_ids, attention_mask,
        labels), with labels cast to float and a SequentialSampler.
    """
    id_rows, mask_rows, label_rows = [], [], []
    for sample in data:
        encoding = tokenizer(
            sample[0],
            return_tensors="np",
            padding='max_length',
            truncation=True,
            max_length=max_model_length,
        )
        # One sample is encoded per call, so keep only the first row of each array.
        id_rows.append(encoding['input_ids'][0])
        mask_rows.append(encoding['attention_mask'][0])
        label_rows.append(sample[1])

    dataset = TensorDataset(
        torch.tensor(np.array(id_rows)),
        torch.tensor(np.array(mask_rows)),
        torch.tensor(np.array(label_rows), dtype=torch.float),
    )
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

In [48]:
def evaluation(model, test_dataloader, model_type=None):
    """Run `model` over `test_dataloader` without gradients and score the output.

    Relies on the module-level `device`, `accelerator` and `utils`.

    Args:
        model: called as `model(input_ids, token_type_ids=None,
            attention_mask=mask)`; element 0 of its output is used as logits.
        test_dataloader: yields (input_ids, attention_mask, labels) batches.
        model_type: 'classification' sends the logits through
            `utils.convert_logits`; 'regression' maps them to
            `round(5 * sigmoid(logits))`; any other value (including the
            default None) scores the concatenated logits unchanged.

    Returns:
        Tuple `(score, preds, targets)`: the `utils.calculate_score` result
        rounded to 4 decimals, the processed predictions, and the targets
        after `utils.convert_logits`.
    """
    model.eval()
    gathered_labels = []
    gathered_logits = []
    for raw_batch in tqdm(test_dataloader):
        input_ids, attention_mask, batch_labels = tuple(t.to(device) for t in raw_batch)
        with torch.no_grad():
            model_output = model(
                input_ids,
                token_type_ids=None,
                attention_mask=attention_mask,
            )
        # Collect logits and labels from all processes before accumulating them.
        all_logits, all_labels = accelerator.gather([model_output[0], batch_labels])
        gathered_labels.append(all_labels)
        gathered_logits.append(all_logits)

    targets = torch.cat(gathered_labels)
    preds = torch.cat(gathered_logits)

    targets = utils.convert_logits(targets)
    if model_type == 'classification':
        preds = utils.convert_logits(preds)
    elif model_type == 'regression':
        preds = torch.round(5 * torch.sigmoid(preds))

    return round(utils.calculate_score(targets, preds), 4), preds, targets

In [27]:
# Tokenize the loaded samples and build the evaluation DataLoader.
test_dataloader = GenericDataLoader(data, per_device_eval_batch_size, max_model_length)

In [53]:
# Before the fix: score the classification checkpoint and keep its predictions and targets.
score_cls, pred_cls, targets = evaluation(model_classification, test_dataloader, model_type='classification')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  4.05it/s]


In [50]:
# Before the fix: score the regression checkpoint; its targets are discarded.
score_reg, pred_reg, _ = evaluation(model_regression, test_dataloader, model_type='regression')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  4.07it/s]


In [51]:
# Scores of the classification and regression checkpoints, in that order.
print(score_cls)
print(score_reg)

0.7593
0.7133


In [39]:
# Convert both prediction tensors to nested Python lists.
# NOTE(review): the names are rebound in place, so re-running this cell fails
# because lists have no .tolist().
pred_cls = pred_cls.tolist()
pred_reg = pred_reg.tolist()

In [40]:
# pred_reg.shape

In [42]:
# Inspect the classification prediction for the first sample.
pred_cls[0]

[0, 0, 0, 0, 0, 0]

In [43]:
# Inspect the regression prediction for the first sample.
pred_reg[0]

[1.0, 1.0, 0.0, 0.0, 0.0, 0.0]

In [46]:
# Merge the two models per position by keeping the larger of the two predicted values.
res = [
    [max(cls_value, reg_value) for cls_value, reg_value in zip(cls_row, reg_row)]
    for cls_row, reg_row in zip(pred_cls, pred_reg)
]

In [54]:
# Turn the merged predictions back into a tensor for scoring.
res = torch.tensor(res)

In [56]:
# Score the merged predictions. Ground truth goes first, matching the
# calculate_score(targets, preds) call inside `evaluation`; the original passed
# the arguments the other way round.
utils.calculate_score(targets, res)

0.7239846388498942

In [57]:
# Number of loaded samples.
len(data)

1000

In [58]:
# Rebuild the evaluation DataLoader (same arguments as the earlier construction).
test_dataloader = GenericDataLoader(data, per_device_eval_batch_size, max_model_length)

In [59]:
# Re-run the classification checkpoint through the shared `evaluation` helper rather
# than repeating its inference loop inline. With model_type='classification' the helper
# applies utils.convert_logits to both predictions and targets, exactly as the inline
# copy did; only the score (first element of the returned tuple) is dropped here.
preds, targets = evaluation(
    model_classification, test_dataloader, model_type='classification'
)[1:]

print(targets.shape)
print(preds.shape)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  4.06it/s]


torch.Size([1000, 6])
torch.Size([1000, 6])


In [60]:
# Convert targets and predictions to nested lists, then print the number of samples
# and the length of one target vector.
# NOTE(review): the names are rebound in place, so this cell cannot be re-run as-is.
targets = targets.tolist()
preds = preds.tolist()
print(len(targets))
print(len(targets[0]))

1000
6


In [61]:
# Show the first review text with its target and predicted vectors, then check
# the pair with utils.compare_arrays.
print(data[0][0])
print(targets[0])
print(preds[0])
utils.compare_arrays(targets[0], preds[0])

Sân mới được nâng cấp, sạch và mới, đặc biệt là mặt sân. Phần chỗ ngồi đa phần không có ghế, ngồi bệ xi măng.
[5, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]


False

In [62]:
# Read the labelled test set and keep only its first seven columns.
df = pd.read_csv('Data/test_1.csv')
columns = list(df.columns)
df = df.loc[:, columns[:7]]
df.head(10)

Unnamed: 0,review,giai_tri,luu_tru,nha_hang,an_uong,van_chuyen,mua_sam
0,"Sân mới được nâng cấp, sạch và mới, đặc biệt l...",5,0,0,0,0,0
1,"Phòng cách âm không được tốt cho lắm, tiếng xả...",0,2,0,0,0,0
2,"Quán rộng, thoáng mát, hải sản tươi sống, giá ...",0,0,0,5,0,0
3,nhân viên thân thiện. có ăn sáng kèm giá phòng...,0,4,0,0,0,0
4,Chỗ nghỉ ok nhưng ace không nên ăn ở quán ngay...,0,4,0,2,0,0
5,"Nhân viên dễ thương, nhiệt tình",0,5,5,0,0,0
6,Cân nhắc khi đến đây. Mang tiếng bar mà đồ uốn...,3,0,0,2,0,0
7,Hồ bơi hơi bẩn và để không đó hơi lãng phí.,1,0,0,0,0,0
8,Thật sự chưa thấy chổ nào cho thuê xe máy nhiệ...,0,0,0,0,4,0
9,"Đồ hợp gu với giới trẻ, giá ổn không quá cao, ...",0,0,0,0,0,4


In [63]:
# Number of samples whose prediction passes utils.compare_arrays against its target.
count = sum(1 for t, p in zip(targets, preds) if utils.compare_arrays(t, p))
print(count)

351


In [64]:
# Column layout of the comparison table: the first column, then every remaining
# column immediately followed by its 'pre_'-prefixed counterpart, then 'correct'.
columns = df.columns.tolist()
pre_c = ['pre_' + name for name in columns[1:]]
x = [columns[0]]
for actual_col, predicted_col in zip(columns[1:], pre_c):
    x.extend((actual_col, predicted_col))
x.append('correct')

In [65]:
# Provisional comparison frame holding only the review text.
# NOTE(review): df1 is rebuilt from `results` further down, which replaces this frame.
df1 = pd.DataFrame(columns=x)
df1['review'] = df['review']

In [66]:
# Row count of the provisional comparison frame.
len(df1)

1000

In [68]:
# One row per sample: review text, then (target, prediction) pairs in order, then a
# 1/0 flag for whether utils.compare_arrays accepts the whole vector.
results = []
for sample, t, p in zip(data, targets, preds):
    res = [sample[0]]
    for target_and_pred in zip(t, p):
        res.extend(target_and_pred)
    res.append(1 if utils.compare_arrays(t, p) else 0)
    results.append(res)

In [69]:
# Number of comparison rows built.
len(results)

1000

In [70]:
# Build the comparison table from the collected rows using the interleaved column layout.
df1 = pd.DataFrame(results, columns=x)

In [71]:
# Preview the first ten comparison rows.
df1.head(10)

Unnamed: 0,review,giai_tri,pre_giai_tri,luu_tru,pre_luu_tru,nha_hang,pre_nha_hang,an_uong,pre_an_uong,van_chuyen,pre_van_chuyen,mua_sam,pre_mua_sam,correct
0,"Sân mới được nâng cấp, sạch và mới, đặc biệt l...",5,0,0,0,0,0,0,0,0,0,0,0,0
1,"Phòng cách âm không được tốt cho lắm, tiếng xả...",0,0,2,1,0,0,0,0,0,0,0,0,0
2,"Quán rộng, thoáng mát, hải sản tươi sống, giá ...",0,0,0,0,0,5,5,5,0,0,0,0,0
3,nhân viên thân thiện. có ăn sáng kèm giá phòng...,0,0,4,4,0,0,0,0,0,0,0,0,1
4,Chỗ nghỉ ok nhưng ace không nên ăn ở quán ngay...,0,0,4,0,0,0,2,3,0,0,0,0,0
5,"Nhân viên dễ thương, nhiệt tình",0,0,5,5,5,5,0,0,0,0,0,0,1
6,Cân nhắc khi đến đây. Mang tiếng bar mà đồ uốn...,3,3,0,0,0,3,2,2,0,0,0,0,0
7,Hồ bơi hơi bẩn và để không đó hơi lãng phí.,1,2,0,0,0,0,0,0,0,0,0,0,0
8,Thật sự chưa thấy chổ nào cho thuê xe máy nhiệ...,0,0,0,0,0,0,0,0,4,5,0,0,0
9,"Đồ hợp gu với giới trẻ, giá ổn không quá cao, ...",0,0,0,0,0,0,0,0,0,0,4,5,0


In [72]:
# Persist the comparison table next to the input data.
# NOTE(review): to_csv also writes the row index by default — pass index=False if
# the extra unnamed column is not wanted.
df1.to_csv('Data/compare_label_and_predict_test_1.csv')

In [23]:
# aspect_targets = []
# aspect_preds = []
# for t, p in zip(targets, preds):
#     aspect_targets.append([1 if i > 0 else 0 for i in t])
#     aspect_preds.append([1 if i > 0 else 0 for i in p])

In [1]:
# from sklearn.metrics import plot_confusion_matrix
# aspects = ['vui chơi', 'lưu trú', 'nhà hàng', 'ăn uống', 'vận chuyển', 'mua sắm']

# fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(15,10))
# # ax.plot(x, y)
# count = 0
# for i in range(6):
#     for j in range(i+1, 6):
#         target = [p[i] for p in aspect_targets]
#         pred = [t[i] for t in aspect_preds]
# #         plot_confusion_matrix(conf_mat=binary,
# #                               ax=axes.flatten()[count], 
                                
# #                               cmap='Blues',
# #                              cdisplay_labels=[aspects[i], aspects[j]])
# #         ax.title.set_text(aspects[i]+'and'+aspects[j])
# #         count += 1
#         break
#     break
# # plt.tight_layout()  
# # plt.show()

In [2]:
# import numpy as np


# def plot_confusion_matrix(cm,
#                           target_names,
#                           title='Confusion matrix',
#                           cmap=None,
#                           normalize=True):
#     """
#     given a sklearn confusion matrix (cm), make a nice plot

#     Arguments
#     ---------
#     cm:           confusion matrix from sklearn.metrics.confusion_matrix

#     target_names: given classification classes such as [0, 1, 2]
#                   the class names, for example: ['high', 'medium', 'low']

#     title:        the text to display at the top of the matrix

#     cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
#                   see http://matplotlib.org/examples/color/colormaps_reference.html
#                   plt.get_cmap('jet') or plt.cm.Blues

#     normalize:    If False, plot the raw numbers
#                   If True, plot the proportions

#     Usage
#     -----
#     plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
#                                                               # sklearn.metrics.confusion_matrix
#                           normalize    = True,                # show proportions
#                           target_names = y_labels_vals,       # list of names of the classes
#                           title        = best_estimator_name) # title of graph

#     Citation
#     ---------
#     http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

#     """
#     import matplotlib.pyplot as plt
#     import numpy as np
#     import itertools

#     accuracy = np.trace(cm) / float(np.sum(cm))
#     misclass = 1 - accuracy

#     if cmap is None:
#         cmap = plt.get_cmap('Blues')

#     plt.figure(figsize=(8, 6))
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()

#     if target_names is not None:
#         tick_marks = np.arange(len(target_names))
#         plt.xticks(tick_marks, target_names, rotation=45)
#         plt.yticks(tick_marks, target_names)

#     if normalize:
#         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


#     thresh = cm.max() / 1.5 if normalize else cm.max() / 2
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#         if normalize:
#             plt.text(j, i, "{:0.4f}".format(cm[i, j]),
#                      horizontalalignment="center",
#                      color="white" if cm[i, j] > thresh else "black")
#         else:
#             plt.text(j, i, "{:,}".format(cm[i, j]),
#                      horizontalalignment="center",
#                      color="white" if cm[i, j] > thresh else "black")


#     plt.tight_layout()
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
#     plt.show()

In [3]:
# # from sklearn.metrics import plot_confusion_matrix
# aspects = ['vui chơi', 'lưu trú', 'nhà hàng', 'ăn uống', 'vận chuyển', 'mua sắm']

# # fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(15,10))
# # ax.plot(x, y)
# count = 0
# for i in range(6):
#     for j in range(i+1, 6):
#         target = [p[i] for p in aspect_targets]
#         pred = [t[i] for t in aspect_preds]
#         print(len(target))
#         print(len(pred))
#         plot_confusion_matrix(cm           = np.array([target, pred]), 
#                       normalize    = True,
#                       target_names = [aspects[i], aspects[j]],
#                       title        = "Confusion Matrix, Normalized")
#         break
#     break
# # plt.tight_layout()  
# # plt.show()

In [172]:
# target