In [8]:
import accelerate
import sys
# sys.path.append('/home/caohainam/Review-Analytics')
import argparse
from unittest.util import _MAX_LENGTH
import pandas as pd
import transformers
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer, 
    AutoModel, 
    AutoConfig,
    BertModel,
    MODEL_MAPPING,
    CONFIG_MAPPING
)
import logging   
from unidecode import unidecode
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from sklearn.metrics import f1_score, accuracy_score
import random
import copy
from tqdm import tqdm as tqdm
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from prettytable import PrettyTable
from accelerate import Accelerator
from transformers.utils.versions import require_version
from datasets import load_metric
import accelerate
import utils
import joblib
from datetime import datetime
import os
import pandas as pd

In [3]:
!ls

Data		back_translation.ipynb	metric.pdf	    process_data.ipynb
Data_Add	data_train		model_0		    requirements.txt
Data_seg	doc.txt			model_1		    review-analysis-v2
DownstreamTask	evaluation.ipynb	model_2		    utils.ipynb
README.md	hardcopy.0		predict.ipynb	    utils.py
__pycache__	key-rnd-vps.json	preprocess_text.py


In [4]:
# Set the XRT_TPU_CONFIG environment variable for this kernel process.
# NOTE(review): presumably read by the XLA/TPU runtime that accelerate uses — confirm the
# "localservice;0;localhost:51011" endpoint matches your TPU VM.
os.environ['XRT_TPU_CONFIG'] = "localservice;0;localhost:51011"

In [5]:
# Notebook-level logger and the model types registered in transformers' MODEL_MAPPING.
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# The Accelerator object is reused below for the device, gather() and process-rank checks.
accelerator = Accelerator()
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state)

# Only the local main process logs at INFO level; every other process is limited to errors.
is_main = accelerator.is_local_main_process
logger.setLevel(logging.INFO if is_main else logging.ERROR)
if is_main:
    transformers.utils.logging.set_verbosity_info()
else:
    transformers.utils.logging.set_verbosity_error()

# Block until every process has reached this point.
accelerator.wait_for_everyone()

# device = accelerator.device
# device = 'cpu'

2022-08-20 15:23:46.863410: E tensorflow/core/framework/op_kernel.cc:1623] OpKernel ('op: "TPURoundRobin" device_type: "CPU"') for unknown op: TPURoundRobin
2022-08-20 15:23:46.863465: E tensorflow/core/framework/op_kernel.cc:1623] OpKernel ('op: "TpuHandleToProtoKey" device_type: "CPU"') for unknown op: TpuHandleToProtoKey
08/20/2022 15:23:57 - INFO - __main__ - Distributed environment: TPU
Num processes: 1
Process index: 0
Local process index: 0
Device: xla:1
Mixed precision type: no



In [5]:
# device

In [6]:
def read_data(data_file):
    """Load evaluation samples from ``data_file``.

    Args:
        data_file: Path of the input file. A path ending in ``.csv``
            (case-insensitive) is parsed as a comma-separated file with a
            header row; any other path is handed to ``joblib.load``.

    Returns:
        For CSV input, the ``text`` column as a list, taken after dropping
        every row that has a missing value in any column. For other input,
        whatever object ``joblib.load`` returns.

    Raises:
        KeyError: If a CSV file has no ``text`` column.
    """
    # Test the file suffix rather than a substring, so that a path such as
    # 'exports.csv_old/test.z' is not mistaken for a CSV file.
    if str(data_file).lower().endswith('.csv'):
        df = pd.read_csv(data_file, delimiter=',', header=0, encoding="utf8")
        # dropna() removes rows with a gap in ANY column, not only in 'text'.
        return df.dropna()['text'].tolist()
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code — only load files that come from a trusted source.
    return joblib.load(data_file)

def GenericDataLoader(data, batch_size, max_model_length):
    """Tokenize ``data`` and wrap it in a sequential (non-shuffling) DataLoader.

    Relies on the notebook-level ``tokenizer`` and ``max_seq_length`` globals.

    Args:
        data: Iterable of sentences to encode.
        batch_size: Number of samples per batch.
        max_model_length: Longest sequence the model supports; every sample is
            padded/truncated to ``min(max_model_length, max_seq_length)``.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask)`` batches in input
        order. Sentences that fail to tokenize are reported and left out, so
        the loader may hold fewer samples than ``data``.
    """
    ids = []
    masks = []
    max_length = min(max_model_length, max_seq_length)
    for position, sent in enumerate(data):
        try:
            inputs = tokenizer(sent, return_tensors="np", padding='max_length', truncation=True, max_length=max_length)
            encoded_sent = inputs['input_ids'][0]
            mask = inputs['attention_mask'][0]
        except Exception as exc:
            # Best effort: skip the bad sample but say which one and why.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            print(f"Skipping sample {position}: tokenization failed ({exc!r}): {sent!r}")
        else:
            # Append both parts together so ids and masks can never get out of step.
            ids.append(encoded_sent)
            masks.append(mask)
    inputs = torch.tensor(np.array(ids))
    masks = torch.tensor(np.array(masks))
    data = TensorDataset(inputs, masks)
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    return dataloader

def predict(model, test_dataloader, num_test_sample):
    """Run inference over ``test_dataloader`` and return logits and scores.

    Relies on the notebook-level ``device``, ``accelerator`` and ``utils``.

    Args:
        model: Model called as ``model(input_ids, token_type_ids=None,
            attention_mask=mask)``; the first element of its output is used
            as the logits.
        test_dataloader: Iterable yielding ``(input_ids, attention_mask)``
            batches.
        num_test_sample: Number of samples; rows beyond this count are
            discarded from the concatenated logits.

    Returns:
        Tuple ``(logit_preds, score_preds)``: the concatenated logits (at most
        ``num_test_sample`` rows) and the output of ``utils.convert_logits``
        reshaped to 6 columns and moved to ``device``.
    """
    model.eval()
    preds = []
    for batch in tqdm(test_dataloader):
        b_input_ids, b_input_mask = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # The gathered value is discarded; the local logits are what gets accumulated.
        # NOTE(review): the call is kept because it is a collective operation and may
        # also force evaluation on lazy (XLA) devices — confirm before removing it.
        accelerator.gather([logits])
        preds.append(logits)
    # The concatenated logits hold one row per sample, so cap the ROW count at the
    # sample count (multiplying by the 36 labels made the slice a no-op).
    logit_preds = torch.cat(preds)[:num_test_sample]
    # Original author's note: without .to(device) this step is extremely slow (cause unknown).
    score_preds = utils.convert_logits(logit_preds).reshape(-1, 6).to(device)
    return logit_preds, score_preds

In [7]:
# Settings for the batch-prediction run; the commented paths are alternative inputs.
# test_file = 'Data_Add/19-8_khachsan.csv'
# test_file = 'Data_Add/19-8_vanchuyen.csv'
test_file = 'Data_Add/19-8_all-topic.csv'
model_name_or_path = 'model_0/model_with_remake_data'
# Upper bound on tokens per sample; GenericDataLoader takes the min of this and the model limit.
max_seq_length = 512
# Size of the classification head; the helpers below slice it into 6 groups of 6 logits.
num_labels = 36
per_device_eval_batch_size = 64
tokenizer_name = 'xlm-roberta-base'
# device = 'cpu'
device = accelerator.device

In [8]:
# Load the fine-tuned sequence-classification checkpoint and move it to the accelerator device.
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels).to(device)
config = model.config
# Position-embedding limit of the model; passed to GenericDataLoader as the length cap.
max_model_length = config.max_position_embeddings
# NOTE(review): the tokenizer is loaded from tokenizer_name, not from the checkpoint
# directory — confirm the two are compatible.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

loading configuration file model_0/model_with_remake_data/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "model_0/model_with_remake_data",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",


In [9]:
# Load the review texts, record how many there are, and build the evaluation DataLoader.
test_data = read_data(test_file)
num_test_sample = len(test_data)
test_dataloader = GenericDataLoader(test_data, per_device_eval_batch_size, max_model_length)

In [1]:
# Sanity check: number of batches, then number of samples.
print(len(test_dataloader))
print(len(test_data))

NameError: name 'test_dataloader' is not defined

In [11]:
# test_data = ['không gian hiện đại phục vu chu đáo']
# num_test_sample = len(test_data)
# test_dataloader = GenericDataLoader(test_data, per_device_eval_batch_size, max_model_length)

In [12]:
# Run batched inference; predict() returns the raw logits and the converted per-aspect scores.
logit_preds, score_preds = predict(model, test_dataloader, num_test_sample)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [00:34<00:00,  3.09it/s]


In [13]:
# Display the shape of the raw logits (rows = samples, columns = labels).
logit_preds.shape

torch.Size([6857, 36])

In [14]:
# Display the shape of the converted scores (predict() reshapes them to 6 columns).
score_preds.shape

torch.Size([6857, 6])

In [None]:
# with torch.no_grad():
#     for i in range(logit_preds.shape[0]):
#         logit = logit_preds[i].to('cpu')
#         if any(utils.get_confidence(logit_preds[0])) < 0.8:
#             print(test_data[i])

In [24]:
# with torch.no_grad():
#     predict_ = predict_.to('cpu')

In [15]:
# Convert the logits tensor to nested Python lists. This rebinds logit_preds to a
# different type, so cells that expect a tensor must run before this one.
logit_preds = logit_preds.tolist()

In [None]:
# Original author's note: this does not run =))
score_preds = score_preds.tolist()

In [31]:
def get_confidence(logit, group_size=6):
    """Return the top softmax probability of every consecutive logit group.

    Args:
        logit: 1-D sequence of logits (Python list or tensor) whose length is
            a multiple of ``group_size``; each run of ``group_size`` values is
            treated as one independent classification.
        group_size: Number of logits per group (6 by default).

    Returns:
        A list with one float per group: the largest softmax probability in
        that group, rounded to 4 decimal places. Empty input gives ``[]``.

    Raises:
        ValueError: If the number of logits is not a multiple of ``group_size``.
    """
    # as_tensor accepts lists and tensors alike (no copy-construct warning for
    # tensors), and the float cast keeps softmax working for integer input.
    scores = torch.as_tensor(logit, dtype=torch.float32).detach()
    if scores.numel() == 0:
        return []
    if scores.numel() % group_size != 0:
        raise ValueError(
            f"expected a multiple of {group_size} logits, got {scores.numel()}"
        )
    # One row per group, so softmax and max are taken within each group.
    probs = torch.nn.functional.softmax(scores.reshape(-1, group_size), dim=1)
    return [round(p, 4) for p in probs.max(dim=1).values.tolist()]

In [45]:
# Collect every review for which at least one aspect was predicted with low confidence.
# Each entry is [review_text, conf_1, ..., conf_6].
# NOTE(review): the original append was left unfinished after the '+'; appending the
# confidence list is the assumed intent — confirm.
# NOTE(review): test_data[idx] matches logit_preds[idx] only if GenericDataLoader did not
# skip any sample — verify the two lengths agree.
CONFIDENCE_THRESHOLD = 0.8
res = []
for idx, sample_logits in enumerate(logit_preds):
    conf = get_confidence(sample_logits)
    if any(t < CONFIDENCE_THRESHOLD for t in conf):
        res.append([test_data[idx]] + conf)

In [80]:
# Re-read the source CSV as a DataFrame and drop rows with any missing value (the same
# filtering read_data applies), then show the row count and the first rows.
df = pd.read_csv(test_file, delimiter=',', header=0, encoding="utf8")
df = df.dropna()
print(len(df))
df.head()

6857


Unnamed: 0.1,Unnamed: 0,text,location,id,giai_tri,luu_tru,nha_hang,an_uong,van_chuyen,mua_sam
0,4417,Phòng ốc ở đây rộng rãi Công ty mình rất hay t...,4_all-topic.txt,4454,0,0,0,0,0,0
1,4418,Chủ quán dễ thương lắm. Nhà cô có vườn dâu nữa...,4_all-topic.txt,4455,0,0,0,0,0,0
2,4419,"Hôm nay đi hội sách cv Thống Nhất nè , cv siêu...",4_all-topic.txt,4456,0,0,0,0,0,0
3,4420,Ko gian ngoài trời thoáng mát. Bánh pizza to t...,4_all-topic.txt,4457,0,0,0,0,0,0
4,4421,Mình mới giặt sấy ở cửa hàng Laundromat TTV nà...,4_all-topic.txt,4458,0,0,0,0,0,0


In [81]:
# Remove 'Unnamed: 0' (presumably a saved index — confirm) and the six aspect columns;
# the aspect columns are re-added from the model predictions further down.
df = df.drop(columns=['Unnamed: 0', 'giai_tri', 'luu_tru', 'nha_hang', 'an_uong', 'van_chuyen', 'mua_sam'])

In [82]:
# One empty score column per rating aspect, in the same order as the CSV columns.
add_df = {
    aspect: []
    for aspect in ('giai_tri', 'luu_tru', 'nha_hang', 'an_uong', 'van_chuyen', 'mua_sam')
}

In [None]:
# Spread each sample's 6 predicted scores over the per-aspect columns of add_df.
# NOTE(review): the undefined name `predict_` is assumed to mean `score_preds`
# (the 6-column output of predict()) — confirm.
keys = list(add_df.keys())
# Convert once up front; works whether score_preds is still a tensor or already a list.
score_rows = score_preds.tolist() if hasattr(score_preds, "tolist") else score_preds
for score in score_rows:
    for position, key in enumerate(keys):
        add_df[key].append(score[position])

In [56]:
# Turn the column dict into a DataFrame (rebinding add_df to a new type) and preview it.
add_df = pd.DataFrame(add_df)
add_df.head()

Unnamed: 0,giai_tri,luu_tru,nha_hang,an_uong,van_chuyen,mua_sam
0,4,0,0,0,0,0
1,0,0,0,0,3,0
2,0,0,0,0,4,0
3,0,0,0,0,3,3
4,0,0,0,0,3,0


In [57]:
# The two frames are concatenated column-wise next, so their row counts should match.
print(len(df))
print(len(add_df))

231
231


In [58]:
# Put the review metadata and the predicted scores side by side.
# pd.concat(axis=1) aligns rows on index labels, and df keeps its original labels after
# dropna(), so both frames are renumbered 0..n-1 first to pair rows by position.
assert len(df) == len(add_df), "expected exactly one prediction row per review"
new_df = pd.concat([df.reset_index(drop=True), add_df.reset_index(drop=True)], axis=1)

In [59]:
# Preview the combined frame (metadata columns followed by the predicted aspect scores).
new_df.head()

Unnamed: 0,text,location,id,giai_tri,luu_tru,nha_hang,an_uong,van_chuyen,mua_sam
0,Một chuyến đi thật tuyệt vời và học được nhiều...,3_vanchuyen.txt,11326,4,0,0,0,0,0
1,Xe thoải mái. Đến lúc đêm nhưng lái xe không đ...,3_vanchuyen.txt,11327,0,0,0,0,3,0
2,Thẻ tiện lợi dễ sử dụng và dễ đổi ở sân bay.,3_vanchuyen.txt,11328,0,0,0,0,4,0
3,Mình không dùng visa thanh toán mà chọn t...,3_vanchuyen.txt,11329,0,0,0,0,3,3
4,Mua vé của travel phải in ra vé mới được đi tàu.,3_vanchuyen.txt,11330,0,0,0,0,3,0


In [60]:
# Write the combined frame to CSV without the index column.
# NOTE(review): the file name says 'vanchuyen' while test_file above points at the
# all-topic CSV — confirm the output path before re-running.
new_df.to_csv('Data_Add/vanchuyen_predict.csv', index=False)

In [11]:
def convert_logit(logit):
    """Pick the winning class index within each of the six 6-logit groups.

    Args:
        logit: 1-D tensor holding 36 logits (6 consecutive groups of 6).

    Returns:
        A 1-D tensor of 6 indices, each the argmax of one group.
    """
    winners = [torch.argmax(logit[start:start + 6]) for start in range(0, 36, 6)]
    return torch.stack(winners)

In [12]:
# Score a single review sentence and report one rating per aspect.
review_sentence = 'không gian hiện đại phục vu chu đáo'
# `encoded` rather than `input`, so the built-in input() is not shadowed.
encoded = tokenizer(review_sentence, return_tensors="pt", padding='max_length', truncation=True, max_length=300).to(device)
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    logit = model(**encoded)[0][0]
predict_results = convert_logit(logit).tolist()
print(predict_results)
# NOTE(review): the fifth aspect is named "di_chuyen" here but "van_chuyen" in the CSV
# columns — confirm which spelling downstream consumers expect.
RATING_ASPECTS = ["giai_tri", "luu_tru", "nha_hang", "an_uong", "di_chuyen", "mua_sam"]
output = {
    "review": review_sentence,
    # Pair each aspect name with the score at the same position.
    "results": dict(zip(RATING_ASPECTS, predict_results)),
}

print(output)

[0, 0, 4, 0, 0, 0]
{'review': 'không gian hiện đại phục vu chu đáo', 'results': {'giai_tri': 0, 'luu_tru': 0, 'nha_hang': 4, 'an_uong': 0, 'di_chuyen': 0, 'mua_sam': 0}}


In [15]:
# Confidence of the single-sentence prediction: the top softmax probability of each
# 6-logit aspect group, rounded to 4 decimals.
with torch.no_grad():
    logit = logit.to('cpu')
    conf = [
        round(max(torch.nn.functional.softmax(logit[start:start + 6], dim=0).tolist()), 4)
        for start in range(0, 36, 6)
    ]
# Flag the sentence when ANY aspect falls below the threshold. Each value has to be
# compared individually: any(conf) alone is just a bool.
if any(c < 0.7 for c in conf):
    print(1)

# Compare the two models: classification vs. regression

In [11]:
def convert_logit_to_score(logit):
    """Map a flat list of 36 values to six scores.

    Args:
        logit: List of 36 numbers, read as 6 consecutive groups of 6.

    Returns:
        A list of 6 ints: for each group, the position of its largest value
        (the first such position when there are ties).
    """
    groups = (logit[offset:offset + 6] for offset in range(0, 36, 6))
    return [group.index(max(group)) for group in groups]

In [9]:
# Settings for the classification-vs-regression comparison; same values as the earlier
# configuration cell except that test_file now points at a joblib dump.
# test_file = 'Data_Add/19-8_khachsan.csv'
# test_file = 'Data_Add/19-8_vanchuyen.csv'
test_file = 'Data/test_1.z'
model_name_or_path = 'model_0/model_with_remake_data'
max_seq_length = 512
num_labels = 36
per_device_eval_batch_size = 64
tokenizer_name = 'xlm-roberta-base'
# device = 'cpu'
device = accelerator.device

In [15]:
# Load the labelled test set; the path has no '.csv' in it, so read_data uses joblib.load.
# The loops below index each sample as sample[0] (text) and sample[1] (label vector).
testdata = read_data(test_file)

In [14]:
# Tokenizer shared by both comparison loops below.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-base/resolve/main/config.json from cache at /home/caohainam/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.dfaaaedc7c1c475302398f09706cbb21e23951b73c6e2b3162c1c8a99bb3b62a
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_vers

loading configuration file model_0/model_classification/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "model_0/model_classification",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    

In [40]:
%%time
# Run the classification checkpoint over testdata one sample at a time and record the
# indices of samples whose predicted scores differ from the labels.
model = AutoModelForSequenceClassification.from_pretrained('model_0/model_classification')
res_classification = []
for idx, sample in enumerate(testdata):
    # sample[0] is tokenized as the text; sample[1] is reduced to 6 scores as the label.
    label = convert_logit_to_score(sample[1])
    # `encoded` rather than `input`, so the built-in input() is not shadowed.
    encoded = tokenizer(sample[0], return_tensors="pt", padding='max_length', truncation=True, max_length=300)
    with torch.no_grad():
        logit = model(**encoded)[0][0].tolist()
    pred = convert_logit_to_score(logit)
    if not utils.compare_arrays(label, pred):
        res_classification.append(idx)

loading configuration file model_0/model_classification/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "model_0/model_classification",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    

CPU times: user 2h 55min 36s, sys: 9min 4s, total: 3h 4min 40s
Wall time: 4min 46s


In [41]:
%%time
# Run the regression checkpoint over testdata one sample at a time and record the
# indices of samples whose predicted scores differ from the labels.
model = AutoModelForSequenceClassification.from_pretrained('model_0/model_regression')
res_regression = []
for idx, sample in enumerate(testdata):
    # sample[0] is tokenized as the text; sample[1] is reduced to 6 scores as the label.
    label = convert_logit_to_score(sample[1])
    # `encoded` rather than `input`, so the built-in input() is not shadowed.
    encoded = tokenizer(sample[0], return_tensors="pt", padding='max_length', truncation=True, max_length=300)
    with torch.no_grad():
        logit = model(**encoded)[0][0]
    # Squash each output into [0, 5] and round to the nearest whole score. The result is
    # converted to a list of ints so the comparison sees the same types as in the
    # classification loop.
    # NOTE(review): assumes utils.compare_arrays compares element by element — confirm.
    pred = torch.round(5 * torch.sigmoid(logit)).long().tolist()
    if not utils.compare_arrays(label, pred):
        res_regression.append(idx)

loading configuration file model_0/model_regression/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "model_0/model_regression",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers

CPU times: user 2h 57min 28s, sys: 9min 29s, total: 3h 6min 58s
Wall time: 4min 49s


In [42]:
# Number of samples where the classification model's scores did not match the labels.
len(res_classification)

646

In [43]:
# Number of samples where the regression model's scores did not match the labels.
len(res_regression)

711

In [44]:
# Samples that only the classification model got wrong.
# A set makes each membership test constant-time instead of a scan of the whole list.
regression_errors = set(res_regression)
count = sum(1 for i in res_classification if i not in regression_errors)
print(count)

109


In [45]:
# Samples that only the regression model got wrong.
# A set makes each membership test constant-time instead of a scan of the whole list.
classification_errors = set(res_classification)
count = sum(1 for i in res_regression if i not in classification_errors)
print(count)

174
