In [1]:
import pandas as pd
from models import *
from tqdm import tqdm
tqdm.pandas()
from torch import nn
import json
import numpy as np
import pickle
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from argparse import Namespace
from transformers import *
import torch
import matplotlib.pyplot as plt
import torch.utils.data
import torch.nn.functional as F
import argparse
from transformers.modeling_utils import * 
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
from vncorenlp import VnCoreNLP
from utils import * 

In [2]:
# Inference configuration shared by every cell below.
# All paths are relative to the notebook's working directory except the
# VnCoreNLP jar, whose default is machine-specific; set the VNCORENLP_JAR
# environment variable to point at the jar on other machines.
args = Namespace(
    # '#'-separated CSV read with pandas; must provide a `text` column.
    test_path = 'data_clean.csv',
    # Dictionary file loaded into the fairseq `Dictionary`.
    dict_path = "./phobert_base/dict.txt",
    # Config file handed to `RobertaConfig.from_pretrained`.
    config_path = "./phobert_base/config.json",
    # Jar handed to `VnCoreNLP`; the original hard-coded location is kept as
    # the fallback so existing setups keep working unchanged.
    rdrsegmenter_path = os.environ.get(
        'VNCORENLP_JAR',
        '/home/tuna/FDM/MarketSentiment/PhoBert-Sentiment-Classification/VnCoreNLP-master/VnCoreNLP-1.1.1.jar',
    ),
    # Pretrained weights handed to `RobertaForAIViVN.from_pretrained`.
    pretrained_path = './phobert_base/model.bin',
    # Passed to `convert_lines` as the per-row sequence length.
    max_sequence_length = 256,
    # Batch size of the inference DataLoader.
    batch_size = 24,
    # Directory that holds the fine-tuned checkpoint `model_0.bin`.
    ckpt_path = './models',
    # Presumably read by `fastBPE(args)` as the BPE merge table — confirm
    # against the fairseq fastBPE wrapper.
    bpe_codes = "./phobert_base/bpe.codes"
)

In [3]:
# BPE encoder, configured from the shared `args` namespace.
bpe = fastBPE(args)

# VnCoreNLP segmenter started from the configured jar with only the "wseg"
# annotator (presumably word segmentation — confirm against the VnCoreNLP
# docs) and the JVM heap capped through '-Xmx500m'.
segmenter_options = {"annotators": "wseg", "max_heap_size": '-Xmx500m'}
rdrsegmenter = VnCoreNLP(args.rdrsegmenter_path, **segmenter_options)

./phobert_base/bpe.codes


In [4]:
# Load model
# Config is read from disk with hidden states exposed and a single output
# label (num_labels=1).
config = RobertaConfig.from_pretrained(
    args.config_path,
    output_hidden_states=True,
    num_labels=1
)
model_bert = RobertaForAIViVN.from_pretrained(args.pretrained_path, config=config)

# Run on the GPU when one is available, otherwise stay on the CPU instead of
# raising on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_bert.to(device)

# Handle on the model's `roberta` submodule.
tsfm = model_bert.roberta

In [5]:
# Build the token vocabulary: start from an empty fairseq Dictionary and fill
# it from the configured dictionary file.
vocab_file = args.dict_path
vocab = Dictionary()
vocab.add_from_file(vocab_file)

In [6]:
def _segment_text(raw_text):
    """Segment ``raw_text`` with ``rdrsegmenter`` and flatten it to one string.

    ``rdrsegmenter.tokenize`` yields one token sequence per sentence; the
    tokens of each sentence are joined with single spaces, and the sentences
    are then joined with single spaces as well.
    """
    sentences = rdrsegmenter.tokenize(raw_text)
    return ' '.join(' '.join(tokens) for tokens in sentences)


# Read the '#'-separated input and replace missing values with "###".
test_df = pd.read_csv(args.test_path, sep='#')
test_df = test_df.fillna("###")
print(test_df)

# Overwrite the `text` column with its segmented form (progress bar via tqdm),
# then convert every row to model input with `convert_lines`.
test_df["text"] = test_df["text"].progress_apply(_segment_text)
X_test = convert_lines(test_df, vocab, bpe, args.max_sequence_length)

  2%|▏         | 7/430 [00:00<00:06, 67.29it/s]

                                                  text sentiment_final
0    Xin thông báo khi nào hết dịch Corona thì lúc ...               P
1    5 mã trắng cửa bán rồi thì FLC tí nữa thôi là ...               P
2    Hihi theo a Quyết đến hết mùa dịch chắc cũng p...               P
3    Mai tây (ETFs) nó xả hàng ROS, mà a QC còn chơ...               N
4    Art, Hai, KLF lái con chốt, a e vô đỡ thấy thư...               N
..                                                 ...             ...
425  Tin vĩ mô:.- Bộ TC gia tăng gói hỗ trợ một số ...               P
426  T2 này khả năng thoát dớp nhưng về lâu dài phe...               N
427                             Giảm thêm 700 đáy chắc               N
428  Cuối tuần ngập tràn tin xấu.Cá Mập, đội lái đã...               N
429                                Vni lên là lên !!!!               P

[430 rows x 2 columns]


100%|██████████| 430/430 [00:03<00:00, 142.69it/s]
100%|██████████| 430/430 [00:00<00:00, 2365.59it/s]


In [7]:
# Score every row of X_test with the fold-0 checkpoint.

# Follow whatever device the model already lives on, so this cell works on
# both GPU and CPU-only machines.
device = next(model_bert.parameters()).device

# map_location lets a checkpoint saved on a GPU be loaded on any device.
state_dict = torch.load(os.path.join(args.ckpt_path, 'model_0.bin'), map_location=device)
model_bert.load_state_dict(state_dict)
model_bert.eval()

# shuffle=False keeps predictions in the same order as the rows of X_test.
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)

batch_outputs = []
pbar = tqdm(test_loader, total=len(test_loader), leave=False)
# Inference only: no_grad avoids building an autograd graph for each batch.
with torch.no_grad():
    for (x_batch,) in pbar:
        x_batch = x_batch.to(device)
        # NOTE(review): the mask treats token id 0 as "ignore"; confirm that
        # convert_lines pads with 0 (fairseq dictionaries commonly reserve a
        # different id for padding). Left unchanged to match the original.
        y_pred = model_bert(x_batch, attention_mask=(x_batch > 0))
        batch_outputs.append(y_pred.view(-1).cpu().numpy())

# Concatenate once instead of re-copying the growing array on every batch.
# The float64 cast reproduces the dtype the original list-seeded concatenation
# produced, so the printed and saved scores are unchanged.
if batch_outputs:
    preds_fold = np.concatenate(batch_outputs).astype(np.float64)
else:
    preds_fold = np.array([], dtype=np.float64)

# Map raw model outputs to (0, 1) scores.
preds_fold = sigmoid(preds_fold)
print(preds_fold)

                                               

[0.67722899 0.6837119  0.69720818 0.63185299 0.66825543 0.64536438
 0.68647396 0.68881821 0.61823149 0.69261806 0.66358323 0.66086594
 0.66897912 0.63223537 0.67260869 0.66865696 0.66988795 0.75793343
 0.7081559  0.69726558 0.6334864  0.69928145 0.69900164 0.70276776
 0.68616173 0.67808487 0.68686212 0.64393327 0.62991635 0.64595152
 0.6789204  0.69673243 0.65110739 0.65934951 0.69675033 0.67428666
 0.7006828  0.69872111 0.70466294 0.65996625 0.67299807 0.68408527
 0.6958926  0.68243259 0.63908698 0.654583   0.67387529 0.65794283
 0.69952491 0.6926459  0.69625293 0.69355109 0.6866958  0.65460756
 0.68632902 0.7011391  0.63234857 0.67673275 0.65577075 0.68115816
 0.62978256 0.72564897 0.65360723 0.68278406 0.70261478 0.72348574
 0.68097362 0.68454529 0.69903781 0.64326645 0.65006478 0.67512336
 0.69443938 0.67760886 0.70042047 0.77305541 0.64678992 0.66780546
 0.62973098 0.65839186 0.65123938 0.69159547 0.63839702 0.68777659
 0.69563063 0.67772907 0.66826608 0.66761622 0.6406341  0.6940



In [8]:
# Sanity check: the number of predictions must equal the number of rows in
# test_df, because the scores are assigned to it as a column afterwards.
len(preds_fold)

430

In [9]:
# Attach the sigmoid scores as a new 'result' column; rows line up by position
# because the inference DataLoader was built with shuffle=False.
test_df['result'] = preds_fold

In [12]:
# Write the scored frame without the index, using the same '#' delimiter the
# input was read with and UTF-8 with a BOM ('utf-8-sig').
output_csv = 'result.csv'
test_df.to_csv(output_csv, sep='#', encoding='utf-8-sig', index=False)