### Path of model
- Change this to the path of the model you want to evaluate.

In [1]:
# Replace the value inside the quotes with the file path of the model you want to evaluate, then run all of the code below to generate the results. (Press the "Run All" button at the top.)
# The trained model was created inside the folder specified by model_save_path in config.json.
# Put the path of model_best here to measure its performance.

model_path = 'model/model_best'

### Set the parameters

In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from sklearn import metrics
from torch.utils.data import DataLoader

from model import CNNLstmBert
from data import preprocessing

In [None]:
# segment_size is handed to preprocessing() when the test sets are built;
# output_size is handed to the CNNLstmBert constructor.
# NOTE(review): presumably both must match the values used at training time — confirm.
segment_size = 100
output_size = 15

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the network, move it to the chosen device, and switch it to eval mode.
model = CNNLstmBert(output_size)
model = model.to(device)
model.eval()

## Evaluate

In [4]:
def evaluation(dataloader, model, device):
    """Compute per-punctuation and macro-averaged precision/recall/F1.

    Every batch of ``dataloader`` is run through ``model`` with gradient
    tracking disabled. The predicted class is the argmax over dimension 1 of
    the model's second return value; predictions and labels are flattened and
    accumulated over the whole dataset before scoring.

    Only class ids 1..14 are scored (one per punctuation mark listed below).
    Class id 0 is excluded from the metrics.
    NOTE(review): presumably id 0 means "no punctuation" — confirm against
    the preprocessing code.

    Args:
        dataloader: Iterable yielding ``(inputs, labels)`` tensor pairs.
        model: ``torch.nn.Module`` called as ``model(inputs, device)`` and
            returning a pair; the second element is the score tensor.
        device: Torch device the batches are moved to (also forwarded to
            the model call).

    Returns:
        pandas.DataFrame indexed by ``Precision``, ``Recall`` and ``F1`` with
        one column per punctuation mark plus an ``Entire`` column holding the
        macro average over those marks.
    """
    # Column order defines the class ids: the i-th mark (1-based) is class i.
    punc_names = ['，', '。', '！', '？', '；', '：', '“', '”', '…', '─', '、', '·', '《', '》']
    # Derive the scored ids from the names so the two can never disagree.
    puncs = list(range(1, len(punc_names) + 1))

    y_preds = []
    y_labels = []

    # Do not rely on an earlier notebook cell having called model.eval():
    # switch to eval mode here and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in tqdm(dataloader):
                inputs, labels = inputs.to(device), labels.to(device)
                _, output = model(inputs, device)
                y_preds.extend(output.argmax(dim=1).cpu().flatten().tolist())
                y_labels.extend(labels.cpu().flatten().tolist())
    finally:
        model.train(was_training)

    # average=None returns (precision, recall, f1, support) arrays per class;
    # only the first three are reported.
    per_class = metrics.precision_recall_fscore_support(
        y_labels, y_preds, average=None, labels=puncs)
    result = pd.DataFrame(
        np.array(per_class[:3]),
        columns=punc_names,
        index=['Precision', 'Recall', 'F1'],
    )

    # Macro average over the same scored classes, stored as an extra column.
    macro = metrics.precision_recall_fscore_support(
        y_labels, y_preds, average='macro', labels=puncs)
    result['Entire'] = macro[:3]
    return result

#### Evaluate on our testset

In [5]:
# Read the test file as a list of lines (line endings are kept).
with open('data/our_test.txt', 'r', encoding='utf-8') as f:
    test_data = list(f)

# preprocessing() is given a list of line-lists; here it holds this one file.
testsets = [test_data]
testset = preprocessing(testsets, segment_size)

# Fixed batch size of 200; shuffle is left as None, as in the original cell.
testloader = DataLoader(testset, batch_size=200, shuffle=None)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 171606/171606 [00:06<00:00, 25070.31it/s]


In [6]:
# Load the trained weights. map_location remaps the stored tensors onto the
# selected device, so a checkpoint saved on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

our_result = evaluation(testloader, model, device)
our_result

# The values below are the results of evaluating the model on the test set we created.

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.22it/s]


Unnamed: 0,，,。,！,？,；,：,“,”,…,─,、,·,《,》,Entire
Precision,0.766602,0.682886,0.3125,0.65534,0.190751,0.47451,0.452693,0.452381,0.428571,0.25,0.682353,1.0,0.757576,0.750693,0.561204
Recall,0.739544,0.769584,0.178571,0.789474,0.212903,0.53304,0.299038,0.3,0.06383,0.013514,0.774711,0.390244,0.657895,0.592998,0.451096
F1,0.75283,0.723647,0.227273,0.71618,0.20122,0.502075,0.360162,0.360759,0.111111,0.025641,0.725605,0.561404,0.704225,0.662592,0.473909


#### Evaluate on the original (IWSLT2012) test set

In [7]:
# Read the test file as a list of lines (line endings are kept).
with open('data/test_iwslt.txt', 'r', encoding='utf-8') as f:
    test_data = list(f)

# preprocessing() is given a list of line-lists; here it holds this one file.
testsets = [test_data]
testset = preprocessing(testsets, segment_size)

# Fixed batch size of 200; shuffle is left as None, as in the original cell.
testloader = DataLoader(testset, batch_size=200, shuffle=None)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47965/47965 [00:01<00:00, 25005.59it/s]


In [8]:
# Load the trained weights. map_location remaps the stored tensors onto the
# selected device, so a checkpoint saved on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

iwslt_result = evaluation(testloader, model, device)
iwslt_result
# The values below are the results of evaluating the model on the IWSLT2012 test set.

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.42it/s]


Unnamed: 0,，,。,！,？,；,：,“,”,…,─,、,·,《,》,Entire
Precision,0.446095,0.356516,0.121212,0.683871,0.0,0.182927,0.243243,0.176471,0.0,0.0,0.256198,0.272727,0.4,0.409091,0.253454
Recall,0.591779,0.591928,0.4,0.721088,0.0,0.285714,0.391304,0.268657,0.0,0.0,0.568807,0.666667,0.347826,0.321429,0.368229
F1,0.508712,0.445006,0.186047,0.701987,0.0,0.223048,0.3,0.213018,0.0,0.0,0.353276,0.387097,0.372093,0.36,0.289306
