In [1]:
import os
import sys
# Add the parent directory to the import search path before the project import
# below. NOTE(review): presumably dataset_preprocessing lives one level up from
# this notebook — confirm against the repository layout.
module_path = ".."
sys.path.append(module_path)

from dataset_preprocessing import make_keyphrase_dataset, make_bio_tagged_dataset

In [2]:
from tqdm import tqdm
import re
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, fbeta_score

In [3]:
# Name of the model being evaluated. It is interpolated into the prediction CSV
# filename, the predicted-keyword column name, and the evaluation banner.
MODEL_NAME = "KeyBERT"

In [4]:
def process_doc(doc):
    """Normalise a raw document string.

    Leading/trailing double quotes are stripped first, then newlines are turned
    into spaces, surrounding whitespace is removed, and every remaining run of
    whitespace is collapsed to a single space.

    Args:
        doc: The raw document text (a ``str``).

    Returns:
        The cleaned, single-line string.
    """
    processed_doc = doc.strip('"').replace("\n", " ").strip()
    # Raw string: a plain '\s' is an invalid escape sequence in a normal string
    # literal and triggers a SyntaxWarning on recent Python versions.
    processed_doc = re.sub(r'\s+', ' ', processed_doc)

    return processed_doc

def process_label(text):
    """Normalise a single keyword/label string.

    Leading/trailing double quotes are stripped first, then newlines are turned
    into spaces, surrounding whitespace is removed, and every remaining run of
    whitespace is collapsed to a single space.

    Args:
        text: The raw label text (a ``str``).

    Returns:
        The cleaned, single-line label.
    """
    processed_text = text.strip('"').replace("\n", " ").strip()
    # Raw string: a plain '\s' is an invalid escape sequence in a normal string
    # literal and triggers a SyntaxWarning on recent Python versions.
    processed_text = re.sub(r'\s+', ' ', processed_text)

    return processed_text

In [5]:
# Read the per-model results CSV; its filename is derived from MODEL_NAME.
prediction_csv_path = f"{MODEL_NAME}-KoAirBERT.csv"
prediction = pd.read_csv(prediction_csv_path)

In [6]:
# Select the three columns used in this notebook in one pass: the document
# body, the gold cause keywords, and the keywords predicted by MODEL_NAME.
documents, gold_standards, model_predictions = (
    prediction[column_name]
    for column_name in ("본문", "원인 키워드", f"{MODEL_NAME} 예측 키워드")
)

In [7]:
# NOTE(review): the loop that used to live in this cell wrote every
# "원인 키워드" value back onto itself through chained indexing
# (prediction[col][i] = prediction[col][i]). That changed no data, and chained
# assignment is the pattern pandas warns about (SettingWithCopyWarning) and
# ignores entirely under Copy-on-Write, so the loop has been dropped.
# If per-row label clean-up is wanted here, assign the whole column at once
# through .loc or .assign instead of writing cell by cell.

In [8]:
# Display the predicted-keyword column for a quick visual check of its raw
# format (each cell is one string of double-quoted, comma-separated phrases).
model_predictions

0      "경우에는 최종 확인을 한 후 진입을 했어야 하나,", "수도공항에서 출발을 위해 ...
1      "ΟΟΟ 항공기가 Push-back하여 이륙을 위해 지상 활주 중 속도지시계 이상으...
2                         "수 있다고", "바쁜", "위와", "CG", "후"
3      "readback 내용을 주의 깊게 경청하지 않는다면", "구별이 쉽지 않았고, 또...
4      "기압값을 설정하지 않아 지시된 12,000ft를 순간 적으로 (1~2초, 약", ...
                             ...                        
214                          "더", "맞다고", "역시", "TO", "후"
215    "'IGEDA'에서 NEXT WPT 인 'NF'으로 비행 중임을 설명하자", "En...
216    "활성화되어 있어 Autopilot을 해제했고 경고", "해보기로 했다. 여러 번 ...
217    "연료도", "알려주면서 연료는 2,800lbs가 추가로 소요된다고 하였다.", "...
218                    "들을 수", "분명히 정면", "때", "최초", "제주"
Name: KeyBERT 예측 키워드, Length: 219, dtype: object

In [9]:
# Sanity check on the first row: split the gold keyword cell on the '", "'
# separator and clean every piece with process_label.
first_gold_cell = prediction["원인 키워드"][0]
gold_labels = list(map(process_label, first_gold_cell.split('", "')))
gold_labels

['Z3를 F로 잘못 인지하여', '최종 확인을 한 후 진입을 했어야 하나, 확인 미흡', '승무원 상호 간 CRM 부족']

In [10]:
def _merge_bio_tags(tagged):
    """Relabel both the B- and I- "causal factor" tags as the single tag "key"."""
    tagged = tagged.replace(to_replace="B-causal factor", value="key")
    return tagged.replace(to_replace="I-causal factor", value="key")


# Build one keyphrase dataset from the gold column and one from the model's
# prediction column (duplication errors are ignored only for the predictions).
gold_standard_kp_dataset = make_keyphrase_dataset(prediction, col_name="원인 키워드")
model_prediction_kp_dataset = make_keyphrase_dataset(
    prediction,
    col_name=f"{MODEL_NAME} 예측 키워드",
    ignore_duplication_error=True,
)

# Convert both keyphrase datasets to their BIO-tagged form.
bio_tagged_gold_standard = make_bio_tagged_dataset(gold_standard_kp_dataset)
bio_tagged_model_prediction = make_bio_tagged_dataset(model_prediction_kp_dataset)

# Collapse the begin/inside distinction so both sides carry the same "key" tag.
bio_tagged_gold_standard = _merge_bio_tags(bio_tagged_gold_standard)
bio_tagged_model_prediction = _merge_bio_tags(bio_tagged_model_prediction)

100% 219/219 [00:00<00:00, 4360.54it/s]
100% 219/219 [00:00<00:00, 3340.19it/s]
100% 219/219 [00:02<00:00, 107.69it/s]
100% 219/219 [00:02<00:00, 85.03it/s]


In [11]:
# Score the model's tags against the gold tags. The two BIO-tagged frames must
# have the same number of rows for a row-by-row comparison to be meaningful.
if len(bio_tagged_gold_standard) == len(bio_tagged_model_prediction):
    print(f"Start Evaluation of {MODEL_NAME} model")

    # The "tag" column of each frame is what gets compared.
    gold_tags = bio_tagged_gold_standard["tag"]
    predicted_tags = bio_tagged_model_prediction["tag"]

    # (printed title, scoring function, extra keyword arguments for it)
    metrics = (
        ("precision_score", precision_score, {}),
        ("recall_score", recall_score, {}),
        ("f1_score", f1_score, {}),
        ("f2_score", fbeta_score, {"beta": 2}),
    )

    # Every metric is reported five ways, in this order: micro, macro and
    # weighted averages, the per-class array, and the score restricted to the
    # "key" label.
    averaging_variants = (
        {"average": "micro"},
        {"average": "macro"},
        {"average": "weighted"},
        {"average": None},
        {"average": "micro", "labels": ["key"]},
    )

    for metric_title, metric_fn, metric_kwargs in metrics:
        # A blank line separates each metric section from the previous output.
        print()
        print(metric_title)
        for variant in averaging_variants:
            print(metric_fn(gold_tags, predicted_tags, **metric_kwargs, **variant))
else:
    # Previously a length mismatch skipped the evaluation without any output;
    # report it so an empty result is not mistaken for a successful run.
    print(
        f"Cannot evaluate {MODEL_NAME} model: gold standard has "
        f"{len(bio_tagged_gold_standard)} tagged rows but model prediction has "
        f"{len(bio_tagged_model_prediction)}."
    )

Start Evaluation of KeyBERT model

precision_score
0.758836598666936
0.5293587185887528
0.765896764681491
[0.86332256 0.19539488]
0.19539487841618247

recall_score
0.758836598666936
0.5310966063689548
0.758836598666936
[0.85263863 0.20955458]
0.20955458112162473

f1_score
0.758836598666936
0.5300872532587927
0.7623021633903827
[0.85794734 0.20222717]
0.20222717149220487

f2_score
0.7588365986669361
0.5306575089046375
0.7602069125977273
[0.85475421 0.20656081]
0.20656080804404203
