In [1]:
import json
import pandas as pd
from utils import handle_reproducibility, preprocess
import unicodedata

In [2]:
# Load the processed QA records; later cells index into this list by question id.
# The records contain Chinese text, so decode the file explicitly as UTF-8 rather
# than relying on the platform's default text encoding.
with open('data/qa/processed_train_150_r2_pg0.json', 'r', encoding='utf-8') as f1:
    qa_data = json.load(f1)

In [3]:
# CSV locations of the ground-truth answers and of the predictions being analysed.
# Note: despite the `_dir` suffix, both names hold file paths.
gt_data_dir, pred_data_dir = (
    'qa_pred_test/gt.csv',
    'qa_pred_test/qa_c3_1.csv',
)

In [4]:
# Read the ground-truth file first, then the prediction file, into DataFrames.
gt, pred = (pd.read_csv(csv_path) for csv_path in (gt_data_dir, pred_data_dir))


In [5]:
# NFKC folds compatibility forms (for example full-width letters) into their plain
# equivalents; presumably this is what lets the ground-truth labels compare equal
# to the predicted ones.
# The vectorised .str accessor performs the same per-element normalisation as
# unicodedata.normalize, but passes missing values through as NaN instead of
# raising TypeError on them.
gt["answer"] = gt["answer"].str.normalize("NFKC")

In [6]:
# Preview the first five ground-truth rows.
gt.head(n=5)

Unnamed: 0,id,answer
0,657,A
1,658,C
2,121,A
3,122,B
4,632,C


In [7]:
# Preview the first five prediction rows.
pred.head(n=5)

Unnamed: 0,id,answer
0,657,A
1,658,C
2,121,A
3,122,A
4,632,C


## General Statistics

accuracy

In [8]:
# The comparison below is positional: row i of `gt` is scored against row i of
# `pred`. Fail loudly if the two files do not list the same question ids in the
# same order, because the resulting number would otherwise be silently wrong.
assert gt['id'].tolist() == pred['id'].tolist(), "gt and pred do not list the same ids in the same order"

# Accuracy = fraction of rows whose predicted label equals the ground-truth label.
# float() keeps the displayed value a plain Python float.
float((gt['answer'] == pred['answer']).mean())

0.6142857142857143

Ratio of negative and positive questions

In [9]:
# Fetch the full QA record for every evaluated question id.
# NOTE(review): this assumes the record for id k sits at position k - 1 in qa_data — confirm.
all_qas = [qa_data[qid - 1] for qid in gt['id'].values]

# A question is counted as "negative" when its stem contains 誤, 不 or 沒.
all_neg_qas = sum(
    1
    for record in all_qas
    if any(marker in record['question']['stem'] for marker in ('誤', '不', '沒'))
)
print(f"neg Q: {all_neg_qas}; pos Q: {len(pred) - all_neg_qas}")

neg Q: 45; pos Q: 25


## Wrong questions analysis

Hypothesis 1: neg Qs are harder to answer

In [10]:
# Keep only the prediction rows whose label disagrees with the ground truth,
# then pull out their question ids and predicted labels as arrays.
mismatch = gt['answer'] != pred['answer']
wrong_pred = pred[mismatch]
wrong_ids, wrong_ans = (wrong_pred[col].values for col in ('id', 'answer'))

In [11]:
# Collect the full QA record behind every misclassified id
# (the record for id k is read from qa_data[k - 1]).
wrong_qas = [qa_data[wrong_id - 1] for wrong_id in wrong_ids]

In [12]:
# Count the misclassified questions whose stem contains 誤, 不 or 沒;
# the remaining misclassified questions are reported as positive.
wrong_stems = [record['question']['stem'] for record in wrong_qas]
wrong_neg_qa = len(
    [stem for stem in wrong_stems if '誤' in stem or '不' in stem or '沒' in stem]
)
print(f"neg Q: {wrong_neg_qa}; pos Q: {len(wrong_qas) - wrong_neg_qa}")

neg Q: 18; pos Q: 9


In [13]:
# Error rate among negative questions: wrong negatives / all negatives.
neg_wrong_ratio = round(wrong_neg_qa / all_neg_qas, 2)
print(f"neg Q wrong ratio: {neg_wrong_ratio}")

# Error rate among positive questions: wrong positives / all positives.
wrong_pos_qa = len(wrong_qas) - wrong_neg_qa
all_pos_qas = len(pred) - all_neg_qas
pos_wrong_ratio = round(wrong_pos_qa / all_pos_qas, 2)
print(f"pos Q wrong ratio: {pos_wrong_ratio}")

neg Q wrong ratio: 0.4
pos Q wrong ratio: 0.36


## Wrong question types

- enough information, but too hard: 203, 672, 675(講錯)
- enough information but wrong: 122, 440, 312, '172'(neg Q), 505(來回改變), 509, 29, 409, 393, 217, 285, 142, 143, 500, 262, 39
- not enough info: 634, 313, 394, 216, 194, 676(太少), 624

In [14]:
# Print every misclassified question for manual review: id, dialogue text,
# question stem, answer choices, then the stored answer next to the prediction.
separator = '——————————————————————————————————————————————————————————————————————-'
for i, qa in enumerate(wrong_qas):
    print(separator)
    question = qa['question']['stem']
    print(qa['id'], qa['text'], question, qa['question']['choices'], sep='\n')
    # wrong_ans[i] is the predicted label for the i-th misclassified question.
    print(f"gt: {qa['answer']} pred: {wrong_ans[i]}")

——————————————————————————————————————————————————————————————————————-
122
個管師:嗯哼,然後你那時候有想過說,誒其實你自己可以來試試看嗎?民眾:因為那時候,基本上應該是很少,就是跟不認識的人做性行為,所以就比較少,想說誒好像還好。個管師:所以你現在固,就是有固砲?民眾:恩。個管師:然後你這個固砲多久了?民眾:前後應該半年多了吧。個管師:但你都只有跟他,你沒有再約,還是你有固砲但也有約?民眾:就約的話就是沒有打,沒有打砲。個管師:那其它工作忙沒有辦法請假,那我可能沒有辦法回來,沒有關係你不用太緊張。民眾:所以回診一樣都要是禮拜三跟禮拜五這兩個時間?
下列關於民眾的性關係，何者錯誤？
[{'text': '有固砲', 'label': 'A'}, {'text': '半年多，沒有打砲', 'label': 'B'}, {'text': '很少，跟不認識的人做性行為', 'label': 'C'}]
gt: Ｂ pred: A
——————————————————————————————————————————————————————————————————————-
634
醫師:你有看到就亮亮的。好,啊另外就是,肝臟的部分,肝臟的部分就是,這邊,還有這個地方顯影也比較明顯,可能可以反映出你肝發炎指數高。民眾:嗯。醫師:所以不是跟你開刀的地方。民眾:又不一樣?醫師:不過確實啦,你那時候應該⋯好吧應該就⋯民眾:骨頭那天在做的時候反而是類似種肩膀的這邊,他叫我抬高,我有點,這個肩膀會痛。醫師:還可以接受。民眾:對。可是你是胸椎啊。胸椎可能,會不會是因為肺部那個東西造成的。
請問何者不是醫師建議民眾追蹤的地方？
[{'text': '肩膀', 'label': 'A'}, {'text': '肝臟', 'label': 'B'}, {'text': '肺部', 'label': 'C'}]
gt: Ａ pred: C
——————————————————————————————————————————————————————————————————————-
440
醫師:好,今天,今天還沒打嘛?民眾:好很多很多了。真的要感謝你。你給我,那個讓,讓我可以在⋯不必住院。真的,不然我沒有那個,我的體質你