In [1]:
import json

#### Read Google Dataset

In [2]:
factuality_data = json.load(open('../Dataset/xsum_hallucination_annotations/factuality_annotations_xsum_summaries.json'))
hallucination_data = json.load(open('../Dataset/xsum_hallucination_annotations/hallucination_annotations_xsum_summaries.json'))

In [3]:
print(len(factuality_data))
print(len(hallucination_data))

5597
11185


In [4]:
factuality_data[0]

{'bbcid': 29911712,
 'system': 'BERTS2S',
 'summary': 'more than 50 pupils at a bristol academy have been sent home from school because of a lack of uniform.',
 'is_factual': 'no',
 'worker_id': 'wid_0'}

In [5]:
hallucination_data[0]

{'bbcid': 34687720,
 'system': 'BERTS2S',
 'summary': 'rory mcilroy will take a one-shot lead into the final round of the wgc-hsbc champions after carding a three-under',
 'hallucination_type': 'extrinsic',
 'hallucinated_span': 'rory mcilroy will take a one-shot lead into the final round of the wgc-hsbc champions after carding a three-under',
 'worker_id': 'wid_0'}

#### 

#### Read Calculated Probability

In [6]:
google_data_with_proba = json.load(open('google_data_with_proba.json'))

In [7]:
print(len(google_data_with_proba))

500


In [8]:
google_data_with_proba['34687720']['Gold']

{'summary': 'rory mcilroy moved to within a shot of joint leaders victor dubuisson and jaco van zyl after the third round of the turkish airlines open.',
 'summary_upper': 'Rory McIlroy moved to within a shot of joint leaders Victor Dubuisson and Jaco van Zyl after the third round of the Turkish Airlines open .',
 'ents': [{'start': 0,
   'end': 4,
   'label': -1,
   'type': 'PERSON',
   'ent': 'Rory',
   'prior': 0.380859375,
   'posterior': 0.93017578125},
  {'start': 5,
   'end': 12,
   'label': -1,
   'type': 'PERSON',
   'ent': 'McIlroy',
   'prior': 0.9189453125,
   'posterior': 0.78173828125},
  {'start': 53,
   'end': 59,
   'label': -1,
   'type': 'PERSON',
   'ent': 'Victor',
   'prior': 0.0023136138916015625,
   'posterior': 0.000522613525390625},
  {'start': 60,
   'end': 69,
   'label': -1,
   'type': 'PERSON',
   'ent': 'Dubuisson',
   'prior': 0.97119140625,
   'posterior': 0.82958984375},
  {'start': 74,
   'end': 78,
   'label': -1,
   'type': 'PERSON',
   'ent': 'Jaco

#### Claculate Factuality Correlation

In [9]:
factuality_data[1]

{'bbcid': 29911712,
 'system': 'BERTS2S',
 'summary': 'more than 50 pupils at a bristol academy have been sent home from school because of a lack of uniform.',
 'is_factual': 'no',
 'worker_id': 'wid_1'}

In [10]:
factuality = {}
for i, f in enumerate(factuality_data):
    if f['bbcid'] not in factuality:
        factuality[f['bbcid']] = {}
    if f['system'] not in factuality[f['bbcid']]:
        factuality[f['bbcid']][f['system']] = []
        
    if f['is_factual'] == 'yes':
        factuality[f['bbcid']][f['system']].append(True)
    elif f['is_factual'] == 'no':
        factuality[f['bbcid']][f['system']].append(False)
    elif f['is_factual'] is None:
        factuality[f['bbcid']][f['system']].append(False)
    else:
        print(i)
        raise Exception('Unkown Label: {}'.format(f['is_factual']))

In [11]:
hallucination = {}
for h in hallucination_data:
    if h['bbcid'] not in hallucination:
        hallucination[h['bbcid']] = {}
    if h['system'] not in hallucination[h['bbcid']]:
        hallucination[h['bbcid']][h['system']] = []
    
    if h['hallucination_type'] == 'extrinsic' and len(h['hallucinated_span']) < len(h['summary']):
        hallucination[h['bbcid']][h['system']].append(h['hallucinated_span'])

In [12]:
# entity in hallucination span (extrinsic), and summary false: false-hallucination
# entity in hallucination span (extrinsic), and summary true: true-hallucnination
# entity not in hallucination span and summary true: non-hallucination

In [13]:
google_data_with_proba['34687720']['BERTS2S']

{'summary': 'rory mcilroy will take a one-shot lead into the final round of the wgc-hsbc champions after carding a three-under',
 'summary_upper': 'Rory McIlroy will take a one-shot lead into the final round of the Wgc-Hsbc champions after carding a Three-Under',
 'ents': [{'start': 0,
   'end': 4,
   'label': 2,
   'type': 'PERSON',
   'ent': 'Rory',
   'prior': 0.379150390625,
   'posterior': 0.923828125},
  {'start': 5,
   'end': 12,
   'label': 2,
   'type': 'PERSON',
   'ent': 'McIlroy',
   'prior': 0.97119140625,
   'posterior': 0.7802734375},
  {'start': 25,
   'end': 28,
   'label': 2,
   'type': 'CARDINAL',
   'ent': 'one',
   'prior': 0.004116058349609375,
   'posterior': 0.1072998046875},
  {'start': 63,
   'end': 75,
   'label': 2,
   'type': 'ORG',
   'ent': 'the Wgc-Hsbc',
   'prior': 0.0,
   'posterior': 0.0},
  {'start': 102,
   'end': 107,
   'label': 2,
   'type': 'CARDINAL',
   'ent': 'Three',
   'prior': 1.138448715209961e-05,
   'posterior': 0.00046896934509277344}

In [14]:
factuality[34687720]['BERTS2S']

[False, False, False]

In [15]:
hallucination[34687720]['BERTS2S']

['the final round of the wgc-hsbc champions']

#### Evaluating

In [16]:
from utils_google_evaluation import read_document

In [17]:
def check_factual(scores):
    if None in scores: return False
#     if len(scores) == sum(scores):
#         return True
#     else:
#         return False

    if sum(scores) * 2 >= len(scores):
        return True
    else:
        return False

In [18]:
def check_hallucinated(entity, spans):
    for s in spans:
        if entity in s:
            return True
    return False

In [19]:
factual_label, hallucination_label, posterior_label = [], [], []
prior_probs, posterior_probs = [], []
overlap_preds, threshold_preds = [], []

for bbcid in google_data_with_proba:
    for system in google_data_with_proba[bbcid]:
        if int(bbcid) not in factuality or system not in factuality[int(bbcid)]: continue
    
        for e in google_data_with_proba[bbcid][system]['ents']:
            if 'posterior' not in e or e['posterior'] is None: continue
            
            is_factual = check_factual(factuality[int(bbcid)][system])
            is_hallucinated = check_hallucinated(e['ent'], hallucination[int(bbcid)][system])

            if is_factual and is_hallucinated:
                factual_label.append(1)
                hallucination_label.append(1)
            elif is_factual and not is_hallucinated:
                factual_label.append(1)
                hallucination_label.append(0)
            elif (not is_factual) and is_hallucinated:
                factual_label.append(0)
                hallucination_label.append(1)
            elif (not is_factual) and (not is_hallucinated):
                factual_label.append(0)
                hallucination_label.append(0)
            else:
                continue
            
            prior_probs.append(e['prior'])
            posterior_probs.append(e['posterior'])
            
            if e['posterior'] > e['prior']:
                posterior_label.append(1)
            else:
                posterior_label.append(0)
                
            if e['ent'].lower() in read_document(int(bbcid)).lower():
                overlap_preds.append(1)
            else:
                overlap_preds.append(0)
                
            if e['posterior'] > 0.4:
                threshold_preds.append(1)
            else:
                threshold_preds.append(0)

In [20]:
print(len(factual_label))
assert len(factual_label) == len(posterior_label)

5508


#### Overlap Baseline

In [21]:
from sklearn.metrics import classification_report

In [22]:
print(classification_report(factual_label, threshold_preds, target_names=['Non-factual', 'Factual'], digits=4))

              precision    recall  f1-score   support

 Non-factual     0.9426    0.5594    0.7022      5080
     Factual     0.1023    0.5958    0.1746       428

    accuracy                         0.5623      5508
   macro avg     0.5225    0.5776    0.4384      5508
weighted avg     0.8773    0.5623    0.6612      5508



In [23]:
print(classification_report(factual_label, overlap_preds, target_names=['Non-factual', 'Factual'], digits=4))

              precision    recall  f1-score   support

 Non-factual     0.9273    0.3915    0.5506      5080
     Factual     0.0809    0.6355    0.1435       428

    accuracy                         0.4105      5508
   macro avg     0.5041    0.5135    0.3470      5508
weighted avg     0.8615    0.4105    0.5190      5508



In [24]:
print(classification_report([1 if i == 0 else 0 for i in hallucination_label], overlap_preds, target_names=['Non-factual', 'Factual'], digits=4))

              precision    recall  f1-score   support

 Non-factual     0.2159    0.6652    0.3259       696
     Factual     0.9307    0.6505    0.7657      4812

    accuracy                         0.6523      5508
   macro avg     0.5733    0.6578    0.5458      5508
weighted avg     0.8404    0.6523    0.7102      5508



#### LM-based Baseline

In [25]:
print(classification_report(factual_label, posterior_label, target_names=['Non-factual', 'Factual'], digits=4))

              precision    recall  f1-score   support

 Non-factual     0.9376    0.3638    0.5242      5080
     Factual     0.0862    0.7126    0.1538       428

    accuracy                         0.3909      5508
   macro avg     0.5119    0.5382    0.3390      5508
weighted avg     0.8714    0.3909    0.4954      5508



In [26]:
print(classification_report([1 if i == 0 else 0 for i in hallucination_label], posterior_label, target_names=['Non-factual', 'Factual'], digits=4))

              precision    recall  f1-score   support

 Non-factual     0.1558    0.4411    0.2302       696
     Factual     0.8900    0.6542    0.7541      4812

    accuracy                         0.6273      5508
   macro avg     0.5229    0.5476    0.4922      5508
weighted avg     0.7972    0.6273    0.6879      5508



#### Load KNN Model

In [27]:
import numpy as np
import pickle

In [99]:
# load the model from disk

knn_model = pickle.load(open('classifiers/knn_main_factual_model.pkl', 'rb'))

In [100]:
prediction = knn_model.predict(np.array([prior_probs, posterior_probs]).T)

In [101]:
print(classification_report([1 if i == 0 else 0 for i in hallucination_label], prediction, target_names=['Non-factual', 'Factual'], digits=4))

              precision    recall  f1-score   support

 Non-factual     0.1904    0.4152    0.2611       696
     Factual     0.8980    0.7446    0.8141      4812

    accuracy                         0.7030      5508
   macro avg     0.5442    0.5799    0.5376      5508
weighted avg     0.8086    0.7030    0.7442      5508



In [102]:
# Main model: n = 7
#               precision    recall  f1-score   support

#  Non-factual     0.1904    0.4152    0.2611       696
#      Factual     0.8980    0.7446    0.8141      4812

#     accuracy                         0.7030      5508
#    macro avg     0.5442    0.5799    0.5376      5508
# weighted avg     0.8086    0.7030    0.7442      5508

In [103]:
print(classification_report(factual_label, prediction, target_names=['Non-factual', 'Factual'], digits=4))

              precision    recall  f1-score   support

 Non-factual     0.9407    0.2811    0.4329      5080
     Factual     0.0847    0.7897    0.1530       428

    accuracy                         0.3206      5508
   macro avg     0.5127    0.5354    0.2929      5508
weighted avg     0.8742    0.3206    0.4111      5508



In [31]:
# print(classification_report([1 if i == 0 else 0 for i in hallucination_label], prediction, target_names=['Non-factual', 'Factual'], digits=4))

In [32]:
# Our model:
#               precision    recall  f1-score   support

#  Non-factual     0.7426    0.5708    0.6454       657
#      Factual     0.5138    0.6963    0.5913       428

#     accuracy                         0.6203      1085
#    macro avg     0.6282    0.6335    0.6184      1085
# weighted avg     0.6523    0.6203    0.6241      1085

# LM + KNN
#               precision    recall  f1-score   support

#  Non-factual     0.6784    0.7900    0.7300       657
#      Factual     0.5687    0.4252    0.4866       428

#     accuracy                         0.6461      1085
#    macro avg     0.6236    0.6076    0.6083      1085
# weighted avg     0.6352    0.6461    0.6340      1085