In [1]:
import json

#### Read Original Google Dataset

In [2]:
# Load the human annotation files. Using `with` closes each file handle as soon
# as its JSON has been parsed instead of leaving it open until garbage collection.
with open('../data/xsum_hallucination_annotations/factuality_annotations_xsum_summaries.json') as factuality_file:
    factuality_data = json.load(factuality_file)
with open('../data/xsum_hallucination_annotations/hallucination_annotations_xsum_summaries.json') as hallucination_file:
    hallucination_data = json.load(hallucination_file)

In [3]:
# Number of records in each annotation file, one count per line.
print(len(factuality_data), len(hallucination_data), sep='\n')

5597
11185


In [4]:
# Display a single factuality annotation record to show its fields.
factuality_data[2330]

{'bbcid': 33517280,
 'system': 'TranS2S',
 'summary': 'five men have been charged after a protest at heathrow airport led to the closure of a runway at heathrow airport.',
 'is_factual': 'no',
 'worker_id': 'wid_1'}

In [5]:
# Display the first hallucination annotation record to show its fields.
hallucination_data[0]

{'bbcid': 34687720,
 'system': 'BERTS2S',
 'summary': 'rory mcilroy will take a one-shot lead into the final round of the wgc-hsbc champions after carding a three-under',
 'hallucination_type': 'extrinsic',
 'hallucinated_span': 'rory mcilroy will take a one-shot lead into the final round of the wgc-hsbc champions after carding a three-under',
 'worker_id': 'wid_0'}

#### Read Calculated Probability

In [6]:
# Load the per-entity probability file; `with` guarantees the handle is closed.
with open('../data/Maynez_entity_data_with_prob.json', 'r') as proba_file:
    google_data_with_proba = json.load(proba_file)

In [7]:
# Number of top-level entries in the probability file (iterated as `bbcid` keys below).
print(len(google_data_with_proba))

500


In [8]:
# Give every entity record a uniform `prior` / `posterior` pair (mutates the
# loaded data in place):
#   prior     <- e['bart.large']
#   posterior <- e['xsum_cmlm_bos']
# Records missing any required score get None for both; the evaluation loop
# skips entities whose posterior is None.
#
# 'bart.large' is included in the guard because it is the key actually read for
# the prior; without it a record lacking that key raised KeyError.
# 'cnndm_cmlm_cedar' is kept so the set of scored entities is unchanged.
REQUIRED_SCORE_KEYS = ('cnndm_cmlm_cedar', 'xsum_cmlm_bos', 'bart.large')

for bbcid in google_data_with_proba:
    for system in google_data_with_proba[bbcid]:
        for e in google_data_with_proba[bbcid][system]['ents']:
            if all(key in e for key in REQUIRED_SCORE_KEYS):
                e['prior'] = e['bart.large']
                e['posterior'] = e['xsum_cmlm_bos']
            else:
                e['prior'] = None
                e['posterior'] = None

In [None]:
# Display the 'Gold' system entry stored for one document id.
google_data_with_proba['34687720']['Gold']

#### Calculate Factuality Correlation

In [None]:
# Display one more factuality annotation record.
factuality_data[1]

In [None]:
# Group worker judgements as factuality[bbcid][system] -> list of booleans.
# 'yes' counts as True; 'no' and a missing (None) answer both count as False.
# Any other value aborts after printing the index of the offending record.
factuality = {}
for index, record in enumerate(factuality_data):
    votes = factuality.setdefault(record['bbcid'], {}).setdefault(record['system'], [])

    verdict = record['is_factual']
    if verdict == 'yes':
        votes.append(True)
    elif verdict == 'no' or verdict is None:
        votes.append(False)
    else:
        print(index)
        raise Exception('Unkown Label: {}'.format(verdict))

In [None]:
# Group annotated spans as hallucination[bbcid][system] -> list of span strings.
# Every (bbcid, system) pair gets an entry, even if no span is kept for it.
# Only extrinsic spans that are strictly shorter than the summary are kept.
hallucination = {}
for annotation in hallucination_data:
    kept_spans = hallucination.setdefault(annotation['bbcid'], {}).setdefault(annotation['system'], [])

    if annotation['hallucination_type'] != 'extrinsic':
        continue
    span = annotation['hallucinated_span']
    if len(span) < len(annotation['summary']):
        kept_spans.append(span)

In [None]:
# Display the extrinsic hallucinated spans kept for one BERTS2S summary.
hallucination[34687720]['BERTS2S']

In [None]:
# entity in hallucination span (extrinsic), and summary false: false-hallucination
# entity in hallucination span (extrinsic), and summary true: true-hallucination
# entity not in hallucination span and summary true: non-hallucination

#### Evaluating

In [None]:
from utils import read_document

In [None]:
def check_factual(scores):
    """Decide whether a summary counts as factual from its worker votes.

    Args:
        scores: sequence of per-worker votes (truthy = factual).

    Returns:
        False if any vote is None; otherwise True when at least half of the
        votes are truthy (a tie counts as factual), else False.
    """
    if None in scores:
        return False
    # Majority rule with ties in favour of "factual": 2 * positives >= total.
    return 2 * sum(scores) >= len(scores)

In [None]:
def check_hallucinated(entity, spans):
    """Return True if `entity` occurs as a substring of any string in `spans`.

    The match is case-sensitive; an empty `spans` yields False.
    """
    return any(entity in span for span in spans)

In [None]:
# Build per-entity gold labels and baseline predictions.
#
# Gold labels (one entry per scored entity):
#   hallucination_label: 1 if the entity text occurs inside a kept extrinsic
#                        hallucinated span of its summary, else 0.
#   factual_label:       0 only when the entity is hallucinated AND the summary
#                        was judged non-factual; 1 in every other case.
# Predictions (aligned with the gold labels):
#   posterior_label: 1 if posterior > prior.
#   overlap_preds:   1 if the entity string occurs (case-insensitively) in the
#                    text returned by read_document.
#   threshold_preds: 1 if posterior > POSTERIOR_THRESHOLD.

# NOTE(review): machine-specific absolute path; adjust to the local XSum copy.
DOCUMENT_DIR = '/home/mcao610/scratch/summarization/XSum/xsum-preprocessed/document/'
EVALUATED_SYSTEMS = ['BERTS2S']
POSTERIOR_THRESHOLD = 0.4

factual_label, hallucination_label, posterior_label = [], [], []
prior_probs, posterior_probs = [], []
overlap_preds, threshold_preds = [], []

for bbcid in google_data_with_proba:
    for system in google_data_with_proba[bbcid]:
        doc_id = int(bbcid)  # annotation dicts are keyed by integer bbcid
        if doc_id not in factuality or system not in factuality[doc_id]:
            continue
        if system not in EVALUATED_SYSTEMS:
            continue

        # Summary-level factuality is identical for every entity of the summary,
        # so it is computed once instead of once per entity.
        is_factual = check_factual(factuality[doc_id][system])
        # The document is read lazily (only if some entity is actually scored)
        # and then reused for the remaining entities of this summary.
        document_text = None

        for e in google_data_with_proba[bbcid][system]['ents']:
            if 'posterior' not in e or e['posterior'] is None:
                continue

            is_hallucinated = check_hallucinated(e['ent'], hallucination[doc_id][system])
            if document_text is None:
                # presumably read_document returns the source article text; it is
                # defined in utils and not visible here.
                document_text = read_document(doc_id, DOCUMENT_DIR).lower()
            is_entity_in_document = e['ent'].lower() in document_text

            # Entities outside every hallucinated span are treated as factual even
            # when the summary as a whole was judged non-factual.
            factual_label.append(0 if (is_hallucinated and not is_factual) else 1)
            hallucination_label.append(1 if is_hallucinated else 0)

            prior_probs.append(e['prior'])
            posterior_probs.append(e['posterior'])

            posterior_label.append(1 if e['posterior'] > e['prior'] else 0)
            overlap_preds.append(1 if is_entity_in_document else 0)
            threshold_preds.append(1 if e['posterior'] > POSTERIOR_THRESHOLD else 0)

In [None]:
# Report how many entities were scored and check the prediction list is aligned.
n_scored_entities = len(factual_label)
print(n_scored_entities)
assert n_scored_entities == len(posterior_label)

#### Draw Diagram

In [None]:
%matplotlib inline

from draw import plot_scatter

In [None]:
# Map each (factual_label, hallucination_label) pair to a plotting class:
#   0 = factual, not hallucinated
#   1 = factual, hallucinated
#   2 = non-factual, hallucinated
# Any other combination is unexpected and trips the assertion.
CLASS_BY_GOLD_PAIR = {(1, 0): 0, (1, 1): 1, (0, 1): 2}

prior_posterior = []
for pos, pri, f, h in zip(posterior_probs, prior_probs, factual_label, hallucination_label):
    my_label = CLASS_BY_GOLD_PAIR.get((f, h), -1)
    assert my_label != -1
    prior_posterior.append({'prior': pri, 'posterior': pos, 'label': my_label})

In [None]:
# One list of (prior, posterior) points per plotting class 0, 1, 2, in the same
# order as the legend labels passed to plot_scatter.
input_data = [
    [(p['prior'], p['posterior']) for p in prior_posterior if p['label'] == class_id]
    for class_id in range(3)
]
labels = ['Non-hallucination', 'Hallucination True', 'Hallucination False']
plot_scatter(input_data, labels)

#### Overlap Baseline

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Posterior-threshold predictions scored against the factuality labels.
threshold_factual_report = classification_report(
    factual_label,
    threshold_preds,
    target_names=['Non-factual', 'Factual'],
    digits=4,
)
print(threshold_factual_report)

In [None]:
# Document-overlap predictions scored against the factuality labels.
overlap_factual_report = classification_report(
    factual_label,
    overlap_preds,
    target_names=['Non-factual', 'Factual'],
    digits=4,
)
print(overlap_factual_report)

In [None]:
# Document-overlap predictions scored against the hallucination labels.
# The gold labels are flipped first so that 1 means "entity not in a hallucinated span".
flipped_hallucination_gold = [int(flag == 0) for flag in hallucination_label]
print(classification_report(flipped_hallucination_gold, overlap_preds, target_names=['Non-factual', 'Factual'], digits=4))

#### LM-based Baseline

In [None]:
# "posterior > prior" predictions scored against the factuality labels.
posterior_factual_report = classification_report(
    factual_label,
    posterior_label,
    target_names=['Non-factual', 'Factual'],
    digits=4,
)
print(posterior_factual_report)

In [None]:
# "posterior > prior" predictions scored against the hallucination labels.
# The gold labels are flipped first so that 1 means "entity not in a hallucinated span".
flipped_hallucination_gold = [int(flag == 0) for flag in hallucination_label]
print(classification_report(flipped_hallucination_gold, posterior_label, target_names=['Non-factual', 'Factual'], digits=4))

#### Load KNN Model

In [None]:
import numpy as np
import pickle

In [None]:
def predict(knn_model, posteriors, priors):
    """Run `knn_model` on (posterior, prior) features scaled by their own spread.

    Args:
        knn_model: object exposing `predict(X)`.
        posteriors: sequence of posterior probabilities.
        priors: sequence of prior probabilities, aligned with `posteriors`.

    Returns:
        Whatever `knn_model.predict` returns for the feature matrix, which has
        one row per entity and the columns [posterior / std, prior / std].
    """
    posterior_arr = np.array(posteriors)
    prior_arr = np.array(priors)

    # Each feature is divided by the standard deviation of the values passed in
    # (no mean-centering).
    scaled_posteriors = posterior_arr / np.std(posterior_arr)
    scaled_priors = prior_arr / np.std(prior_arr)

    features = np.vstack((scaled_posteriors, scaled_priors)).T
    return knn_model.predict(features)

In [None]:
# Load the trained classifier from disk; `with` closes the file handle afterwards.
# NOTE: unpickling can execute arbitrary code, so only load a trusted file.
with open('classifiers/knn_mlm_clm.pkl', 'rb') as model_file:
    knn_model = pickle.load(model_file)

In [None]:
# Classify every scored entity from its (posterior, prior) pair with the loaded model.
prediction = predict(knn_model, posterior_probs, prior_probs)

In [None]:
# Model predictions scored against the hallucination labels.
# The gold labels are flipped, so class 0 = entity inside a hallucinated span and
# class 1 = entity outside one. `target_names` follows sorted label order
# (0 first), so the hallucinated class must be named first.
print(classification_report([1 if i == 0 else 0 for i in hallucination_label], prediction, target_names=['Hallucinated', 'Non-hallucinated'], digits=4))

In [None]:
# LM + KNN:
#                    precision    recall  f1-score   support

# Non-hallutionated     0.1417    0.3926    0.2083       135
#     Hallutionated     0.9123    0.7266    0.8089      1174

#          accuracy                         0.6921      1309
#         macro avg     0.5270    0.5596    0.5086      1309
#      weighted avg     0.8328    0.6921    0.7470      1309

# Main model (n=5):
#                    precision    recall  f1-score   support

# Non-hallutionated     0.1545    0.3778    0.2194       135
#     Hallutionated     0.9142    0.7624    0.8314      1174

#          accuracy                         0.7227      1309
#         macro avg     0.5344    0.5701    0.5254      1309
#      weighted avg     0.8359    0.7227    0.7683      1309


In [None]:
# Model predictions scored against the factuality labels.
knn_factual_report = classification_report(
    factual_label,
    prediction,
    target_names=['Non-factual', 'Factual'],
    digits=4,
)
print(knn_factual_report)

In [None]:
# LM + KNN (n=4)
#               precision    recall  f1-score   support

#  Non-factual     0.1389    0.3200    0.1937       125
#      Factual     0.9167    0.7905    0.8490      1184

#     accuracy                         0.7456      1309
#    macro avg     0.5278    0.5553    0.5213      1309
# weighted avg     0.8425    0.7456    0.7864      1309

# Main Model (n=4)
#               precision    recall  f1-score   support

#  Non-factual     0.1373    0.2240    0.1702       125
#      Factual     0.9122    0.8514    0.8807      1184

#     accuracy                         0.7914      1309
#    macro avg     0.5247    0.5377    0.5255      1309
# weighted avg     0.8382    0.7914    0.8129      1309