In [1]:
import json

from scipy import stats

#### Read Google Dataset

In [2]:
# Load the human annotation files. Context managers make sure the file handles
# are closed, and the encoding is pinned because JSON text is UTF-8.
with open('../data/xsum_hallucination_annotations/factuality_annotations_xsum_summaries.json',
          encoding='utf-8') as factuality_file:
    factuality_data = json.load(factuality_file)
with open('../data/xsum_hallucination_annotations/hallucination_annotations_xsum_summaries.json',
          encoding='utf-8') as hallucination_file:
    hallucination_data = json.load(hallucination_file)

In [3]:
# Sanity check: number of annotation records loaded from each file.
print(len(factuality_data))
print(len(hallucination_data))

5597
11185


In [4]:
# Peek at the first factuality annotation record to see which fields it carries.
factuality_data[0]

{'bbcid': 29911712,
 'system': 'BERTS2S',
 'summary': 'more than 50 pupils at a bristol academy have been sent home from school because of a lack of uniform.',
 'is_factual': 'no',
 'worker_id': 'wid_0'}

In [5]:
# Peek at the first hallucination annotation record to see which fields it carries.
hallucination_data[0]

{'bbcid': 34687720,
 'system': 'BERTS2S',
 'summary': 'rory mcilroy will take a one-shot lead into the final round of the wgc-hsbc champions after carding a three-under',
 'hallucination_type': 'extrinsic',
 'hallucinated_span': 'rory mcilroy will take a one-shot lead into the final round of the wgc-hsbc champions after carding a three-under',
 'worker_id': 'wid_0'}

#### 

#### Read Calculated Probability

In [6]:
# Load the per-entity probability data. The context manager closes the file
# handle, and the encoding is pinned because JSON text is UTF-8.
with open('../data/Maynez_entity_data_with_prob.json', encoding='utf-8') as proba_file:
    google_data_with_proba = json.load(proba_file)

In [7]:
# Inspect the 'Gold' entry of one article: the data is keyed by the bbcid as a
# string, then by system name, and holds a list of entities with probabilities.
google_data_with_proba['34687720']['Gold']

{'summary': 'rory mcilroy moved to within a shot of joint leaders victor dubuisson and jaco van zyl after the third round of the turkish airlines open.',
 'summary_upper': 'Rory McIlroy moved to within a shot of joint leaders Victor Dubuisson and Jaco van Zyl after the third round of the Turkish Airlines open .',
 'ents': [{'start': 0,
   'end': 4,
   'label': -1,
   'type': 'PERSON',
   'ent': 'Rory',
   'bart.large': 0.380859375,
   'xsum_cmlm_bos': 0.93017578125,
   'cnndm_cmlm_cedar': 0.180908203125,
   'bart.large.xsum': 0.65087890625},
  {'start': 5,
   'end': 12,
   'label': -1,
   'type': 'PERSON',
   'ent': 'McIlroy',
   'bart.large': 0.9189453125,
   'xsum_cmlm_bos': 0.78173828125,
   'cnndm_cmlm_cedar': 0.81103515625,
   'bart.large.xsum': 0.806640625},
  {'start': 53,
   'end': 59,
   'label': -1,
   'type': 'PERSON',
   'ent': 'Victor',
   'bart.large': 0.0023136138916015625,
   'xsum_cmlm_bos': 0.000522613525390625,
   'cnndm_cmlm_cedar': 0.0006346702575683594,
   'bart.l

#### Calculate Factuality Correlation

In [8]:
import pickle
import numpy as np

In [9]:
# Second record: same bbcid/system/summary as record 0 but a different worker_id,
# i.e. one summary can have several annotation records.
factuality_data[1]

{'bbcid': 29911712,
 'system': 'BERTS2S',
 'summary': 'more than 50 pupils at a bristol academy have been sent home from school because of a lack of uniform.',
 'is_factual': 'no',
 'worker_id': 'wid_1'}

In [10]:
# Group the individual annotation records by article id and system:
#   factuality[bbcid][system] -> list of booleans, one per annotation record.
factuality = {}
for record_idx, record in enumerate(factuality_data):
    # setdefault creates the nested containers on first sight of a bbcid/system.
    votes = factuality.setdefault(record['bbcid'], {}).setdefault(record['system'], [])

    label = record['is_factual']
    if label == 'yes':
        votes.append(True)
    elif label == 'no' or label is None:
        # A missing judgement is counted the same way as an explicit 'no'.
        votes.append(False)
    else:
        # Carry the offending record index in the error itself so it is not lost.
        raise ValueError('Unknown Label: {} (record index {})'.format(label, record_idx))

In [11]:
def factuality_score(scores):
    """Return the mean of ``scores`` (fraction of True votes for booleans).

    If ``None`` appears anywhere in ``scores`` the result is 0.0.
    """
    return 0.0 if None in scores else sum(scores) / len(scores)

In [12]:
def factuality_score_proba(ents, prob_type='xsum_cmlm_bos'):
    """Return the smallest ``prob_type`` value found among the entities.

    Args:
        ents: list of entity dicts, each holding a value under ``prob_type``.
        prob_type: key of the probability to read from every entity.

    Returns:
        The minimum of the selected values, or 0.5 when ``ents`` is empty.
    """
    if len(ents) == 0:
        return 0.5
    return min(entity[prob_type] for entity in ents)

In [13]:
def factuality_score_knn(knn_model, ents, prior_name='bart.large', posterior_name='xsum_cmlm_bos'):
    """Score a summary by classifying each entity's (posterior, prior) pair.

    Args:
        knn_model: object exposing ``predict_proba`` on a 2-column feature matrix.
        ents: list of entity dicts containing ``prior_name`` and ``posterior_name``.
        prior_name: key of the prior probability in each entity.
        posterior_name: key of the posterior probability in each entity.

    Returns:
        0.5 when ``ents`` is empty; otherwise the minimum over entities of
        column 1 of ``predict_proba`` (presumably the positive-class
        probability -- confirm against how the classifier was trained).
    """
    if len(ents) == 0:
        return 0.5

    for entity in ents:
        assert prior_name in entity and posterior_name in entity

    posterior_probs = np.array([entity[posterior_name] for entity in ents])
    prior_probs = np.array([entity[prior_name] for entity in ents])
    # One row per entity; column 0 is the posterior, column 1 the prior.
    features = np.vstack([posterior_probs, prior_probs]).T

    class_probs = knn_model.predict_proba(features)  # [batch_size, 2]
    return np.min(class_probs[:, 1])

#     preds = knn_model.predict(x_mat)  # [batch_size, 2]
#     return np.min(preds)    

In [14]:
# Load the pre-trained classifier. The context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file whose
# origin is trusted.
with open('classifiers/knn_mlm_clm.pkl', 'rb') as model_file:
    knn_model = pickle.load(model_file)

In [15]:
# Worked example: the grouped annotator votes for one BERTS2S summary and the
# human factuality score computed from them.
print(factuality[29911712]['BERTS2S'])
print(factuality_score(factuality[29911712]['BERTS2S']))

[False, False, False]
0.0


In [16]:
# Same summary as scored by the two automatic metrics. Note that the probability
# data is keyed by the bbcid as a string, while `factuality` uses the int bbcid.
print(google_data_with_proba['29911712']['BERTS2S'])
print(factuality_score_proba(google_data_with_proba['29911712']['BERTS2S']['ents']))
# Last expression: the classifier-based score is shown as the cell output.
factuality_score_knn(knn_model, google_data_with_proba['29911712']['BERTS2S']['ents'])

{'summary': 'more than 50 pupils at a bristol academy have been sent home from school because of a lack of uniform.', 'summary_upper': 'More than 50 pupils at a Bristol Academy have been sent home from school because of a lack of uniform .', 'ents': [{'start': 0, 'end': 12, 'label': 0, 'type': 'CARDINAL', 'ent': 'More than 50', 'bart.large': 0.0021419525146484375, 'xsum_cmlm_bos': 0.0176849365234375, 'cnndm_cmlm_cedar': 0.0011053085327148438, 'bart.large.xsum': 0.0126800537109375}, {'start': 25, 'end': 40, 'label': 0, 'type': 'ORG', 'ent': 'Bristol Academy', 'bart.large': 3.0994415283203125e-06, 'xsum_cmlm_bos': 0.0010528564453125, 'cnndm_cmlm_cedar': 7.033348083496094e-06, 'bart.large.xsum': 0.00015664100646972656}]}
0.0010528564453125


0.5

In [23]:
# Collect, for every annotated summary of the target system, the human score and
# the two automatic scores. The three lists stay aligned by position.
EXCLUDED_BBCIDS = {33928888, 39553812}  # skipped by this analysis; the reason is not recorded in the notebook
TARGET_SYSTEM = 'BERTS2S'

human_factuality_scores = []
model_factuality_scores = []
knn_factuality_scores = []

for bbcid in factuality:
    if bbcid in EXCLUDED_BBCIDS:
        continue
    for system in factuality[bbcid]:
        if system != TARGET_SYSTEM:
            continue
        try:
            # The probability data is keyed by the bbcid as a string.
            ents = google_data_with_proba[str(bbcid)][system]['ents']
            human_score = factuality_score(factuality[bbcid][system])
            model_score = factuality_score_proba(ents)
            knn_score = factuality_score_knn(knn_model, ents)
        except Exception:
            # Report which summary failed, then re-raise: silently continuing
            # would leave truncated lists and a misleading correlation.
            print(bbcid)
            print(system)
            raise
        # Append only after all three scores exist so the lists cannot drift apart.
        human_factuality_scores.append(human_score)
        model_factuality_scores.append(model_score)
        knn_factuality_scores.append(knn_score)

assert len(human_factuality_scores) == len(model_factuality_scores) == len(knn_factuality_scores)

In [24]:
# Inspect the stored entity probabilities for one BERTS2S summary.
google_data_with_proba['40764446']['BERTS2S']

{'summary': 'a kenyan police officer has been shot dead by a gunman who broke into his farm home, police say.',
 'summary_upper': 'A Kenyan police officer has been shot dead by a gunman who broke into his farm home , police say .',
 'ents': [{'start': 2,
   'end': 8,
   'label': 2,
   'type': 'NORP',
   'ent': 'Kenyan',
   'bart.large': 0.0015163421630859375,
   'xsum_cmlm_bos': 0.90087890625,
   'cnndm_cmlm_cedar': 0.8818359375,
   'bart.large.xsum': 0.05084228515625}]}

In [25]:
# First few human factuality scores collected by the loop above.
human_factuality_scores[:10]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [26]:
# First few minimum-posterior scores (from factuality_score_proba).
model_factuality_scores[:6]

[6.377696990966797e-06,
 0.026458740234375,
 2.2590160369873047e-05,
 6.616115570068359e-06,
 0.0082855224609375,
 0.771484375]

In [27]:
# First few classifier-based scores (from factuality_score_knn).
knn_factuality_scores[:6]

[0.75, 0.0, 0.75, 0.75, 0.25, 1.0]

In [28]:
# Spearman rank correlation between the human scores and the minimum-posterior
# scores. Note this cell does not use knn_factuality_scores.
stats.spearmanr(human_factuality_scores, model_factuality_scores)

SpearmanrResult(correlation=0.18647470914050102, pvalue=4.490541602459233e-05)