In [1]:
import json
import spacy

nlp = spacy.load('en_core_web_sm')

#### Read JSON Data

In [2]:
# Load both annotation files. The `with` blocks guarantee each file handle is
# closed as soon as its contents have been parsed.
with open('hallucination_data.json') as hallucination_file:
    hallucination_data = json.load(hallucination_file)
with open('factuality_data.json') as factuality_file:
    factuality_data = json.load(factuality_file)

In [3]:
# Report how many annotation records each file contributed
# (hallucination records first, factuality records second).
for annotation_records in (hallucination_data, factuality_data):
    print(len(annotation_records))

11184
5597


#### Extract Entity

In [4]:
from tqdm import tqdm

In [5]:
def process_ents(sentence):
    """Run the module-level spaCy pipeline ``nlp`` on ``sentence`` and return its entities.

    Non-PERSON entities are kept as spaCy reports them. A PERSON entity is
    split into one entry per whitespace-separated name part, so that each
    part of a name can be labelled on its own.

    Args:
        sentence (str): text to analyse.

    Returns:
        list[dict]: one dict per entity (or per name part) with keys
        ``start`` / ``end`` (character offsets into ``sentence``),
        ``type`` (the spaCy entity label) and ``label`` (initialised to -1,
        to be filled in by a later labelling step).
    """
    import re

    ents = []
    for ent in nlp(sentence).to_json()['ents']:
        if ent['label'] != 'PERSON':
            ents.append(ent)
            continue

        # Take offsets directly from the regex matches instead of assuming a
        # single space between name parts; this stays correct when parts are
        # separated by several whitespace characters.
        span_start = ent['start']
        for part in re.finditer(r'\S+', sentence[span_start: ent['end']]):
            ents.append({'start': span_start + part.start(), 'end': span_start + part.end(), 'label': 'PERSON'})

    for e in ents:
        # Move the spaCy label to 'type' and reset 'label' to the "unlabelled" marker.
        e['type'] = e['label']
        e['label'] = -1
    return ents

In [6]:
process_ents('Rory McIlroy will take a one-shot lead into the final round of the Wgc-Hsbc champions after carding a Three-Under')

[{'start': 0, 'end': 4, 'label': -1, 'type': 'PERSON'},
 {'start': 5, 'end': 12, 'label': -1, 'type': 'PERSON'},
 {'start': 25, 'end': 28, 'label': -1, 'type': 'CARDINAL'},
 {'start': 63, 'end': 75, 'label': -1, 'type': 'ORG'},
 {'start': 102, 'end': 107, 'label': -1, 'type': 'CARDINAL'}]

#### Build Hallucination & Factuality Dictionary

In [7]:
# Index the hallucination annotations as bbcid -> system -> [annotation records].
hallucination_dict = {}
for record in hallucination_data:
    per_system = hallucination_dict.setdefault(record['bbcid'], {})
    per_system.setdefault(record['system'], []).append(record)

In [27]:
hallucination_dict['11154244']['BERTS2S']

[{'bbcid': '11154244',
  'system': 'BERTS2S',
  'summary': 'one in five parents in england eat vegetables at home, a survey suggests.',
  'hallucination_type': 'extrinsic',
  'hallucinated_span': 'one in five parents',
  'worker_id': 'wid_0',
  'summary_upper': 'One in five parents in England eat vegetables at home , a survey suggests .',
  'hallucinated_span_upper': 'One in five parents'},
 {'bbcid': '11154244',
  'system': 'BERTS2S',
  'summary': 'one in five parents in england eat vegetables at home, a survey suggests.',
  'hallucination_type': 'extrinsic',
  'hallucinated_span': 'one in five parents',
  'worker_id': 'wid_1',
  'summary_upper': 'One in five parents in England eat vegetables at home , a survey suggests .',
  'hallucinated_span_upper': 'One in five parents'},
 {'bbcid': '11154244',
  'system': 'BERTS2S',
  'summary': 'one in five parents in england eat vegetables at home, a survey suggests.',
  'hallucination_type': 'extrinsic',
  'hallucinated_span': 'one in five par

In [9]:
# Index the factuality annotations as bbcid -> system -> [annotation records].
# A plain dict is used on purpose so that a missing bbcid/system raises KeyError.
factuality_dict = {}
for annotation in factuality_data:
    doc_id, system_name = annotation['bbcid'], annotation['system']
    factuality_dict.setdefault(doc_id, {}).setdefault(system_name, []).append(annotation)

In [28]:
factuality_dict['11154244']['BERTS2S']

[{'bbcid': '11154244',
  'system': 'BERTS2S',
  'summary': 'one in five parents in england eat vegetables at home, a survey suggests.',
  'is_factual': 'no',
  'worker_id': 'wid_0',
  'summary_upper': 'One in five parents in England eat vegetables at home , a survey suggests .'},
 {'bbcid': '11154244',
  'system': 'BERTS2S',
  'summary': 'one in five parents in england eat vegetables at home, a survey suggests.',
  'is_factual': 'no',
  'worker_id': 'wid_1',
  'summary_upper': 'One in five parents in England eat vegetables at home , a survey suggests .'},
 {'bbcid': '11154244',
  'system': 'BERTS2S',
  'summary': 'one in five parents in england eat vegetables at home, a survey suggests.',
  'is_factual': 'no',
  'worker_id': 'wid_2',
  'summary_upper': 'One in five parents in England eat vegetables at home , a survey suggests .'}]

In [11]:
def if_factual(bbcid, system):
    """Decide whether a system summary counts as factual.

    Looks up the annotation records stored in ``factuality_dict[bbcid][system]``
    and counts how many of them have ``is_factual == 'no'``.

    Args:
        bbcid: document id used as the first-level key of ``factuality_dict``.
        system: system name used as the second-level key.

    Returns:
        bool: True when at most one record says 'no', False otherwise.

    Raises:
        KeyError: if ``factuality_dict`` has no entry for ``bbcid`` / ``system``.
    """
    no_votes = sum(1 for d in factuality_dict[bbcid][system] if d['is_factual'] == 'no')
    # The two cases (<= 1 and > 1) are exhaustive, so no fallback branch is needed.
    return no_votes <= 1

In [12]:
if_factual('21267591', 'BERTS2S')

False

#### Label Entity

In [13]:
# FACTUALITY - IN SPAN - LABEL
# True       - True      true-hallucination
# True       - False     non-hallucination
# False      - True      false-hallucination (PROBLEM)
# False      - False     non-hallucination

# When there are multiple hallucinations in a False summary, it's possible that some hallucinations are true and others are false.
# To make things easier, we will first label all hallucinations in a false summary as false-hallucination.

In [14]:
# Build bbcid -> system -> {summary, summary_upper, ents}. Each (bbcid, system)
# pair is processed once, using the first hallucination record that mentions it.
entity_data = {}

for record in hallucination_data:
    systems = entity_data.setdefault(record['bbcid'], {})
    if record['system'] in systems:
        continue  # this summary has already been processed

    upper = record['summary_upper']
    found_ents = process_ents(upper)
    for ent in found_ents:
        # Store the surface text of the entity next to its offsets.
        ent['ent'] = upper[ent['start']: ent['end']]

    systems[record['system']] = {
        'summary': record['summary'],
        'summary_upper': upper,
        'ents': found_ents
    }

In [15]:
entity_data['29347895']['BERTS2S']

{'summary': 'veteran classical music conductor christopher hogwood has died at the age of 83.',
 'summary_upper': 'Veteran classical music conductor Christopher Hogwood has died at the age of 83 .',
 'ents': [{'start': 34,
   'end': 45,
   'label': -1,
   'type': 'PERSON',
   'ent': 'Christopher'},
  {'start': 46, 'end': 53, 'label': -1, 'type': 'PERSON', 'ent': 'Hogwood'},
  {'start': 66,
   'end': 79,
   'label': -1,
   'type': 'DATE',
   'ent': 'the age of 83'}]}

In [16]:
print(len(entity_data.keys()))

500


In [17]:
def read_document(bbcid, folder='/home/ml/cadencao/XSum/xsum-preprocessed/document/'):
    """Return the text of the file ``<bbcid>.document`` stored in ``folder``.

    Args:
        bbcid: document id (str or int); it is formatted into the file name.
        folder (str): directory holding the ``*.document`` files. Defaults to
            the location originally hard-coded in this notebook; pass another
            path to run on a different machine.

    Returns:
        str: the entire file content.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    import os

    # os.path.join works whether or not `folder` ends with a path separator.
    file_path = os.path.join(folder, '{}.document'.format(bbcid))
    with open(file_path, 'r') as f:
        return f.read()

In [18]:
read_document(11154244)

'Share this with\nEmail\nFacebook\nMessenger\nMessenger\nTwitter\nPinterest\nWhatsApp\nLinkedin\nCopy this link\nResearchers found that four out of five children in England who ate school lunches had tried food at school that they had not tried at home .\nHalf of parents questioned said their children had asked for foods they had eaten at school to be cooked at home .\nThe survey , of about 1,000 parents , found the most popular vegetables were carrots , sweetcorn and peas .\nAubergine , chickpeas and spinach were among the least popular .\nOf the parents questioned , 628 had children who ate school lunches .\n( % denotes proportion of parents who said their child would eat each vegetable )\nEngland \'s School Food Trust commissioned the research after a survey by the Mumsnet website suggested some parents gave their children packed lunches because they thought they were too fussy to eat anything else .\nTrust chairman , Rob Rees , said : " Every parent knows it \'s a nightmare watchin

In [19]:
# Label every entity of every system summary.
#
# FACTUALITY - IN SPAN - LABEL
# True       - True      1  true-hallucination
# True       - False     0  non-hallucination
# False      - True      2  false-hallucination
# False      - False     0  non-hallucination
#
# Entities of summaries that have no factuality annotation keep the initial label -1.

# An entity counts as "in span" when at least this many extrinsic span
# annotations contain it.
MIN_SPAN_MATCHES = 2

for bbcid in entity_data.keys():
    for system in entity_data[bbcid].keys():
        if system == 'Gold':
            continue

        summary = entity_data[bbcid][system]['summary_upper']

        # Collect (lower-cased span, true-cased span) pairs for the extrinsic annotations.
        extrinsic_spans = [
            (hd['hallucinated_span'], hd['hallucinated_span_upper'])
            for hd in hallucination_dict[bbcid][system]
            if hd['hallucination_type'] == 'extrinsic'
        ]

        # Skip summaries without factuality annotations explicitly instead of
        # hiding every possible error behind a bare `except`.
        if bbcid not in factuality_dict or system not in factuality_dict[bbcid]:
            continue

        # Factuality is decided per summary, so compute it once rather than per entity.
        summary_is_factual = if_factual(bbcid, system)

        for e in entity_data[bbcid][system]['ents']:
            entity = summary[e['start']: e['end']]

            # Substring match against either casing of each annotated span.
            match_count = sum(
                1 for s, su in extrinsic_spans
                if entity.lower() in s or entity in su
            )
            in_span = match_count >= MIN_SPAN_MATCHES

            if not in_span:
                e['label'] = 0 # non-hallucination
            elif summary_is_factual:
                e['label'] = 1 # true-hallucination
            else:
                e['label'] = 2 # false-hallucination

In [20]:
entity_data['13599161']['BERTS2S']

{'summary': 'shale gas drilling in lancashire has been suspended after a magnitude-7. 5 earthquake struck.',
 'summary_upper': 'Shale gas drilling in Lancashire has been suspended after a Magnitude-7 . 5 earthquake struck .',
 'ents': [{'start': 22,
   'end': 32,
   'label': 0,
   'type': 'ORG',
   'ent': 'Lancashire'},
  {'start': 74, 'end': 75, 'label': 2, 'type': 'CARDINAL', 'ent': '5'}]}

In [21]:
len(entity_data)

500

#### Save Data

In [22]:
import json

In [23]:
# Write the labelled entities to disk. The `with` block closes the file, so the
# JSON is fully flushed before any later cell reads it.
with open('entity_data.json', 'w') as entity_file:
    json.dump(entity_data, entity_file)

In [24]:
entity_data['37839562']['TranS2S']

{'summary': "the director of glasgow school of art ( gsa ) has said the fire which damaged glasgow school of art's ( gsa ) mackintosh",
 'summary_upper': "The director of Glasgow school of art ( GSA ) has said the fire which damaged Glasgow school of art 's ( GSA ) Mackintosh",
 'ents': [{'start': 16,
   'end': 23,
   'label': -1,
   'type': 'GPE',
   'ent': 'Glasgow'},
  {'start': 40, 'end': 43, 'label': -1, 'type': 'ORG', 'ent': 'GSA'},
  {'start': 78, 'end': 85, 'label': -1, 'type': 'GPE', 'ent': 'Glasgow'}]}