In [3]:
import math
import torch
import string
import torch.nn as nn

from fairseq.models.bart import BARTModel
from utils import read_lines, get_probability

In [4]:
# The same directory is passed both as the model location and as `data_name_or_path`,
# so keep it in a single constant instead of two literals that must stay in sync.
# NOTE(review): machine-specific absolute path — adjust for your environment.
BART_XSUM_DIR = '/home/ml/cadencao/Downloads/BART_models/bart.large.xsum'

bart = BARTModel.from_pretrained(BART_XSUM_DIR,
                                 checkpoint_file='model.pt',
                                 data_name_or_path=BART_XSUM_DIR)

In [5]:
# Same three steps in the same order (GPU, evaluation mode, half precision),
# chained because each call hands back the module it was invoked on.
bart.cuda().eval().half()
print('- activate evaluation mode')

- activate evaluation mode


In [6]:
# Short aliases for the model's text -> ids and ids -> text converters.
encode_func, decode_func = bart.encode, bart.decode

In [7]:
# Keep direct handles on the encoder and decoder so they can be called separately,
# then show the concrete class of the wrapped model and of both halves.
bart_encoder, bart_decoder = bart.model.encoder, bart.model.decoder
for module in (bart.model, bart_encoder, bart_decoder):
    print(type(module))

<class 'fairseq.models.bart.model.BARTModel'>
<class 'fairseq.models.transformer.TransformerEncoder'>
<class 'fairseq.models.transformer.TransformerDecoder'>


#### Read XSum

In [6]:
# Input files (roles per their names): source documents, targets, and predictions.
document_path = '/home/ml/cadencao/XSum/fairseq_files/test.source'
target_path = '/home/ml/cadencao/XSum/fairseq_files/test.target'
preds_path = 'preds/xsum_preds.hypo'

# Read the three files in the order source, target, predictions.
xsum_source, xsum_target, xsum_preds = (
    read_lines(path) for path in (document_path, target_path, preds_path)
)
print(len(xsum_source))

# All three files must contain the same number of entries.
distinct_sizes = {len(xsum_source), len(xsum_target), len(xsum_preds)}
assert len(distinct_sizes) == 1

11301


#### Tokenization

In [7]:
import torch.nn.functional as F

from fairseq.data.data_utils import collate_tokens

In [8]:
INDEX = 10066
test_inputs = [xsum_source[INDEX]]

# Encode every input, batch the results (id 1 is the padding id, padding on the left)
# and move the batch to the GPU.
encoded_inputs = [encode_func(doc) for doc in test_inputs]
src_tokens = collate_tokens(encoded_inputs, pad_idx=1, left_pad=True).cuda()
print(src_tokens.shape)

# Per-row count of positions that are not padding.
src_lengths = (src_tokens != 1).sum(dim=1)
print(src_lengths)

torch.Size([1, 217])
tensor([217], device='cuda:0')


In [9]:
def get_sequence_probs(sequence, document):
    """Score one sequence token by token, conditioned on one document.

    The document is run through ``bart_encoder``; the encoded sequence, shifted
    right by one position, is run through ``bart_decoder``; and the probability
    the decoder assigns to each token of the encoded sequence is collected.

    Args:
        sequence: text to score; converted to ids with ``encode_func``.
        document: source text passed to the encoder.

    Returns:
        A pair ``(token_probs, tgt_ids)``. ``tgt_ids`` is the 1-D id tensor
        produced by ``encode_func(sequence)`` (on the model's device) and
        ``token_probs`` is a list of floats with one probability per id.
    """
    inputs = [document]
    # Follow whatever device the model lives on instead of hard-wiring CUDA.
    device = next(bart_decoder.parameters()).device

    # tokenization
    src_tokens = collate_tokens([encode_func(i) for i in inputs], pad_idx=1, left_pad=True)
    src_tokens = src_tokens.to(device)
    src_lengths = torch.sum(src_tokens != 1, dim=1)

    # Prepend id 2 so that the decoder input is the target shifted right by one:
    # position k of the decoder output is then scored against tgt_ids[k].
    tgt_ids = encode_func(sequence).to(device)
    tgt_prefix = torch.tensor([2], dtype=torch.long, device=device)
    tgt = torch.cat([tgt_prefix, tgt_ids], dim=0)
    tgt_input = tgt[:-1]
    tgt_output = tgt[1:]

    # Pure inference: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        # encoding
        encoder_out = bart_encoder(src_tokens, src_lengths=src_lengths)

        # decoding
        decoder_outputs = bart_decoder(tgt_input.unsqueeze(0), encoder_out, features_only=False)
        logits = decoder_outputs[0]
        # Normalise in float32 so probabilities are not rounded to the model's
        # half-precision dtype.
        probs = F.softmax(logits, dim=2, dtype=torch.float32)

        # gather selected token probabilities
        token_probs = torch.gather(probs, 2, tgt_output.unsqueeze(0).unsqueeze(-1))

    # Remove only the batch dim and the gathered dim; a bare squeeze() would also
    # collapse the sequence dim if it ever had length 1.
    token_probs = token_probs.squeeze(0).squeeze(-1)

    assert token_probs.shape == tgt_ids.shape
    return token_probs.tolist(), tgt_ids.detach()

In [10]:
# Score the target summary of the inspected example against its source document.
token_probs, tgt_ids = get_sequence_probs(xsum_target[INDEX], xsum_source[INDEX])

In [11]:
# List each scored token with its probability; the first and last positions are left out.
inner_ids, inner_probs = tgt_ids[1:-1], token_probs[1:-1]
for i, (t, p) in enumerate(zip(inner_ids, inner_probs)):
    token = decode_func(t.unsqueeze(-1))
    print(f"- {i:02d}: {token} ({p:.2f})")

- 00: Police (0.20)
- 01:  investigating (0.25)
- 02:  the (0.81)
- 03:  disappearance (0.56)
- 04:  of (0.90)
- 05:  a (0.84)
- 06:  man (0.73)
- 07:  three (0.00)
- 08:  months (0.66)
- 09:  ago (0.90)
- 10:  say (0.08)
- 11:  they (0.60)
- 12:  have (0.40)
- 13:  yet (0.00)
- 14:  to (0.89)
- 15:  trace (0.02)
- 16:  the (0.23)
- 17:  sender (0.05)
- 18:  of (0.90)
- 19:  a (0.80)
- 20:  letter (0.76)
- 21:  claiming (0.22)
- 22:  he (0.57)
- 23:  was (0.54)
- 24:  dead (0.78)
- 25: . (0.77)


In [12]:
# Decode every inner id on its own to get one text piece per position, then look up
# the probability of the span within those pieces.
inner_probs = token_probs[1:-1]
tokens = [decode_func(tok_id.unsqueeze(-1)) for tok_id in tgt_ids[1:-1]]
get_probability('Cardiff City', tokens, inner_probs)

Target (Cardiff City) not found!!!


-1.0

In [13]:
# Display the raw target text of the inspected example.
xsum_target[INDEX]

'Police investigating the disappearance of a man three months ago say they have yet to trace the sender of a letter claiming he was dead.'

#### Hallucination Analysis

In [14]:
import json

In [15]:
# Load the annotated samples, one JSON object per line.
# Blank lines (for example a trailing empty line) are skipped rather than passed to
# json.loads, and the encoding is pinned so reading does not depend on the locale.
with open('hallucinated_span.jsonl', 'r', encoding='utf-8') as f:
    samples = [json.loads(line) for line in map(str.strip, f) if line]

In [16]:
# Inspect one annotated sample to see its fields.
samples[2]

{'id': 1431,
 'ents': ['Middlesbrough', 'Federico', 'Fazio'],
 'hallucinated': ['Federico']}

In [17]:
sample_probs = []
for s in samples:
    # Score the prediction stored under this sample's id against the matching source.
    doc_id = s['id']
    token_probs, tgt_ids = get_sequence_probs(xsum_preds[doc_id], xsum_source[doc_id])
    tokens = [decode_func(tok_id.unsqueeze(-1)) for tok_id in tgt_ids[1:-1]]

    # Look up each entity and file its probability under the label it was annotated with.
    sp = {'id': doc_id, 'ents': [], 'non-hallucinated': [], 'hallucinated': []}
    for e in s['ents']:
        p = get_probability(e, tokens, token_probs[1:-1])
        sp['ents'].append(p)
        bucket = 'hallucinated' if e in s['hallucinated'] else 'non-hallucinated'
        sp[bucket].append(p)
    sample_probs.append(sp)

In [18]:
# Inspect the computed probabilities for the first sample.
sample_probs[0]

{'id': 8805,
 'ents': [0.24124312947969884, 0.29296875, 0.46588326897472143],
 'non-hallucinated': [0.29296875, 0.46588326897472143],
 'hallucinated': [0.24124312947969884]}

In [19]:
# Flatten the per-sample probabilities into one list per label, keeping sample order.
false = [p for sp in sample_probs for p in sp['hallucinated']]
right = [p for sp in sample_probs for p in sp['non-hallucinated']]

In [20]:
def _print_prob_stats(title, probs):
    """Print the count, mean, max and min of ``probs`` under ``title``.

    For an empty list only the count is printed, since the mean would raise
    ZeroDivisionError and max()/min() would raise ValueError.
    """
    print(title)
    print('- Num: {}'.format(len(probs)))
    if not probs:
        return
    print('- Avg: {:.2f}'.format(sum(probs) / len(probs)))
    print('- Max: {:.2f}'.format(max(probs)))
    print('- Min: {:.2f}'.format(min(probs)))


_print_prob_stats('Hallucinated:', false)
_print_prob_stats('\nNon-hallucinated:', right)

Hallucinated:
- Num: 19
- Avg: 0.10
- Max: 0.26
- Min: 0.01

Non-hallucinated:
- Num: 64
- Avg: 0.52
- Max: 0.94
- Min: 0.13


In [21]:
# for s in sample_probs:
#     print('{}: {}'.format(s['id'], s['hallucinated']))

#### Classification

In [22]:
import spacy
from tqdm import tqdm
from sklearn.metrics import classification_report

# NOTE(review): 'en' is a shortcut model name; confirm the installed spaCy version still
# resolves it (newer releases expect a full package name such as 'en_core_web_sm').
nlp = spacy.load('en')

In [23]:
# Three parallel label lists, one entry per entity: the annotated label, the
# probability-based prediction and the NER-based prediction.
target, pred_prob, pred_ner = [], [], []

In [24]:
# Entities whose probability is at or below this value are predicted as hallucinated.
PROB_THRESHOLD = 0.25

# Start from empty lists so that re-running this cell cannot append duplicate labels.
target, pred_prob, pred_ner = [], [], []

assert len(samples) == len(sample_probs)
for s, sp in tqdm(zip(samples, sample_probs), total=len(samples)):
    assert s['id'] == sp['id']
    assert len(s['ents']) == len(sp['ents'])

    # The source document is the same for every entity of a sample, so run the NER
    # pipeline once per sample instead of once per entity.
    source_ents = {ent.text for ent in nlp(xsum_source[s['id']]).ents}

    for e, prob in zip(s['ents'], sp['ents']):
        # target
        target.append('hallucinated' if e in s['hallucinated'] else 'consistent')

        # probability
        # NOTE(review): get_probability appears to return -1.0 when an entity is not
        # found in the tokens; such entries fall below the threshold and are labelled
        # hallucinated — confirm this is intended.
        pred_prob.append('hallucinated' if prob <= PROB_THRESHOLD else 'consistent')

        # NER: an entity that is not among the source's named entities is hallucinated
        pred_ner.append('consistent' if e in source_ents else 'hallucinated')

30it [00:03,  9.22it/s]


In [25]:
# The three label lists must stay aligned entry by entry for the reports below.
assert len(target) == len(pred_prob) == len(pred_ner)

In [26]:
# Report for the probability-threshold predictions against the annotated labels.
print(classification_report(target, pred_prob))

              precision    recall  f1-score   support

  consistent       0.96      0.83      0.89        64
hallucinated       0.61      0.89      0.72        19

   micro avg       0.84      0.84      0.84        83
   macro avg       0.79      0.86      0.81        83
weighted avg       0.88      0.84      0.85        83



In [27]:
# Report for the NER-lookup predictions against the annotated labels.
print(classification_report(target, pred_ner))

              precision    recall  f1-score   support

  consistent       0.94      0.48      0.64        64
hallucinated       0.34      0.89      0.49        19

   micro avg       0.58      0.58      0.58        83
   macro avg       0.64      0.69      0.57        83
weighted avg       0.80      0.58      0.61        83



In [None]:
# A batch of two id sequences; the second row is filled up with id 1 after its last id 2.
# The values were pasted as a bare `tensor(...)` repr, which raises NameError when the
# cell runs because `tensor` is not defined; build it through `torch` and keep it under
# a name so the cell executes and the batch can be reused.
qa_batch = torch.tensor([
    [0, 32251, 559, 537, 16, 20, 1513, 11711, 6, 3028,
     6513, 5576, 7864, 6, 3059, 19, 116, 2, 6323, 6730,
     1952, 11, 20, 1513, 32, 3665, 7, 5, 6855, 1643,
     215, 25, 3028, 6513, 5576, 7864, 6, 2668, 4436, 571,
     11032, 324, 6, 4508, 2884, 4663, 8, 2150, 24266, 6,
     53, 89, 32, 67, 6730, 1952, 3665, 7, 5, 4165,
     1643, 215, 25, 871, 4802, 1417, 3239, 6, 4720, 5415,
     6, 7730, 229, 12614, 8, 23710, 118, 4849, 4, 2],
    [0, 1121, 1824, 6, 99, 3164, 9, 32631, 1253, 58,
     26, 7, 28, 11, 1327, 116, 2, 2765, 1824, 6,
     5169, 207, 9, 32631, 1253, 58, 303, 11, 1327, 4,
     2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
])
qa_batch

In [15]:
# Decode the complete first row: the question followed by the answer passage.
qa_pair_ids = [
    0, 32251, 559, 537, 16, 20, 1513, 11711, 6, 3028,
    6513, 5576, 7864, 6, 3059, 19, 116, 2, 6323, 6730,
    1952, 11, 20, 1513, 32, 3665, 7, 5, 6855, 1643,
    215, 25, 3028, 6513, 5576, 7864, 6, 2668, 4436, 571,
    11032, 324, 6, 4508, 2884, 4663, 8, 2150, 24266, 6,
    53, 89, 32, 67, 6730, 1952, 3665, 7, 5, 4165,
    1643, 215, 25, 871, 4802, 1417, 3239, 6, 4720, 5415,
    6, 7730, 229, 12614, 8, 23710, 118, 4849, 4, 2,
]
decode_func(torch.tensor(qa_pair_ids, dtype=torch.long))

'Which political party is The Times columnist, Daniel Finkelstein, associated with?Some columnists in The Times are connected to the Conservative Party such as Daniel Finkelstein, Tim Montgomerie, Matthew Parris and Matt Ridley, but there are also columnists connected to the Labour Party such as David Aaronovitch, Phil Collins, Oliver Kamm and Jenni Russell.'

In [16]:
# Decode only the leading part of the row, up to and including the first id 2.
question_ids = [
    0, 32251, 559, 537, 16, 20, 1513, 11711, 6, 3028,
    6513, 5576, 7864, 6, 3059, 19, 116, 2,
]
decode_func(torch.tensor(question_ids, dtype=torch.long))

'Which political party is The Times columnist, Daniel Finkelstein, associated with?'

In [17]:
# Decode the remaining part of the row, with id 0 placed in front of it.
answer_ids = [
    0, 6323, 6730,
    1952, 11, 20, 1513, 32, 3665, 7, 5, 6855, 1643,
    215, 25, 3028, 6513, 5576, 7864, 6, 2668, 4436, 571,
    11032, 324, 6, 4508, 2884, 4663, 8, 2150, 24266, 6,
    53, 89, 32, 67, 6730, 1952, 3665, 7, 5, 4165,
    1643, 215, 25, 871, 4802, 1417, 3239, 6, 4720, 5415,
    6, 7730, 229, 12614, 8, 23710, 118, 4849, 4, 2,
]
decode_func(torch.tensor(answer_ids, dtype=torch.long))

'Some columnists in The Times are connected to the Conservative Party such as Daniel Finkelstein, Tim Montgomerie, Matthew Parris and Matt Ridley, but there are also columnists connected to the Labour Party such as David Aaronovitch, Phil Collins, Oliver Kamm and Jenni Russell.'

In [18]:
# Decode a different id sequence with the same decoder to see what text it maps to.
other_ids = [
    0, 3828, 1964, 2151, 318, 383, 3782, 22309, 11,
    7806, 4463, 7750, 5714, 11, 3917, 351, 30, 2,
]
decode_func(torch.tensor(other_ids, dtype=torch.long))

' convicted itemsmonth If things coalition Ned in Overall disaster Mission250 inran won by'