In [1]:
import json
import math

from utils import read_lines

In [2]:
# Load the machine-specific path configuration. The context manager closes the
# file handle as soon as the JSON has been parsed instead of leaking it.
with open('../path_config.json') as config_file:
    PATH = json.load(config_file)

#### Read XSum

In [None]:
# Read the XSum test split from the directory configured under PATH['xsum_fariseq'].
# NOTE(review): 'xsum_fariseq' looks like a misspelling of "fairseq", but it has to
# match the key used in path_config.json -- confirm before renaming.
document_path, target_path = (
    PATH['xsum_fariseq'] + suffix for suffix in ('/test.source', '/test.target')
)
xsum_source = read_lines(document_path)
xsum_target = read_lines(target_path)

# Show how many source entries were read; the target file must have the same count.
print(len(xsum_source))
assert len(xsum_source) == len(xsum_target)

#### Evaluate Annotated Dataset

In [None]:
from tqdm import tqdm

In [None]:
# Load the annotated dataset. The context manager guarantees the file handle is
# closed once parsing finishes.
with open('../data/annotated_with_probability.json', 'r') as annotation_file:
    data = json.load(annotation_file)

# Number of top-level records in the annotation file.
print(len(data))

In [None]:
# Display one annotated record to inspect its layout
# (index 55 appears to be an arbitrary spot check).
data[55]

#### Prior/Posterior Distribution Diagram

In [None]:
%matplotlib inline

from draw import plot_scatter

In [None]:
# Flatten the per-record entity annotations into one list. Each entity dict is
# updated in place with the id of its parent record and with two convenience
# aliases: 'prior' and 'posterior', copied from the two model scores named below.
prior_posterior = []
for record in data:
    for entity in record['ents']:
        entity['id'] = record['id']
        entity['prior'] = entity['xsum_cmlm_scratch_cedar_warmup_20000']
        entity['posterior'] = entity['xsum_cmlm_bos']
    # The entities were modified in place, so the same dict objects are collected.
    prior_posterior.extend(record['ents'])

In [None]:
# One list of (prior, posterior) points per annotation label 0..3, in label order,
# so that input_data[k] lines up with labels[k].
input_data = [
    [(p['prior'], p['posterior']) for p in prior_posterior if p['label'] == label_id]
    for label_id in range(4)
]
labels = ['Non-hallucination', 'Hallucination True', 'Hallucination False', 'Intrinsic Hallucination']
plot_scatter(input_data, labels)

#### Compare CMLM

In [None]:
import matplotlib.pyplot as plt

import matplotlib
# Disable LaTeX text rendering so figure labels are drawn by matplotlib itself.
matplotlib.rcParams['text.usetex'] = False

In [None]:
# Side-by-side histograms of the 'prior' (blue) and 'posterior' (red) scores for
# label 0 (left panel) and label 1 (right panel); saved as figures/hist_2cmlm.pdf.
n_bins = 10
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(7.0, 2.0))

# Variant of the left panel whose legend labels use TeX macros, kept for reference.
# priors_0 = [p['prior'] for p in prior_posterior if p['label'] == 0]
# posteriors_0 = [p['posterior'] for p in prior_posterior if p['label'] == 0]
# _, bins, _ = ax0.hist(priors_0, n_bins, density=False, histtype='bar', color='blue', label=r'$\textrm{CMLM}_\textrm{CNN/DM}$', edgecolor='blue', alpha=0.55)
# ax0.hist(posteriors_0, bins=bins + 0.015, density=False, histtype='bar', color='red', label=r'$\textrm{CMLM}_\textsc{XSum}$', edgecolor='red', alpha=0.35)
# ax0.set_ylabel('Count', fontsize=13)
# ax0.set_title('Non-hallucinated Entities', fontsize=13)

# Left panel: entities with label 0.
priors_0 = [p['prior'] for p in prior_posterior if p['label'] == 0]
posteriors_0 = [p['posterior'] for p in prior_posterior if p['label'] == 0]
_, bins, _ = ax0.hist(priors_0, n_bins, density=False, histtype='bar', color='blue', label='CMLM trained on CNN/DM', edgecolor='blue', alpha=0.55)
# Reuse the first histogram's bin edges, shifted by 0.015, so the second set of
# bars is drawn slightly offset from the first.
ax0.hist(posteriors_0, bins=bins + 0.015, density=False, histtype='bar', color='red', label='CMLM trained on XSum', edgecolor='red', alpha=0.35)
ax0.set_ylabel('Count', fontsize=12)
ax0.set_title('Non-hallucinated Entities', fontsize=12)

# Right panel: entities with label 1. Only this panel shows a legend.
priors_1 = [p['prior'] for p in prior_posterior if p['label'] == 1]
posteriors_1 = [p['posterior'] for p in prior_posterior if p['label'] == 1]
_, bins, _ = ax1.hist(priors_1, n_bins, density=False, histtype='bar', color='blue', label='CMLM (CNN/DM)', edgecolor='blue', alpha=0.55)
ax1.hist(posteriors_1, bins=bins + 0.015, density=False, histtype='bar', color='red', label='CMLM (XSum)', edgecolor='red', alpha=0.35)
ax1.legend(prop={'size': 11})
# Title spelling corrected ("Factul" -> "Factual"); this text ends up in the saved PDF.
ax1.set_title('Factual Hallucinations', fontsize=12)

# Shared x-axis caption placed just below the two panels.
fig.text(0.5, -0.05, 'Posterior Probability', ha='center', fontsize=12)
fig.tight_layout()

# fig.text(0.0, 0.5, 'Count', va='center', rotation='vertical', fontsize=12)
plt.savefig("figures/" + 'hist_2cmlm' +'.pdf', bbox_inches="tight")
plt.show()

In [None]:
# Stacked (2x1) density histograms of the 'prior' (blue) and 'posterior' (red)
# scores for label 0 (top) and label 1 (bottom); saved as
# figures/hist_2cmlm_vertical.pdf.
n_bins = 10
fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, figsize=(5.0, 4.0))

# Top panel: entities with label 0. Only this panel shows a legend.
priors_0 = [p['prior'] for p in prior_posterior if p['label'] == 0]
posteriors_0 = [p['posterior'] for p in prior_posterior if p['label'] == 0]
_, bins, _ = ax0.hist(priors_0, n_bins, density=True, histtype='bar', color='blue', label='CMLM trained on CNN/DM', edgecolor='blue', alpha=0.55)
# Reuse the first histogram's bin edges, shifted by 0.015, so the second set of
# bars is drawn slightly offset from the first.
ax0.hist(posteriors_0, bins=bins + 0.015, density=True, histtype='bar', color='red', label='CMLM trained on XSum', edgecolor='red', alpha=0.35)
# ax0.set_ylabel('Count', fontsize=12)
ax0.set_title('Non-hallucinated Entities', fontsize=12)

# Bottom panel: entities with label 1.
priors_1 = [p['prior'] for p in prior_posterior if p['label'] == 1]
posteriors_1 = [p['posterior'] for p in prior_posterior if p['label'] == 1]
_, bins, _ = ax1.hist(priors_1, n_bins, density=True, histtype='bar', color='blue', label='CMLM trained on CNN/DM', edgecolor='blue', alpha=0.55)
ax1.hist(posteriors_1, bins=bins + 0.015, density=True, histtype='bar', color='red', label='CMLM trained on XSum', edgecolor='red', alpha=0.35)
ax0.legend(prop={'size': 11})
# Title spelling corrected ("Factul" -> "Factual"); this text ends up in the saved PDF.
ax1.set_title('Factual Hallucinations', fontsize=12)

# Shared x-axis caption placed just below the panels.
fig.text(0.5, -0.05, 'Posterior Probability', ha='center', fontsize=12)
fig.tight_layout()
# if save_fig:
#     plt.savefig("figures/" + taskname +'.pdf', bbox_inches="tight")

# Shared y-axis caption. Both panels are drawn with density=True, so the bar
# heights are normalised densities rather than raw counts.
fig.text(0.0, 0.5, 'Density', va='center', rotation='vertical', fontsize=12)
plt.savefig("figures/" + 'hist_2cmlm_vertical' +'.pdf', bbox_inches="tight")
plt.show()

In [None]:
# selected_entities = []
# for e in prior_posterior:
#     if e['label'] == 1:
#         if e['prior'] > 1e-5 and math.log(e['posterior'] / e['prior']) > 5:
#             selected_entities.append(e)
#         if e['prior'] > 0. and math.log(e['posterior'] / e['prior']) < 0:
#             selected_entities.append(e)
#         elif e['posterior'] - e['prior'] > 0.5:
#             selected_entities.append(e)

In [None]:
# print(len(selected_entities))
# print(selected_entities[0])

In [None]:
# json.dump(selected_entities, open('sigma_entities.json', 'w'))

#### Get Average Entropy

In [None]:
import math

from draw import plot_hist

In [None]:
# For each label 0..2, print the negative log of the mean 'prior' score.
# NOTE(review): this is -log(mean(p)), not mean(-log(p)), and it reads the
# 'prior' field -- confirm that this is the quantity the section intends.
for label_id in range(3):
    label_priors = [p['prior'] for p in prior_posterior if p['label'] == label_id]
    if not label_priors:
        # Avoid a ZeroDivisionError when no entity carries this label.
        print('- label {}: no entities'.format(label_id))
        continue
    print('- label {}: {}'.format(label_id, -math.log(sum(label_priors) / len(label_priors))))

In [None]:
# Negative log of the 'posterior' score, grouped by label 0..2. Scores that are
# not strictly above the threshold are left out.
threshold = 0.0001
posteriors = [
    [
        -math.log(p['posterior'])
        for p in prior_posterior
        if p['label'] == label_id and p['posterior'] > threshold
    ]
    for label_id in range(3)
]

In [None]:
# Negative log of the 'prior' score, grouped by label 0..2. Scores that are not
# strictly above the threshold are left out.
threshold = 1e-7
priors = [
    [
        -math.log(p['prior'])
        for p in prior_posterior
        if p['label'] == label_id and p['prior'] > threshold
    ]
    for label_id in range(3)
]

In [None]:
# Plot the per-label negative-log score lists (posteriors first, then priors).
# save_fig=True presumably makes draw.plot_hist write the figure to disk -- confirm in draw.py.
plot_hist('histogram', posteriors, priors, save_fig=True)

#### Evaluate Baseline Models

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Display the first flattened entity record to check which fields are available.
prior_posterior[0]

In [None]:
# Build parallel lists over every entity whose label is set and is not 3:
#   true_label          - the raw label (0, 1 or 2)
#   factual_label       - 1 for labels 0 and 1, 0 for label 2
#   hallucination_label - 0 for label 0, 1 for labels 1 and 2
#   prior/posterior_probabilities - the entity's 'prior' / 'posterior' scores
true_label, factual_label, hallucination_label = [], [], []
prior_probabilities, posterior_probabilities = [], []

for entity in prior_posterior:
    label = entity['label']
    if label is None or label == 3:
        continue
    # Any remaining value other than 0, 1 or 2 is unexpected.
    if label not in (0, 1, 2):
        raise Exception("ERROR! {}".format(label))

    factual_label.append(0 if label == 2 else 1)
    hallucination_label.append(0 if label == 0 else 1)
    true_label.append(label)
    prior_probabilities.append(entity['prior'])
    posterior_probabilities.append(entity['posterior'])

In [None]:
# Two baseline predictions per entity, over the same entities (label set and
# not 3) and in the same order as the label lists:
#   overlap_preds     - 1 when the entity text occurs (case-insensitively) in
#                       the source document looked up by the entity's id
#   lm_baseline_preds - 1 when the 'posterior' score exceeds the 'prior' score
lm_baseline_preds = []
overlap_preds = []

for entity in prior_posterior:
    if entity['label'] is None or entity['label'] == 3:
        continue

    source = xsum_source[entity['id']]
    overlap_preds.append(1 if entity['ent'].lower() in source.lower() else 0)
    lm_baseline_preds.append(1 if entity['posterior'] > entity['prior'] else 0)

In [None]:
# Overlap baseline scored against factual_label (0 = non-factual, 1 = factual).
print(classification_report(factual_label, overlap_preds, target_names=['Non-factual', 'Factual'], digits=4))

In [None]:
# Overlap baseline scored against the flipped hallucination labels
# (1 = non-hallucinated, 0 = hallucinated). classification_report assigns
# target_names to the sorted label values, so class 0 must be named 'Hallucinated'.
print(classification_report([1 if i == 0 else 0 for i in hallucination_label], overlap_preds,
                            target_names=['Hallucinated', 'Non-hallucinated'], digits=4))

In [None]:
# Posterior-greater-than-prior baseline scored against factual_label (0 = non-factual, 1 = factual).
print(classification_report(factual_label, lm_baseline_preds, target_names=['Non-factual', 'Factual'], digits=4))

In [None]:
# Posterior-greater-than-prior baseline scored against the flipped hallucination
# labels (1 = non-hallucinated, 0 = hallucinated). classification_report assigns
# target_names to the sorted label values, so class 0 must be named 'Hallucinated'.
print(classification_report([1 if i == 0 else 0 for i in hallucination_label], lm_baseline_preds,
                            target_names=['Hallucinated', 'Non-hallucinated'], digits=4))

### KNN Evaluation

In [None]:
import numpy as np
import matplotlib

from sklearn import neighbors
from draw import plot, plot_three, plot_three_with_boundary

In [None]:
def leave_one_out_error(prior_probs, posterior_probs, labels, n_neighbors=15):
    """Predict every sample's label with a KNN fitted on all the other samples.

    Args:
        prior_probs: sequence of prior scores, one per sample.
        posterior_probs: sequence of posterior scores, same length.
        labels: sequence of class labels, same length.
        n_neighbors: number of neighbours given to KNeighborsClassifier.

    Returns:
        A list with one predicted label per sample, in input order. Despite the
        function name, no error rate is computed here.
    """
    n_samples = len(prior_probs)
    assert n_samples == len(posterior_probs) == len(labels)

    predictions = []
    for held_out in range(n_samples):
        # Training set: every sample except the held-out one, as [prior, posterior] rows.
        kept = [idx for idx in range(n_samples) if idx != held_out]
        x_mat = np.array([[prior_probs[idx], posterior_probs[idx]] for idx in kept])
        y_vec = np.array([labels[idx] for idx in kept])

        # A fresh classifier is fitted for each held-out sample.
        model = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='auto')
        model.fit(x_mat, y_vec)

        query = np.array([[prior_probs[held_out], posterior_probs[held_out]]])
        predictions.append(model.predict(query)[0])

    return predictions

In [None]:
# Flip the hallucination labels so that 1 = non-hallucinated and 0 = hallucinated,
# then report leave-one-out KNN predictions against them.
hallucination_label_reverse = [0 if flag != 0 else 1 for flag in hallucination_label]
knn_preds = leave_one_out_error(
    prior_probabilities,
    posterior_probabilities,
    hallucination_label_reverse,
    n_neighbors=16,
)
print(
    classification_report(
        hallucination_label_reverse,
        knn_preds,
        target_names=['Hallucinated', 'Non-hallucinated'],
        digits=4,
    )
)

In [None]:
# Leave-one-out KNN predictions for factuality (0 = non-factual, 1 = factual).
# Note that this rebinds knn_preds.
knn_preds = leave_one_out_error(
    prior_probabilities,
    posterior_probabilities,
    factual_label,
    n_neighbors=12,
)
print(
    classification_report(
        factual_label,
        knn_preds,
        target_names=['Non-Factual', 'Factual'],
        digits=4,
    )
)

In [None]:
# Global matplotlib font settings applied to the figures drawn after this cell.
font = dict(weight='normal', size=8)
matplotlib.rc('font', **font)

In [None]:
# Three-class plot (labels 0, 1, 2) with the KNN decision boundary.
# Legend spelling corrected ("Hallucinataion" -> "Hallucination").
# NOTE(review): which of the two probability lists ends up on which axis is
# decided inside draw.plot_three_with_boundary -- confirm the axis labels match.
plot_three_with_boundary('entity_distribution_2cmlm',
                         posterior_probabilities, prior_probabilities, true_label,
                         colors=['blue', 'darkgreen', 'red'],
                         legend_labels=['Non-hallucinated', 'Factual Hallucination', 'Non-factual Hallucination'],
                         x_label='CMLM trained on CNN/DM',
                         y_label='CMLM trained on XSum',
                         n_neighbors=16,
                         fig_size=(4.5, 3.5),
                         interval=0.25, h=0.05,
                         save_figure=True)

In [None]:
# Three-class plot (labels 0, 1, 2) of the prior/posterior scores.
# Legend spelling corrected ("Hallucinataion" -> "Hallucination").
plot_three('entity_distribution_mlm_cmlm',
           posterior_probabilities, prior_probabilities, true_label,
           colors=['blue', 'darkgreen', 'red'],
           x_label='Prior Probability',
           y_label='Posterior Probability',
           legend_labels=['Non-hallucinated', 'Factual Hallucination', 'Non-factual Hallucination'],
           n_neighbors=10,
           fig_size=(4.5, 3.5),
           save_figure=True)

In [None]:
# Two-class plot using the flipped hallucination labels built inline
# (1 = non-hallucinated, 0 = hallucinated); legend_labels presumably follow
# the 0, 1 class order -- confirm in draw.plot.
# NOTE(review): no save_figure argument is passed here, unlike the plot cells around it.
plot('Hallucination Entity Classification',
     posterior_probabilities, prior_probabilities, [1 if i == 0 else 0 for i in hallucination_label], 
     n_neighbors=12, fig_size=(4.5, 3.5), colors=['red', 'blue'], legend_labels=['Hallucinated', 'Non-Hallucinated'])

In [None]:
# Two-class plot against factual_label (0 = non-factual, 1 = factual).
# NOTE(review): the axis labels mention "CLM" and "CMLM" posteriors, while the
# inputs are the 'posterior' and 'prior' fields assigned when prior_posterior
# was built -- confirm the labels and the file name are still accurate.
plot('compare_mlm_and_cmlm', posterior_probabilities, prior_probabilities, factual_label, n_neighbors=12, 
     fig_size=(4.5, 3.5), colors=['red', 'blue'], legend_labels=['Non-factual', 'Factual'],
     x_label='CLM Posterior Probability', y_label='CMLM Posterior Probability', save_figure=True)

#### Save KNN Model

In [None]:
import pickle

In [None]:
def build_KNN(posteriors, priors, labels, n_neighbors=15):
    """Fit a k-nearest-neighbours classifier on (posterior, prior) feature pairs.

    Args:
        posteriors: sequence of posterior scores, one per sample.
        priors: sequence of prior scores, same length.
        labels: sequence of class labels, same length.
        n_neighbors: number of neighbours given to KNeighborsClassifier.

    Returns:
        The fitted KNeighborsClassifier. Feature column 0 is the posterior and
        column 1 is the prior, so inputs to predict() must use the same order.
    """
    posterior_arr = np.array(posteriors)
    prior_arr = np.array(priors)

    # Features are used unscaled. A variant that divided each column by its
    # standard deviation was tried and is left disabled:
    #     np.vstack([posteriors / np.std(posteriors), priors / np.std(priors)]).transpose()
    features = np.vstack([posterior_arr, prior_arr]).T
    targets = np.array(labels)

    model = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='auto')
    model.fit(features, targets)
    return model

In [None]:
# Fit the classifier on all selected entities, using the flipped hallucination
# labels (1 = non-hallucinated, 0 = hallucinated) as targets.
# NOTE(review): n_neighbors=5 here, while the leave-one-out evaluation used 16 -- confirm intended.
knn_classifier = build_KNN(posterior_probabilities, prior_probabilities, hallucination_label_reverse, n_neighbors=5)

In [None]:
# Predict on the same (posterior, prior) pairs the classifier was fitted on,
# keeping the column order used by build_KNN.
fitted_features = np.vstack([posterior_probabilities, prior_probabilities]).T
preds = knn_classifier.predict(fitted_features)

In [None]:
# Score the fitted classifier's own predictions (`preds`) rather than
# `knn_preds`, which at this point holds leave-one-out factuality predictions.
# The targets use the flipped hallucination encoding (0 = hallucinated,
# 1 = non-hallucinated), so the class names are given in that order.
print(classification_report(hallucination_label_reverse, preds, target_names=['Hallucinated', 'Non-hallucinated'], digits=4))

In [None]:
# source, destination
# pickle.dump(knn_classifier, open('classifiers/knn_mlm_cmlm_hallucination.pkl', 'wb'))

#### AUC

In [None]:
from draw import draw_auc

In [None]:
# For every entity whose label is set and is not 3, collect the score stored
# under each of the seven model keys; p1..p7 follow the key order below.
model_keys = (
    'bart.large',
    'xsum_cmlm_bos',
    'xsum_cmlm_scratch_cedar_warmup_10000',
    'cnndm_cmlm_cedar',
    'cnndm_cmlm_scratch_cedar_warmup_10000',
    'bart.large.xsum',
    'bart.large.cnn',
)
scored_entities = [
    entity for entity in prior_posterior
    if entity['label'] is not None and entity['label'] != 3
]
p1, p2, p3, p4, p5, p6, p7 = (
    [entity[key] for entity in scored_entities] for key in model_keys
)

# The score lists must line up one-to-one with the factuality labels.
assert len(p1) == len(factual_label)

In [None]:
# Compare all seven score lists against factual_label via draw.draw_auc.
# The three list arguments are parallel: scores, display names, colours.
draw_auc(factual_label,
         [p1, p2, p3, p4, p5, p6, p7],
         ['MLM', 'CMLM on XSum', 'CMLM on XSum scratch', 'CMLM on CNN/DM', 'CMLM on CNN/DM scratch', 'CLM on XSum', 'CLM on CNN/DM'],
         ['darkorange', 'green', 'red', 'blue', 'pink', 'aqua', 'tab:purple'])

In [None]:
# Same comparison restricted to four of the score lists (p2, p6, p4, p7),
# i.e. the 'xsum_cmlm_bos', 'bart.large.xsum', 'cnndm_cmlm_cedar' and
# 'bart.large.cnn' scores.
draw_auc(factual_label,
         [p2, p6, p4, p7],
         ['CMLM on XSum', 'CLM on XSum', 'CMLM on CNN/DM', 'CLM on CNN/DM'],
         ['darkorange', 'green', 'red', 'blue'])

In [None]:
# {
#     'start': 61,
#     'end': 74,
#     'label': 2,
#     'type': 'CARDINAL',
#     'ent': 'more than 100',
#     'bart.large': 0.024139404296875,
#     'xsum_cmlm_bos': 0.0843505859375,
#     'cnndm_cmlm_cedar': 0.01030731201171875,
#     'bart.large.xsum': 0.05517578125,
#     'cnndm_cmlm_scratch_cedar_warmup_20000': 1.6808509826660156e-05,
#     'xsum_cmlm_scratch_cedar_warmup_10000': 0.00960540771484375,
#     'cnndm_cmlm_scratch_cedar_warmup_10000': 4.7087669372558594e-05,
#     'xsum_cmlm_scratch_cedar_warmup_20000': 0.003948211669921875
# }