In [1]:
import json

from fairseq.models.bart import BARTModel
from utils import read_lines

In [2]:
# Load the path configuration; its entries are looked up by key in later cells
# (model directory, data directory, XSum directory).
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
with open('../path_config.json') as config_file:
    PATH = json.load(config_file)

In [3]:
# Restore the BART model from the directory stored under PATH['xsum_cmlm_bos'],
# using its 'checkpoint_best.pt' checkpoint and the configured data directory.
bart = BARTModel.from_pretrained(
    PATH['xsum_cmlm_bos'],
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path=PATH['data_name_or_path'],
)

#### Read XSum

In [4]:
# Build the paths of the XSum test split (source documents and target summaries).
document_path, target_path = (
    PATH['xsum_fariseq'] + file_name for file_name in ('/test.source', '/test.target')
)

# NOTE(review): read_lines comes from the project's utils module; it appears to
# return one entry per example -- confirm against its implementation.
xsum_source, xsum_target = (read_lines(path) for path in (document_path, target_path))

print(len(xsum_source))
# Sources and targets are paired by index below, so both files must be equally long.
assert len(xsum_source) == len(xsum_target)

11301


#### Named Entity Recognition (NER)

In [5]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline; it is used below to extract
# named-entity spans from the target summary.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Use the first source/target pair of the test split as the worked example.
test_source, test_target = xsum_source[0], xsum_target[0]

In [7]:
# Run the spaCy pipeline over the target summary, then keep only the entity
# annotations from its JSON export: a list of dicts with 'start', 'end'
# (character offsets into test_target) and 'label'.
target_doc = nlp(test_target)
extracted_ents = target_doc.to_json()['ents']

In [8]:
# Display the entity spans found in the target summary.
extracted_ents

[{'start': 79, 'end': 91, 'label': 'LOC'},
 {'start': 96, 'end': 106, 'label': 'PRODUCT'}]

#### Calculate Probability for Each Extracted Entity

In [9]:
from model import ConditionalSequenceGenerator
from utils import prepare_cmlm_inputs, prepare_clm_inputs, get_cmlm_probability

In [10]:
# Wrap the loaded BART model in the project's ConditionalSequenceGenerator;
# this wrapper is what get_cmlm_probability receives below.
model = ConditionalSequenceGenerator(bart)

In [11]:
# Build the model inputs for the example pair and its entity spans.
# The result is indexed as inputs[0]..inputs[3] in later cells.
# NOTE(review): the exact meaning of each element is defined by
# utils.prepare_cmlm_inputs -- confirm there before relying on it.
inputs = prepare_cmlm_inputs(test_source, test_target, extracted_ents)

In [12]:
# Inspect the prepared inputs (the output contains the full source text, so it is long).
inputs

(['<s> New sand dunes may be created to reduce the risk of flooding on a beach on the ### and Flintshire border. <\\s> Severe storms in December 2013 caused damage to an existing dune system to the east of Barkby Beach in Prestatyn. A report, to be considered by Denbighshire councillors, says there is evidence they have begun to re-establish themselves naturally. But the council is considering creating a secondary dune system to strengthen the defences. The flood risk management report says: "The  narrowness of the dunes at Barkby Beach is a cause for serious concern. "Discussions have taken place with Natural Resources Wales regarding possible options to reduce the risk of a breach at this location. "This could be such as creating a secondary dune system on the landward side of the existing dunes." About 400 people were forced to leave their homes after high tides and gale force winds battered north Wales causing flooding last December. In Rhyl, Denbighshire - one of the worst-hit are

#### Calculate Probability

In [13]:
# Score every extracted entity with the model. The first four elements of
# `inputs` are forwarded positionally, in order.
entity_probs = get_cmlm_probability(model, *inputs[:4])

In [14]:
# Display the probabilities, one value per extracted entity.
entity_probs

[0.434326171875, 0.1956787109375]

#### Masking Entity

In [15]:
import random

In [16]:
# Randomly replace entities in the target string with a placeholder token.
# Each entity is replaced with probability equal to its score in entity_probs.
mask_token = '<UNC>'
masked_target = inputs[1][0]

# `cal` is the cumulative length change caused by the replacements made so far.
# Spans in inputs[2] refer to the unmodified string, so every later span has to
# be shifted by the *sum* of all earlier changes. It must therefore be
# accumulated (+=), not overwritten, or a third masked entity would be cut at
# the wrong position.
# NOTE(review): assumes the spans in inputs[2] index into inputs[1][0], are
# sorted by start offset and do not overlap -- confirm in prepare_cmlm_inputs.
cal = 0
for prob, span in zip(entity_probs, inputs[2]):
    start, end = span[0], span[1]
    if random.random() < prob:
        masked_target = masked_target[:start + cal] + mask_token + masked_target[end + cal:]
        cal += len(mask_token) - (end - start)

In [17]:
# Display the target after masking; the result varies between runs because
# the masking above is random and unseeded.
masked_target

'<s> New sand dunes may be created to reduce the risk of flooding on a beach on the Denbighshire and Flintshire border.'