In [1]:
import argparse
import sys
sys.path.append("./BLINK")
import blink.candidate_ranking.utils as utils
from main_dense import load_models, run, _print_colorful_text, _print_colorful_prediction, _annotate, _get_test_samples, _process_biencoder_dataloader, _run_biencoder, _process_crossencoder_dataloader, _run_crossencoder

In [2]:
models_path = "BLINK/models/" # the path where you stored the BLINK models

config = {
    "test_entities": None,
    "test_mentions": None,
    "interactive": False,
    "top_k": 10,
    "biencoder_model": models_path+"biencoder_wiki_large.bin",
    "biencoder_config": models_path+"biencoder_wiki_large.json",
    "entity_catalogue": models_path+"entity.jsonl",
    "entity_encoding": models_path+"all_entities_large.t7",
    "crossencoder_model": models_path+"crossencoder_wiki_large.bin",
    "crossencoder_config": models_path+"crossencoder_wiki_large.json",
    "fast": False, # set this to be true if speed is a concern
    "output_path": "logs/" # logging directory
}

args = argparse.Namespace(**config)
logger = utils.get_logger(args.output_path)

In [3]:
models = load_models(args, logger)

12/29/2023 15:51:09 - INFO - Blink -   loading biencoder model
12/29/2023 15:51:10 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at /afs/crc.nd.edu/user/k/kmealey2/.cache/torch/pytorch_transformers/9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
12/29/2023 15:51:10 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at /afs/crc.nd.edu/user/k/kmealey2/.cache/torch/pytorch_transformers/6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8
12/29/2023 15:51:10 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,


In [5]:
biencoder = models[0]
biencoder_params = models[1]
crossencoder = models[2]
crossencoder_params = models[3]
candidate_encoding = models[4]
title2id = models[5]
id2title = models[6]
id2text = models[7]
wikipedia_id2local_id = models[8]
faiss_indexer = None
test_data = None

In [7]:
id2text[title2id['Conventional landing gear']]

' Conventional landing gear, or tailwheel-type landing gear, is an aircraft undercarriage consisting of two main wheels forward of the center of gravity and a small wheel or skid to support the tail. The term taildragger is also used, although some claim it should apply only to those aircraft with a tailskid rather than a wheel.  The term "conventional" persists for historical reasons, but all modern jet aircraft and most modern propeller aircraft use tricycle gear. '

In [11]:
id2url[title2id['Conventional landing gear']]

'https://en.wikipedia.org/wiki?curid=1049599'

In [5]:
import argparse
import json
import sys

from tqdm import tqdm
import logging
import torch
import numpy as np
from colorama import init
from termcolor import colored

import blink.ner as NER
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from blink.biencoder.biencoder import BiEncoderRanker, load_biencoder
from blink.crossencoder.crossencoder import CrossEncoderRanker, load_crossencoder
from blink.biencoder.data_process import (
    process_mention_data,
    get_candidate_representation,
)
import blink.candidate_ranking.utils as utils
from blink.crossencoder.train_cross import modify, evaluate
from blink.crossencoder.data_process import prepare_crossencoder_data
from blink.indexer.faiss_indexer import DenseFlatIndexer, DenseHNSWFlatIndexer

In [10]:
id2url = {
        v: "https://en.wikipedia.org/wiki?curid=%s" % k
        for k, v in wikipedia_id2local_id.items()
    }

## Interactive Mode

In [22]:
stopping_condition = False
args.fast = False

while not stopping_condition:

    samples = None
    
    logger.info("interactive mode")

    # biencoder_params["eval_batch_size"] = 1

    # Load NER model
    ner_model = NER.get_model()

    # Interactive
    text = input("insert text:")

    # Identify mentions
    samples = _annotate(ner_model, [text])

    _print_colorful_text(text, samples)
    
    # don't look at labels
    keep_all = True

    # prepare the data for biencoder
    if logger:
        logger.info("preparing data for biencoder")
    dataloader = _process_biencoder_dataloader(
        samples, biencoder.tokenizer, biencoder_params
    )

    # run biencoder
    if logger:
        logger.info("run biencoder")
    top_k = args.top_k
    labels, nns, scores = _run_biencoder(
        biencoder, dataloader, candidate_encoding, top_k, faiss_indexer
    )
    
    print("\nfast (biencoder) predictions:")

    _print_colorful_text(text, samples)

    # print biencoder prediction
    idx = 0
    for entity_list, sample in zip(nns, samples):
        e_id = entity_list[0]
        e_title = id2title[e_id]
        e_text = id2text[e_id]
        e_url = id2url[e_id]
        _print_colorful_prediction(
            idx, sample, e_id, e_title, e_text, e_url, args.show_url
        )
        idx += 1
    print()

    if args.fast:
        # use only biencoder
        continue
        
    # prepare crossencoder data
    context_input, candidate_input, label_input = prepare_crossencoder_data(
        crossencoder.tokenizer, samples, labels, nns, id2title, id2text, keep_all,
    )

    context_input = modify(
        context_input, candidate_input, crossencoder_params["max_seq_length"]
    )

    dataloader = _process_crossencoder_dataloader(
        context_input, label_input, crossencoder_params
    )

    # run crossencoder and get accuracy
    accuracy, index_array, unsorted_scores = _run_crossencoder(
        crossencoder,
        dataloader,
        logger,
        context_len=biencoder_params["max_context_length"],
    )

    print("\naccurate (crossencoder) predictions:")

    _print_colorful_text(text, samples)

    # print crossencoder prediction
    idx = 0
    for entity_list, index_list, sample in zip(nns, index_array, samples):
        e_id = entity_list[index_list[-1]]
        e_title = id2title[e_id]
        e_text = id2text[e_id]
        e_url = id2url[e_id]
        _print_colorful_prediction(
            idx, sample, e_id, e_title, e_text, e_url, args.show_url
        )
        idx += 1
    print()

12/28/2023 16:47:20 - INFO - Blink -   interactive mode
2023-12-28 16:47:20,347 loading file /afs/crc.nd.edu/user/k/kmealey2/.flair/models/en-ner-conll03-v0.4.pt


insert text: tailwheel cocked right prior to tkof.


Failed to identify entity from text:

tailwheel cocked right prior to tkof.

12/28/2023 16:47:34 - INFO - Blink -   preparing data for biencoder
12/28/2023 16:47:34 - INFO - Blink -   run biencoder


0it [00:00, ?it/s]



fast (biencoder) predictions:
Failed to identify entity from text:

tailwheel cocked right prior to tkof.




0it [00:00, ?it/s]
Evaluation: 0it [00:00, ?it/s]


12/28/2023 16:47:34 - INFO - Blink -   Eval accuracy: -1.00000

accurate (crossencoder) predictions:
Failed to identify entity from text:

tailwheel cocked right prior to tkof.


12/28/2023 16:47:34 - INFO - Blink -   interactive mode
2023-12-28 16:47:34,960 loading file /afs/crc.nd.edu/user/k/kmealey2/.flair/models/en-ner-conll03-v0.4.pt


insert text: tail wheel was cocked right prior to taking off.


Failed to identify entity from text:

tail wheel was cocked right prior to taking off.

12/28/2023 16:48:15 - INFO - Blink -   preparing data for biencoder
12/28/2023 16:48:15 - INFO - Blink -   run biencoder


0it [00:00, ?it/s]



fast (biencoder) predictions:
Failed to identify entity from text:

tail wheel was cocked right prior to taking off.




0it [00:00, ?it/s]
Evaluation: 0it [00:00, ?it/s]


12/28/2023 16:48:15 - INFO - Blink -   Eval accuracy: -1.00000

accurate (crossencoder) predictions:
Failed to identify entity from text:

tail wheel was cocked right prior to taking off.


12/28/2023 16:48:15 - INFO - Blink -   interactive mode
2023-12-28 16:48:15,901 loading file /afs/crc.nd.edu/user/k/kmealey2/.flair/models/en-ner-conll03-v0.4.pt


insert text: tow plane became airborne then settled. student thought tow in trouble and released. hit tree.


Failed to identify entity from text:

tow plane became airborne then settled. student thought tow in trouble and released. hit tree.

12/28/2023 16:48:59 - INFO - Blink -   preparing data for biencoder
12/28/2023 16:48:59 - INFO - Blink -   run biencoder


0it [00:00, ?it/s]



fast (biencoder) predictions:
Failed to identify entity from text:

tow plane became airborne then settled. student thought tow in trouble and released. hit tree.




0it [00:00, ?it/s]
Evaluation: 0it [00:00, ?it/s]


12/28/2023 16:48:59 - INFO - Blink -   Eval accuracy: -1.00000

accurate (crossencoder) predictions:
Failed to identify entity from text:

tow plane became airborne then settled. student thought tow in trouble and released. hit tree.


12/28/2023 16:48:59 - INFO - Blink -   interactive mode
2023-12-28 16:48:59,471 loading file /afs/crc.nd.edu/user/k/kmealey2/.flair/models/en-ner-conll03-v0.4.pt


insert text: tow plane became airborne and then it settled.


Failed to identify entity from text:

tow plane became airborne and then it settled.

12/28/2023 16:49:41 - INFO - Blink -   preparing data for biencoder
12/28/2023 16:49:41 - INFO - Blink -   run biencoder


0it [00:00, ?it/s]



fast (biencoder) predictions:
Failed to identify entity from text:

tow plane became airborne and then it settled.




0it [00:00, ?it/s]
Evaluation: 0it [00:00, ?it/s]


12/28/2023 16:49:41 - INFO - Blink -   Eval accuracy: -1.00000

accurate (crossencoder) predictions:
Failed to identify entity from text:

tow plane became airborne and then it settled.


12/28/2023 16:49:41 - INFO - Blink -   interactive mode
2023-12-28 16:49:41,609 loading file /afs/crc.nd.edu/user/k/kmealey2/.flair/models/en-ner-conll03-v0.4.pt


KeyboardInterrupt: Interrupted by user

## Non-interactive Mode

Blink uses Flair NER, and treats each sentence seperately.

In [9]:
import json

with open("./faa_samples.jsonl") as f:
    test_str = f.read()

test_data = []
for test_doc_str in test_str.split('\n'):
    try:
        test_data.append(json.loads(test_doc_str))
    except:
        print(f"could not parse: {test_doc_str}")

could not parse: 


In [11]:
len(test_data)

1164

In [None]:
samples = test_data

# don't look at labels
keep_all = (
    args.interactive
    or samples[0]["label"] == "unknown"
    or samples[0]["label_id"] < 0
)

# prepare the data for biencoder
if logger:
    logger.info("preparing data for biencoder")
dataloader = _process_biencoder_dataloader(
    samples, biencoder.tokenizer, biencoder_params
)

# run biencoder
if logger:
    logger.info("run biencoder")
top_k = args.top_k
labels, nns, scores = _run_biencoder(
    biencoder, dataloader, candidate_encoding, top_k, faiss_indexer
)

biencoder_accuracy = -1
recall_at = -1
if not keep_all:
    # get recall values
    top_k = args.top_k
    x = []
    y = []
    for i in range(1, top_k):
        temp_y = 0.0
        for label, top in zip(labels, nns):
            if label in top[:i]:
                temp_y += 1
        if len(labels) > 0:
            temp_y /= len(labels)
        x.append(i)
        y.append(temp_y)
    # plt.plot(x, y)
    biencoder_accuracy = y[0]
    recall_at = y[-1]
    print("biencoder accuracy: %.4f" % biencoder_accuracy)
    print("biencoder recall@%d: %.4f" % (top_k, y[-1]))

## Fast predictions
predictions = []
for entity_list in nns:
    sample_prediction = []
    for e_id in entity_list:
        e_title = id2title[e_id]
        sample_prediction.append(e_title)
    predictions.append(sample_prediction)

# use only biencoder
print (
    biencoder_accuracy,
    recall_at,
    -1,
    -1,
    len(samples),
    predictions,
    scores,
)

In [20]:
biencoder_predictions = predictions

In [21]:
# prepare crossencoder data
context_input, candidate_input, label_input = prepare_crossencoder_data(
    crossencoder.tokenizer, samples, labels, nns, id2title, id2text, keep_all,
)

context_input = modify(
    context_input, candidate_input, crossencoder_params["max_seq_length"]
)

dataloader = _process_crossencoder_dataloader(
    context_input, label_input, crossencoder_params
)

# run crossencoder and get accuracy
accuracy, index_array, unsorted_scores = _run_crossencoder(
    crossencoder,
    dataloader,
    logger,
    context_len=biencoder_params["max_context_length"],
)

scores = []
predictions = []
for entity_list, index_list, scores_list in zip(
    nns, index_array, unsorted_scores
):

    index_list = index_list.tolist()

    # descending order
    index_list.reverse()

    sample_prediction = []
    sample_scores = []
    for index in index_list:
        e_id = entity_list[index]
        e_title = id2title[e_id]
        sample_prediction.append(e_title)
        sample_scores.append(scores_list[index])
    predictions.append(sample_prediction)
    scores.append(sample_scores)

crossencoder_normalized_accuracy = -1
overall_unormalized_accuracy = -1
if not keep_all:
    crossencoder_normalized_accuracy = accuracy
    print(
        "crossencoder normalized accuracy: %.4f"
        % crossencoder_normalized_accuracy
    )

    if len(samples) > 0:
        overall_unormalized_accuracy = (
            crossencoder_normalized_accuracy * len(label_input) / len(samples)
        )
    print(
        "overall unnormalized accuracy: %.4f" % overall_unormalized_accuracy
    )

100%|██████████| 1164/1164 [00:00<00:00, 2762.73it/s]

4/1164 




1164/1164 

Evaluation: 100%|██████████| 1164/1164 [02:22<00:00,  7.91it/s]

12/28/2023 18:43:43 - INFO - Blink -   Eval accuracy: 0.59107





In [25]:
crossencoder_predictions = predictions

In [28]:
import pandas as pd
og_data = pd.read_csv("../../data/FAA_data/Maintenance_Text_data_nona.csv")

12/28/2023 18:46:24 - INFO - numexpr.utils -   Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
12/28/2023 18:46:24 - INFO - numexpr.utils -   NumExpr defaulting to 8 threads.


In [37]:
results_dict = {"doc_idx":[], "sent_idx":[], "original_sentence":[], "input":[], "mention":[], "biencoder_predicted_entity":[], "crossencoder_predicted_entity":[]}

for isample, sample in enumerate(samples):
    results_dict["doc_idx"].append(sample["doc_idx"])
    results_dict["sent_idx"].append(sample["sent_idx"])
    results_dict["original_sentence"].append(og_data["c119"].iat[sample["doc_idx"]])
    results_dict["input"].append(''.join([sample["context_left"], sample["mention"], sample["context_right"]]))
    results_dict["mention"].append(sample["mention"])
    results_dict["biencoder_predicted_entity"].append(biencoder_predictions[isample][0]) # just taking top one
    results_dict["crossencoder_predicted_entity"].append(crossencoder_predictions[isample][0])

In [39]:
pd.DataFrame(results_dict)

Unnamed: 0,doc_idx,sent_idx,original_sentence,input,mention,biencoder_predicted_entity,crossencoder_predicted_entity
0,0,0,TAILWHEEL COCKED RIGHT PRIOR TO TKOF. ...,tailwheel cocked right prior to tkof.,tailwheel,Flywheel,Conventional landing gear
1,2,1,"2ND ILS APCH,ACFT'S G/S INOP.LOM TUNED TO WRON...",lom tuned to wrong freq.,lom,"Lom, Norway",Herbert Lom
2,7,0,MTNS OBSCURED.FLT TO CK VOR REC REPTD INOP PRI...,mtns obscured.,mtns,KCNS,MTN (TV station)
3,10,1,LEFT ENG OIL SUPPLY EXHAUSTED.GEAR-UP LDG IN M...,gear-up ldg in mesquite brush.,mesquite,Mesquite,Mesquite
4,22,0,APRX 1/2 CUPFULL FLUID UNDER R BRAKE PRIOR TO ...,aprx 1/2 cupfull fluid under r brake prior to ...,aprx,Audi Performance and Racing,Elementis
...,...,...,...,...,...,...,...
1159,2742,0,"ON 7/22/08 AT 1249 MST, CESSNA T182T, N562GK, ...","on 7/22/08 at 1249 mst, cessna t182t, n562gk, ...",mst,South African Standard Time,Marine Science Technician
1160,2743,0,(-23) A/C RELOCATED TO NEW HANGAR TO CHECK SIZ...,(-23) a/c relocated to new hangar to check siz...,new hangar,American Airways Hangar and Administration Bui...,Goodyear Airdock
1161,2744,0,(-23) ON 2/23/08 @ APPROXIMATELY 2130 DURING T...,(-23) on 2/23/08 @ approximately 2130 during t...,airc,Air Force Reserve Command,Air traffic control
1162,2745,0,(-23) PILOT TOOK OFF FOR LEESBURG AIRPORT AND ...,(-23) pilot took off for leesburg airport and ...,leesburg,"Leesburg, Virginia","Leesburg, Florida"
