In [None]:
!pip install scispacy==0.2.5

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bc5cdr_md-0.2.5.tar.gz

In [1]:
import spacy
import scispacy

import csv
import pandas as pd

In [4]:
# Show the scispacy version that is actually imported, to compare with the pinned 0.2.5 install.
print(scispacy.__version__)

0.2.5


In [2]:
# Load the "en_ner_bc5cdr_md" spaCy pipeline once at module level; get_predictions
# calls this `nlp` object and keeps only the entities it labels DISEASE.
nlp = spacy.load("en_ner_bc5cdr_md") 

In [6]:
def load_ncbi_txt(file_path):
    """Parse a PubTator-style corpus file into per-document records.

    Three kinds of non-blank lines are recognised:

    * ``<pmid>|t|<title>``     -- starts (or resets) the record for ``pmid``
    * ``<pmid>|a|<abstract>``  -- sets the abstract, creating the record if needed
    * tab-separated ``pmid, start, end, mention, label[, ...]`` -- one annotation

    The record type is decided only by what follows the *first* ``|`` of a line
    whose leading field contains no tab, so ``|t|`` / ``|a|`` occurring inside a
    title, an abstract, or an annotation column never changes how the line is
    classified and never breaks the split.

    Args:
        file_path: Path of the UTF-8 encoded corpus file.

    Returns:
        dict mapping pmid (str) to a dict with keys ``"title"`` (str),
        ``"abstract"`` (str) and ``"annotations"`` (list of dicts with int
        ``"start"``/``"end"`` and str ``"mention"``/``"label"``). Annotation
        lines with fewer than five fields, or whose pmid has no record yet,
        are ignored.

    Raises:
        ValueError: if an annotation's start or end field is not an integer.
    """
    texts = {}
    with open(file_path, "r", encoding="utf8") as f:
        for raw_line in f:
            if not raw_line.strip():
                continue

            # Remove only the line terminator. Trailing spaces of a title are
            # kept because annotation offsets index into title + " " + abstract;
            # trimming the title would shift every abstract offset.
            line = raw_line.rstrip("\r\n")

            pmid, sep, rest = line.partition("|")
            marker = rest[:2]
            if sep and "\t" not in pmid and marker in ("t|", "a|"):
                pmid = pmid.strip()
                body = rest[2:]
                if marker == "t|":
                    texts[pmid] = {"title": body, "abstract": "", "annotations": []}
                elif pmid in texts:
                    texts[pmid]["abstract"] = body
                else:
                    # Abstract seen without a preceding title line.
                    texts[pmid] = {"title": "", "abstract": body, "annotations": []}
                continue

            parts = line.strip().split("\t")
            if len(parts) >= 5:
                pmid, start, end, mention, label = parts[:5]
                if pmid in texts:
                    texts[pmid]["annotations"].append({
                        "start": int(start),
                        "end": int(end),
                        "mention": mention,
                        "label": label
                    })
    return texts


In [7]:
def get_predictions(texts):
    """Run the module-level ``nlp`` pipeline on each document and keep DISEASE entities.

    Args:
        texts: dict mapping a document id to a dict with ``"title"`` and
            ``"abstract"`` strings.

    Returns:
        dict mapping each document id to a list of dicts with keys ``"start"``,
        ``"end"`` (character offsets into ``title + " " + abstract``),
        ``"mention"`` (entity text) and ``"label"`` (always ``"DISEASE"``).
    """
    predictions = {}
    for pmid in texts:
        record = texts[pmid]
        # Title and abstract are joined by a single space before tagging.
        full_text = " ".join((record["title"], record["abstract"]))
        disease_spans = []
        for ent in nlp(full_text).ents:
            if ent.label_ == "DISEASE":
                disease_spans.append({
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "mention": ent.text,
                    "label": ent.label_,
                })
        predictions[pmid] = disease_spans
    return predictions


In [8]:
# Parse the corpus file into per-document records, then tag every document with the NER model.
# NOTE(review): the corpus location is an absolute, machine-specific path; it must be
# adjusted before this cell can run on another machine.
texts = load_ncbi_txt("D:/scispacy/NCBItestset_corpus/NCBItestset_corpus.txt")
predictions = get_predictions(texts)

In [9]:
# Dump every predicted entity to a tab-separated file: a header row followed by
# one row per entity, in document order.
output_file = "predictions.tsv"

with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerow(["doc_id", "start", "end", "mention", "label"])
    writer.writerows(
        [doc_id, ann['start'], ann['end'], ann['mention'], ann['label']]
        for doc_id, annotations in predictions.items()
        for ann in annotations
    )

print(f"Predictions have been saved to {output_file}")

Predictions have been saved to predictions.tsv


In [10]:
# Read back the predictions file this notebook wrote (output_file) instead of a
# separate hard-coded absolute path, so the evaluation always scores the file
# produced by this run regardless of the working directory.
pred_df = pd.read_csv(output_file, sep="\t")
# NOTE(review): the gold annotations still come from an absolute, machine-specific path.
gold_df = pd.read_csv("D:/OGER/OGER/data/ncbi_annotations/gold_annotations.tsv", sep="\t")

In [11]:
def exact_match(pred, gold):
    """Tell whether a prediction and a gold annotation are the same span.

    Both arguments are mappings with ``'doc_id'``, ``'start'`` and ``'end'``.
    The result is truthy only when all three fields compare equal; fields are
    compared in that order and comparison stops at the first mismatch.
    """
    for key in ('doc_id', 'start', 'end'):
        same = pred[key] == gold[key]
        if not same:
            return same
    return same

def partial_match(pred, gold):
    """Tell whether a prediction overlaps a gold annotation in the same document.

    Both arguments are mappings with ``'doc_id'``, ``'start'`` and ``'end'``.
    Spans that merely touch (one ends exactly where the other starts) do not
    count as overlapping.
    """
    same_doc = pred['doc_id'] == gold['doc_id']
    if not same_doc:
        return same_doc
    # The spans are disjoint when one finishes before the other begins.
    disjoint = pred['end'] <= gold['start'] or pred['start'] >= gold['end']
    return not disjoint


In [14]:
def evaluate(pred_df, gold_df, match_func):
    """Score predictions against gold annotations with greedy one-to-one matching.

    Predictions are visited in row order; each is paired with the first gold
    row (in row order) that is still unmatched and for which ``match_func``
    returns a truthy value. Every gold row can be matched at most once.

    Args:
        pred_df: DataFrame of predicted annotations, one per row.
        gold_df: DataFrame of gold annotations, one per row.
        match_func: callable ``(pred_row, gold_row) -> truthy/falsy`` receiving
            the rows as yielded by ``DataFrame.iterrows``.

    Returns:
        Tuple ``(precision, recall, f1, fp, fn, tp)``. The three ratios are
        ``0.0`` when their denominator is zero.
    """
    # Build the gold rows once instead of re-running iterrows() for every
    # prediction, and track matches by row position rather than index label so
    # that duplicate index labels cannot block other, still unmatched rows.
    gold_rows = [row for _, row in gold_df.iterrows()]
    gold_taken = [False] * len(gold_rows)

    tp = 0
    for _, pred_row in pred_df.iterrows():
        for pos, gold_row in enumerate(gold_rows):
            if gold_taken[pos]:
                continue
            if match_func(pred_row, gold_row):
                gold_taken[pos] = True
                tp += 1
                break

    fp = len(pred_df) - tp
    fn = len(gold_df) - tp

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return precision, recall, f1, fp, fn, tp


In [15]:
# Score the same predictions twice: once with exact_match as the matching rule
# and once with partial_match. evaluate returns (precision, recall, f1, fp, fn, tp).
exact_precision, exact_recall, exact_f1, exact_fp, exact_fn, exact_tp = evaluate(pred_df, gold_df, exact_match)
partial_precision, partial_recall, partial_f1, partial_fp, partial_fn, partial_tp = evaluate(pred_df, gold_df, partial_match)

# fp/fn/tp are integer counts; the .6f format spec prints them with six decimal places.
print(f"Exact Match: P={exact_precision:.6f}, R={exact_recall:.6f}, F1={exact_f1:.6f}, fp={exact_fp:.6f}, fn={exact_fn:.6f}, tp={exact_tp:.6f}")
print(f"Partial Match: P={partial_precision:.6f}, R={partial_recall:.6f}, F1={partial_f1:.6f}, fp={partial_fp:.6f}, fn={partial_fn:.6f}, tp={partial_tp:.6f}")

Exact Match: P=0.651967, R=0.569792, F1=0.608116, fp=292.000000, fn=413.000000, tp=547.000000
Partial Match: P=0.818832, R=0.715625, F1=0.763758, fp=152.000000, fn=273.000000, tp=687.000000
