In [None]:
!pip install bottle lxml NLTK

In [1]:
import csv
import os
from collections import defaultdict
import pandas as pd

from oger.doc.document import Collection
from oger.ctrl.router import Router, PipelineServer

In [2]:
import sys

# Make the local OGER checkout importable. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
# NOTE(review): the `oger` imports live in an earlier cell; on a fresh kernel this
# path setup has to run before them unless OGER is installed as a package — confirm.
OGER_ROOT = "D:/OGER/OGER"
if OGER_ROOT not in sys.path:
    sys.path.append(OGER_ROOT)

In [50]:
# Peek at the first line of the CTD term list to check its delimiter and columns.
with open("D:/OGER/CTD/CTD.csv", encoding='utf-8') as ctd_file:
    sample = ctd_file.readline()
print(repr(sample))

'cui\tresource\toriginal_id\tterm\tpreferred_term\tentity_type\n'


In [None]:
def load_ncbi_corpus(txt_path):
    """Parse an NCBI disease corpus file into texts and gold annotations.

    The file is expected to contain three kinds of lines:

    * ``<pmid>|t|<title>`` and ``<pmid>|a|<abstract>`` text lines,
    * tab-separated annotation lines with exactly six fields
      ``pmid, start, end, mention, entity_type, concept_id``,
    * blank separator lines, which are ignored.

    Args:
        txt_path: Path to the corpus file (read as UTF-8).

    Returns:
        A ``(combined_texts, annotations)`` tuple where ``combined_texts`` maps
        each pmid to ``title + ' ' + abstract`` and ``annotations`` is a
        ``defaultdict(list)`` mapping each pmid to a list of
        ``(start, end, mention, entity_type, concept_id)`` tuples with integer
        offsets.

    Raises:
        ValueError: If an annotation line does not have exactly six fields or
            its offsets are not integers.
    """
    texts = defaultdict(dict)
    annotations = defaultdict(list)

    with open(txt_path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            # Remove only the line terminator. A full strip() would also drop
            # trailing whitespace of the title/abstract (shifting every later
            # character offset) and a trailing empty concept-id field.
            # NOTE(review): offsets are presumably relative to title + one
            # separator character + abstract — confirm against the corpus docs.
            line = raw_line.rstrip('\r\n')
            if not line:
                continue

            head = line.split('|', maxsplit=2)
            if len(head) == 3 and head[1] in ('t', 'a'):
                # Text line: the marker must be the second pipe-delimited field,
                # so a stray "|t|" inside a mention cannot be mistaken for one.
                pmid, part, content = head
                texts[pmid.strip()][part] = content
            elif '\t' in line:
                fields = line.split('\t')
                if len(fields) != 6:
                    raise ValueError(
                        f"{txt_path}:{line_no}: expected 6 tab-separated fields, "
                        f"got {len(fields)}"
                    )
                pmid, start, end, mention, ent_type, concept_id = fields
                annotations[pmid.strip()].append(
                    (int(start), int(end), mention, ent_type, concept_id.strip())
                )

    # Join title and abstract with a single character between them.
    combined_texts = {
        pmid: parts.get('t', '') + ' ' + parts.get('a', '')
        for pmid, parts in texts.items()
    }

    return combined_texts, annotations




In [10]:
# Parse the NCBI test-set corpus into per-document texts and gold annotations.
ncbi_corpus_path = "D:/scispacy/NCBItestset_corpus/NCBItestset_corpus.txt"
texts, gold_annotations = load_ncbi_corpus(ncbi_corpus_path)

In [118]:
# Write each document's text to its own file: data/ncbi_texts/<pmid>.txt
texts_dir = "data/ncbi_texts"
os.makedirs(texts_dir, exist_ok=True)
for doc_id, doc_text in texts.items():
    with open(f"{texts_dir}/{doc_id}.txt", "w", encoding="utf-8") as txt_file:
        txt_file.write(doc_text)

# Write all gold annotations to a single tab-separated file with a header row.
annotations_dir = "data/ncbi_annotations"
os.makedirs(annotations_dir, exist_ok=True)
with open(f"{annotations_dir}/gold_annotations.tsv", "w", newline='', encoding="utf-8") as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    tsv_writer.writerow(['pmid', 'start', 'end', 'mention', 'entity_type', 'concept_id'])
    tsv_writer.writerows(
        [doc_id, *annotation]
        for doc_id, doc_annotations in gold_annotations.items()
        for annotation in doc_annotations
    )

print("Data and annotations saved successfully!")

Data and annotations saved successfully!


Applying OGER using the Python library:

In [None]:
# Configure OGER with the CTD term list and build the pipeline server from it.
ctd_termlist_path = 'D:/OGER/CTD/CTD.tsv'
conf = Router(termlist_path=ctd_termlist_path)
pl = PipelineServer(conf)

In [6]:
text_directory = "D:/OGER/OGER/data/ncbi_texts"

# Load every .txt file as an OGER document whose id is the file stem (the pmid).
# The listing is sorted so the document order is reproducible across runs, and
# splitext() removes only the final extension, whereas str.replace('.txt', '')
# would remove the substring anywhere in the file name.
documents = []
for file_name in sorted(os.listdir(text_directory)):
    pmid, extension = os.path.splitext(file_name)
    if extension != '.txt':
        continue
    file_path = os.path.join(text_directory, file_name)
    doc = pl.load_one(file_path, 'txt')
    doc.id = pmid
    documents.append(doc)

doc_collection = Collection.from_iterable(documents, id_="ncbi_collection")

# Run the configured pipeline over the whole collection.
pl.process(doc_collection)

# Collect the recognised entities per document. The surface string is stored
# under the key "text".
all_predictions = {
    doc.id: [
        {
            "text": entity.text,
            "start": entity.start,
            "end": entity.end,
            "concept_id": entity.cid,
        }
        for entity in doc.iter_entities()
    ]
    for doc in doc_collection
}


Recognizers configuration: (<oger.ctrl.parameters.ERParams object at 0x000002711F201BA0>,)
Initializing EntityRecognizer with config: <oger.ctrl.parameters.ERParams object at 0x000002711F201BA0>


In [7]:
# Preview a single document's predictions as a sanity check instead of dumping
# the whole dictionary, which floods the notebook output.
dict(list(all_predictions.items())[:1])

{'9288106': [{'text': 'ataxia',
   'start': 40,
   'end': 46,
   'concept_id': 'D001259'},
  {'text': 'ataxia-telangiectasia',
   'start': 40,
   'end': 61,
   'concept_id': 'D001260'},
  {'text': 'telangiectasia', 'start': 47, 'end': 61, 'concept_id': 'D013684'},
  {'text': 'Ataxia', 'start': 99, 'end': 105, 'concept_id': 'D001259'},
  {'text': 'Ataxia-telangiectasia',
   'start': 99,
   'end': 120,
   'concept_id': 'D001260'},
  {'text': 'telangiectasia',
   'start': 106,
   'end': 120,
   'concept_id': 'D013684'},
  {'text': 'cancer', 'start': 235, 'end': 241, 'concept_id': 'D009369'},
  {'text': 'neoplasias', 'start': 263, 'end': 273, 'concept_id': 'D009369'},
  {'text': 'chromosomal instability',
   'start': 351,
   'end': 374,
   'concept_id': 'D043171'},
  {'text': 'malignancy', 'start': 483, 'end': 493, 'concept_id': 'D009369'},
  {'text': 'lymphomas', 'start': 1329, 'end': 1338, 'concept_id': 'D008223'},
  {'text': 'NHL', 'start': 1342, 'end': 1345, 'concept_id': 'D008228'},
 

In [None]:
output_file = "predictions.tsv"

# Write all predictions to a tab-separated file, one row per recognised entity.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter='\t')

    writer.writerow(["doc_id", "mention", "start", "end", "concept_id"])

    for doc_id, annotations in all_predictions.items():
        for annotation in annotations:
            # The prediction dicts store the surface string under "text"; there
            # is no "mention" key, so reading it raised KeyError.
            writer.writerow([
                doc_id,
                annotation['text'],
                annotation['start'],
                annotation['end'],
                annotation['concept_id'],
            ])

print(f"Predictions have been saved to {output_file}")

Predictions have been saved to predictions.tsv


In [3]:
# Load predictions and gold annotations for the evaluation below.
pred_df = pd.read_csv("D:/OGER/OGER/predictions.tsv", sep="\t")

# The gold TSV is written with a 'pmid' header, while the match functions compare
# on 'doc_id'; align the column name here. rename() ignores a missing 'pmid'
# column, so a file that already uses 'doc_id' loads unchanged.
gold_df = (
    pd.read_csv("D:/OGER/OGER/data/ncbi_annotations/gold_annotations.tsv", sep="\t")
    .rename(columns={"pmid": "doc_id"})
)

In [4]:
def exact_match(pred, gold):
    """Return a truthy value when both rows have the same document and span.

    Both ``pred`` and ``gold`` must support ``row['doc_id']``, ``row['start']``
    and ``row['end']``; the three fields are compared in that order.
    """
    compared_fields = ('doc_id', 'start', 'end')
    return all(pred[field] == gold[field] for field in compared_fields)

def partial_match(pred, gold):
    """Return a truthy value when both rows share a document and their spans overlap.

    Two spans that merely touch (one ends exactly where the other starts) are
    not considered overlapping.
    """
    if not (pred['doc_id'] == gold['doc_id']):
        return False
    # Prediction lies entirely before the gold span.
    if pred['end'] <= gold['start']:
        return False
    # Prediction lies entirely after the gold span.
    if pred['start'] >= gold['end']:
        return False
    return True


In [5]:
def evaluate(pred_df, gold_df, match_func):
    """Score predictions against gold annotations with greedy one-to-one matching.

    Each prediction row is compared, in order, with the gold rows that have not
    been matched yet; the first gold row accepted by ``match_func`` is consumed
    and counted as a true positive.

    Args:
        pred_df: DataFrame of predicted annotations.
        gold_df: DataFrame of gold annotations.
        match_func: Callable ``(pred_row, gold_row) -> truthy`` deciding whether
            a prediction matches a gold annotation.

    Returns:
        Tuple ``(precision, recall, f1, fp, fn, tp)``. The ratios are ``0.0``
        when their denominator is zero.
    """
    # Build the gold rows once: iterrows() creates a new Series for every row on
    # every call, so keeping it in the inner loop repeats that work per prediction.
    gold_rows = [gold_row for _, gold_row in gold_df.iterrows()]

    # Track matched gold rows by position rather than by index label, so that
    # duplicate labels in gold_df.index cannot mark two rows as one.
    matched_gold = set()
    tp = 0

    for _, pred_row in pred_df.iterrows():
        for position, gold_row in enumerate(gold_rows):
            if position in matched_gold:
                continue
            if match_func(pred_row, gold_row):
                tp += 1
                matched_gold.add(position)
                break

    fp = len(pred_df) - tp
    fn = len(gold_df) - tp

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return precision, recall, f1, fp, fn, tp


In [6]:
# Score the predictions twice: once requiring identical spans, once accepting any overlap.
exact_precision, exact_recall, exact_f1, exact_fp, exact_fn, exact_tp = evaluate(pred_df, gold_df, exact_match)
partial_precision, partial_recall, partial_f1, partial_fp, partial_fn, partial_tp = evaluate(pred_df, gold_df, partial_match)

# fp / fn / tp are integer counts, so print them as integers rather than with six decimals.
print(f"Exact Match: P={exact_precision:.6f}, R={exact_recall:.6f}, F1={exact_f1:.6f}, fp={exact_fp:d}, fn={exact_fn:d}, tp={exact_tp:d}")
print(f"Partial Match: P={partial_precision:.6f}, R={partial_recall:.6f}, F1={partial_f1:.6f}, fp={partial_fp:d}, fn={partial_fn:d}, tp={partial_tp:d}")

Exact Match: P=0.359582, R=0.537500, F1=0.430898, fp=919.000000, fn=444.000000, tp=516.000000
Partial Match: P=0.464111, R=0.693750, F1=0.556159, fp=769.000000, fn=294.000000, tp=666.000000


Applying OGER from the command line:

In [44]:
# Run OGER through its command-line entry point with the given settings file,
# passing a lowercase value for the termlist0-normalize option via -c.
# NOTE(review): input/output locations come from settings.ini, which is not shown
# here — confirm it produces the combined_output.tsv evaluated in the next cells.
!python -m oger run --settings D:/OGER/settings.ini -c termlist0-normalize "[\"lowercase\"]"


Recognizers configuration: (<oger.ctrl.parameters.ERParams object at 0x000002584FA0CA60>,)
Initializing EntityRecognizer with config: <oger.ctrl.parameters.ERParams object at 0x000002584FA0CA60>


In [45]:
# Lenient evaluation (--lenient) of the OGER CLI output (-a) against the gold annotations (-g).
# NOTE(review): -G / -A / -k / -o appear to select columns in the gold and answer
# TSVs — verify the indices against `python -m oger eval --help`.
!python -m oger eval -t tsv -g D:/OGER/OGER/data/ncbi_annotations/gold_annotations.tsv -a D:/OGER/OGER/output/combined_output.tsv -G 1,2,3 -A 1,3,4 -k 1 -o 3,4 --lenient -p -r -f -c 


Precision	0.5292517006802722
Recall	0.809573361082206
F1	0.6400658165364048
TP	622
FP	692
FN	183
PP	156


In [46]:
# Strict evaluation (--strict) of the OGER CLI output (-a) against the gold annotations (-g).
# NOTE(review): -G / -A / -k / -o appear to select columns in the gold and answer
# TSVs — verify the indices against `python -m oger eval --help`.
!python -m oger eval -t tsv -g D:/OGER/OGER/data/ncbi_annotations/gold_annotations.tsv -a D:/OGER/OGER/output/combined_output.tsv -G 1,2,3 -A 1,3,4 -k 1 -o 3,4 --strict -p -r -f -c


Precision	0.4231292517006803
Recall	0.6472424557752341
F1	0.5117235705470998
TP	622
FP	848
FN	339
