In [1]:
import os
import random
from collections import defaultdict
from tqdm import tqdm
from transformers import pipeline
from rapidfuzz import fuzz

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Setup paths
data_root = '/Users/mamtaudai/Downloads/cadec'
text_dir = os.path.join(data_root, 'text')
original_dir = os.path.join(data_root, 'original')
meddra_dir = os.path.join(data_root, 'meddra')
sct_dir = os.path.join(data_root, 'sct')

In [3]:
all_files = [f for f in os.listdir(text_dir) if f.endswith('.txt')]


In [4]:
# Single demo file
demo_file = 'LIPITOR.86.txt'

In [5]:
# Define label mapping for d4data/biomedical-ner-all
label_map = {
    'SIGN_SYMPTOM': 'ADR',
    'BIOLOGICAL_STRUCTURE': 'ADR',
    'SEVERITY': 'ADR',
    'DISEASE': 'Disease',
    'CHEMICAL': 'Drug',
    'ADR': 'ADR'
}

In [28]:
#  Load NER pipeline 

ner_pipeline = pipeline("ner", model="d4data/biomedical-ner-all", aggregation_strategy="simple")

Device set to use cpu


#  Helper functions 


In [7]:
def extract_entities(filepath):
    entities = set()
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue
            _, label_and_ranges, text = parts
            label = label_and_ranges.split()[0]
            entities.add((label, text.lower()))
    return entities

In [8]:
def extract_adr(filepath):
    adrs = set()
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue
            _, _, text = parts
            adrs.add(('ADR', text.lower()))
    return adrs

In [9]:
def evaluate_file(fname, text_dir, original_dir, ner_pipeline, label_map):
    text_path = os.path.join(text_dir, fname)
    with open(text_path, 'r', encoding='utf-8', errors='replace') as f:
        post_text = f.read()

    ner_results = ner_pipeline(post_text)
    
    # Map predictions
    pred_entities = set()
    for entity in ner_results:
        text = entity['word'].strip().lower()
        model_label = entity['entity_group'].upper().replace("-", "_")
        if model_label in label_map:
            cadec_label = label_map[model_label]
            pred_entities.add((cadec_label, text))
            
    # Load ground truth
    ann_fname = fname.replace('.txt', '.ann')
    gt_entities = extract_entities(os.path.join(original_dir, ann_fname))

    # Fuzzy match predictions
    tp = 0
    for pred_label, pred_text in pred_entities:
        for gt_label, gt_text in gt_entities:
            if pred_label == gt_label and fuzz.partial_ratio(pred_text, gt_text) >= 80:
                tp += 1
                break

    fp = len(pred_entities) - tp
    fn = len(gt_entities) - tp

    p = tp / (tp+fp) if tp+fp > 0 else 0
    r = tp / (tp+fn) if tp+fn > 0 else 0
    f1 = 2*p*r/(p+r) if p+r > 0 else 0

# We choose Precision, Recall, and F1 as our metrics because they are standard for NER evaluation.
# Precision measures how many predicted entities are correct.
# Recall measures how many ground truth entities were found.
# F1 balances both, which is especially important for imbalanced data like ADR mentions.
    
    return p, r, f1

# --- Task 1: Enumerate distinct entities of each type ---


In [10]:
global_label_entities = defaultdict(set)
for fname in tqdm(all_files, desc="Enumerating unique entities"):
    ann_fname = fname.replace('.txt', '.ann')
    orig_path = os.path.join(original_dir, ann_fname)
    with open(orig_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            line = line.strip()
            if line.startswith('#') or not line:
                continue
            parts = line.split('\t')
            if len(parts) != 3:
                continue
            _, label_and_ranges, entity_text = parts
            label = label_and_ranges.split()[0]
            global_label_entities[label].add(entity_text.lower())

Enumerating unique entities: 100%|████████| 1250/1250 [00:00<00:00, 1917.01it/s]


In [11]:
for label, entities in global_label_entities.items():
    print(f"{label}: {len(entities)} distinct entities")

ADR: 3400 distinct entities
Drug: 323 distinct entities
Disease: 164 distinct entities
Finding: 298 distinct entities
Symptom: 148 distinct entities


# --- Tasks 2–4: Evaluate on single file ---


In [12]:
p, r, f1 = evaluate_file(demo_file, text_dir, original_dir, ner_pipeline, label_map)
print(f"\nSingle file: {demo_file}")
print(f"Precision: {p:.2f}, Recall: {r:.2f}, F1: {f1:.2f}")


Single file: LIPITOR.86.txt
Precision: 0.88, Recall: 1.00, F1: 0.93


In [16]:
# Proper ADR evaluation on single file

text_path = os.path.join(text_dir, demo_file)
with open(text_path, 'r', encoding='utf-8', errors='replace') as f:
    post_text = f.read()

In [17]:
ner_results = ner_pipeline(post_text)


In [18]:
# Get predicted ADRs

pred_adr_entities = set()
for entity in ner_results:
    text = entity['word'].strip().lower()
    model_label = entity['entity_group'].upper().replace("-", "_")
    if model_label in label_map:
        cadec_label = label_map[model_label]
        if cadec_label == 'ADR':
            pred_adr_entities.add((cadec_label, text))

In [19]:
print("\nPredicted ADR entities:")
print(pred_adr_entities)


Predicted ADR entities:
{('ADR', 'side of face'), ('ADR', 'tingling'), ('ADR', 'bouts of anxiety'), ('ADR', 'depression'), ('ADR', 'headache'), ('ADR', 'severe'), ('ADR', 'pain'), ('ADR', 'loss of reason to live')}


In [20]:
meddra_demo = demo_file.replace('.txt', '.ann')
meddra_adr = extract_adr(os.path.join(meddra_dir, meddra_demo))

In [21]:
# Fuzzy match predicted ADRs

adr_tp = 0
for pred_label, pred_text in pred_adr_entities:
    for gt_label, gt_text in meddra_adr:
        if pred_label == gt_label and fuzz.partial_ratio(pred_text, gt_text) >= 80:
            adr_tp += 1
            break

In [22]:
adr_fp = len(pred_adr_entities) - adr_tp
adr_fn = len(meddra_adr) - adr_tp

In [23]:
adr_p = adr_tp / (adr_tp+adr_fp) if adr_tp+adr_fp > 0 else 0
adr_r = adr_tp / (adr_tp+adr_fn) if adr_tp+adr_fn > 0 else 0
adr_f1 = 2*adr_p*adr_r/(adr_p+adr_r) if adr_p+adr_r > 0 else 0

In [24]:
print(f"\nADR Precision: {adr_p:.2f}, Recall: {adr_r:.2f}, F1: {adr_f1:.2f}")



ADR Precision: 0.88, Recall: 1.00, F1: 0.93


# --- Task 5: Evaluate on 50 random files ---


In [25]:
random.seed(42)
sample_50 = random.sample(all_files, 50)

In [26]:
f1s = []
for fname in tqdm(sample_50, desc="Evaluating 50 random files"):
    _, _, f1 = evaluate_file(fname, text_dir, original_dir, ner_pipeline, label_map)
    f1s.append(f1)

Evaluating 50 random files: 100%|███████████████| 50/50 [00:07<00:00,  6.74it/s]


In [27]:
print(f"\nMean F1 over 50 random files: {sum(f1s)/len(f1s):.2f}")



Mean F1 over 50 random files: 0.51


# Task 6: SNOMED CT Mapping

In [29]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

In [30]:
# Load SentenceTransformer for embedding-based similarity
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

In [35]:
# --- Step 1: Combine `original` and `sct` for one file ---
def combine_original_and_sct(fname, original_dir, sct_dir):
    ann_path = os.path.join(original_dir, fname.replace('.txt', '.ann'))
    sct_path = os.path.join(sct_dir, fname.replace('.txt', '.ann'))

    # Parse original annotations
    id_to_label_text = {}
    with open(ann_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue
            id_, label_and_range, text = parts
            label = label_and_range.split()[0]
            id_to_label_text[id_] = (label, text)

    # Parse SCT annotations
    combined = []
    with open(sct_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue
            id_, code_and_term, _, text = parts[0], parts[1], parts[-2], parts[-1]
            id_no_T = id_[1:]  # remove extra T
            if id_no_T in id_to_label_text:
                label, gt_text = id_to_label_text[id_no_T]
                parts = code_and_term.split('|')
                if len(parts) >= 2:
                   code, term = parts[:2]
                else:
                   code = parts[0].strip()
                   term = 'N/A'

                combined.append({
                    'SNOMED_Code': code.strip(),
                    'SNOMED_Term': term.strip(),
                    'Label': label,
                    'Ground_Truth_Text': gt_text.strip()
                })

    df = pd.DataFrame(combined)
    return df

In [36]:
# --- Step 2: Apply to demo file ---
combined_df = combine_original_and_sct(demo_file, original_dir, sct_dir)
print("\nCombined Original + SCT:")
print(combined_df)


Combined Original + SCT:
           SNOMED_Code       SNOMED_Term Label         Ground_Truth_Text
0             25064002          Headache   ADR                 headaches
1            162397003    Pain in throat   ADR            pain in throat
2             62507009  Pins and needles   ADR  tingling in side of face
3             48694002           Anxiety   ADR                   anxiety
4             35489007        Depression   ADR                depression
5  CONCEPT_LESS 93 115               N/A   ADR    loss of reason to live
6            422587007            Nausea   ADR                    nausea


In [37]:
# --- Step 3: For predicted ADRs, find SNOMED code by two methods ---

# Get predicted ADRs
pred_adr_texts = [text for _, text in pred_adr_entities]

sct_terms = combined_df['SNOMED_Term'].tolist()
sct_codes = combined_df['SNOMED_Code'].tolist()
sct_texts = combined_df['Ground_Truth_Text'].tolist()
sct_embeddings = embed_model.encode(sct_terms, convert_to_tensor=True)

In [38]:
print("\nPredicted ADRs and matched SNOMED concepts:")



Predicted ADRs and matched SNOMED concepts:


In [39]:
for pred_text in pred_adr_texts:
    # (a) Approximate string match
    best_match_idx = -1
    best_score = -1
    for i, term in enumerate(sct_terms):
        score = fuzz.partial_ratio(pred_text, term.lower())
        if score > best_score:
            best_score = score
            best_match_idx = i
    approx_code = sct_codes[best_match_idx]
    approx_term = sct_terms[best_match_idx]

    # (b) Embedding similarity
    pred_emb = embed_model.encode(pred_text, convert_to_tensor=True)
    cos_sim = util.cos_sim(pred_emb, sct_embeddings)[0]
    best_idx_emb = cos_sim.argmax().item()
    emb_code = sct_codes[best_idx_emb]
    emb_term = sct_terms[best_idx_emb]

    print(f"\nPredicted ADR: {pred_text}")
    print(f"Approximate Match → SNOMED: {approx_term} ({approx_code}), Score: {best_score}")
    print(f"Embedding Match   → SNOMED: {emb_term} ({emb_code}), Cosine Sim: {cos_sim[best_idx_emb]:.2f}")


Predicted ADR: side of face
Approximate Match → SNOMED: Headache (25064002), Score: 54.54545454545454
Embedding Match   → SNOMED: Headache (25064002), Cosine Sim: 0.35

Predicted ADR: tingling
Approximate Match → SNOMED: Pain in throat (162397003), Score: 53.333333333333336
Embedding Match   → SNOMED: Nausea (422587007), Cosine Sim: 0.51

Predicted ADR: bouts of anxiety
Approximate Match → SNOMED: Anxiety (48694002), Score: 100.0
Embedding Match   → SNOMED: Anxiety (48694002), Cosine Sim: 0.81

Predicted ADR: depression
Approximate Match → SNOMED: Depression (35489007), Score: 100.0
Embedding Match   → SNOMED: Depression (35489007), Cosine Sim: 1.00

Predicted ADR: headache
Approximate Match → SNOMED: Headache (25064002), Score: 100.0
Embedding Match   → SNOMED: Headache (25064002), Cosine Sim: 1.00

Predicted ADR: severe
Approximate Match → SNOMED: Depression (35489007), Score: 54.54545454545454
Embedding Match   → SNOMED: Depression (35489007), Cosine Sim: 0.41

Predicted ADR: pain
