# Test Notebook 01: BRAT Loader

**Purpose**: Validate BRAT parsing on sample files

**Tests**:
1. Load sample `.txt`/`.ann` files from the `train` split: one MIMIC note, then the first 5 notes from each source (`mimic`, `uw`)
2. Display raw BRAT annotations
3. Show parsed output (id, text, trigger, label)
4. Verify label distribution
5. Check for parsing errors


In [None]:
import sys
sys.path.append('..')

from pathlib import Path
from src.utils.brat_loader import BRATLoader, load_shac_data
import yaml
from collections import Counter
import pandas as pd


## 1. Load Configuration


In [None]:
# Load config.
# The encoding is passed explicitly so the YAML file is decoded the same way
# on every platform instead of depending on the locale's default encoding.
with open('../configs/data.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo the configured values so a wrong path or label set is visible
# before any data file is touched.
print("Configuration:")
print(f"  Data root: {config['raw_root']}")
print(f"  Sources: {config['sources']}")
print(f"  Splits: {config['splits']}")
print(f"  Target event: {config['target_event']}")
print(f"  Status labels: {config['status_labels']}")


## 2. Test on Sample File

Load one sample note (`train/mimic/0101`) to inspect its raw text, its Drug-related raw annotation lines, and the parsed output.


In [None]:
# Test single file parsing on one fixed note from the train/mimic directory.
sample_txt = Path(config['raw_root']) / 'train' / 'mimic' / '0101.txt'
sample_ann = sample_txt.with_suffix('.ann')  # same path, .ann extension

print(f"Reading: {sample_txt}\n")

# Display raw text.
# Decode explicitly as UTF-8 rather than with the platform default, so the
# characters (and therefore character offsets) are the same on every machine.
# `text` stays in the notebook namespace; the extraction cell reads it.
text = sample_txt.read_text(encoding='utf-8')
print("=" * 80)
print("RAW TEXT:")
print("=" * 80)
print(text)
print("=" * 80)

# Display raw annotations, restricted to the lines relevant to Drug events:
#   E lines mentioning Drug, A lines mentioning StatusTimeVal, and
#   T lines mentioning Drug or StatusTime.
print("\nRAW ANNOTATIONS:")
print("=" * 80)
with open(sample_ann, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        is_drug_event = line.startswith('E') and 'Drug' in line
        is_status_attribute = line.startswith('A') and 'StatusTimeVal' in line
        is_relevant_span = line.startswith('T') and (
            'Drug' in line or 'StatusTime' in line
        )
        if is_drug_event or is_status_attribute or is_relevant_span:
            print(line)
print("=" * 80)


In [None]:
# Run the loader on the sample annotation file and pull out its Drug events.
loader = BRATLoader(target_event="Drug")
ann_data = loader.parse_ann_file(sample_ann)

# Metadata describing where the sample note came from; passed through to
# the extractor as keyword arguments.
sample_meta = {
    'note_id': '0101',
    'source': 'mimic',
    'split': 'train',
}
events = loader.extract_drug_events(ann_data=ann_data, text=text, **sample_meta)

banner = "=" * 80
print("\nEXTRACTED DRUG EVENTS:")
print(banner)
for ev in events:
    # One indented summary per event, followed by a blank separator line.
    print(f"ID: {ev['id']}")
    print(f"  Trigger: '{ev['trigger_text']}'")
    print(f"  Status Label: {ev['status_label']}")
    print(f"  Source: {ev['source']}")
    print(f"  Split: {ev['split']}")
    print(f"  Text length: {len(ev['text'])} chars")
    print()


## 3. Test on Multiple Files

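Load the first 5 notes from each source (`mimic`, `uw`) in the `train` split, then check the per-file event counts and the label and source distributions.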


In [None]:
# Load the first few files from each source in the train split.
N_FILES_PER_SOURCE = 5
sample_events = []

for source in ['mimic', 'uw']:
    dir_path = Path(config['raw_root']) / 'train' / source
    # Sort before slicing so the same files are selected on every run;
    # glob() itself does not guarantee an order.
    txt_files = sorted(dir_path.glob('*.txt'))[:N_FILES_PER_SOURCE]

    print(f"\nProcessing {source}: {len(txt_files)} files")

    for txt_file in txt_files:
        ann_file = txt_file.with_suffix('.ann')

        # Decode explicitly as UTF-8 instead of the platform default encoding.
        text = txt_file.read_text(encoding='utf-8')

        ann_data = loader.parse_ann_file(ann_file)
        events = loader.extract_drug_events(
            ann_data=ann_data,
            text=text,
            note_id=txt_file.stem,
            source=source,
            split='train',
        )
        sample_events.extend(events)
        print(f"  {txt_file.stem}: {len(events)} Drug events")

print(f"\n✅ Total Drug events extracted: {len(sample_events)}")

# Convert to DataFrame
df = pd.DataFrame(sample_events)

if df.empty:
    # A frame built from an empty list has no columns, so the column lookups
    # below would raise KeyError. Report the problem instead.
    print("\n⚠️ No Drug events extracted - check the data root and source directories")
    preview = df
else:
    total = len(df)

    # Show label distribution
    print("\nLabel Distribution:")
    for label, count in df['status_label'].value_counts().items():
        print(f"  {label}: {count} ({100*count/total:.1f}%)")

    # Show source distribution
    print("\nSource Distribution:")
    for source, count in df['source'].value_counts().items():
        print(f"  {source}: {count} ({100*count/total:.1f}%)")

    # Show sample
    print("\nSample events:")
    preview = df[['id', 'source', 'trigger_text', 'status_label']].head(10)

# Keep this as the cell's final expression so the notebook renders the table.
preview


## ✅ Validation Checklist

**Check before proceeding to Phase 2:**

- BRAT loader successfully parses sample files
- All Drug events extracted with correct StatusTime labels
- Trigger text extracted correctly
- Source (mimic/uw) tracked correctly
- No missing IDs, triggers, or labels
- Label values match expected: none, current, past, Not Applicable
- Both MIMIC and UW samples processed

If all checks pass, proceed to implementing preprocessing!
