In [1]:
import sys

# Make the local Chisel checkout importable from this notebook.
# NOTE: this is an absolute, machine-specific location; adjust it if the
# repository is checked out somewhere else.
CHISEL_ROOT = "/workspaces/chisel/"
if CHISEL_ROOT not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(CHISEL_ROOT)

# 🧪 Example: Processing the NCBI Disease Dataset with Chisel
This example demonstrates how to preprocess the [NCBI Disease Corpus](https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/) using the Chisel library, transforming the data into a format suitable for transformer-based token classification models.

## 📥 Step 1: Download the NCBI Dataset

The NCBI corpus uses inline HTML-like tags to annotate disease mentions. Annotations look like:


```
<category="SpecificDisease">Cancer</category>
```

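These tags are not valid XML/HTML because the attribute has no name (`="SpecificDisease"` is attached directly to the tag name). To make them parseable, we rewrite each opening tag so that the label becomes the value of a named `category` attribute: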

```
<category category="SpecificDisease">Cancer</category>
```

## ✅ Download and extract the dataset

In [2]:
import io
import os
import zipfile

import requests

# Step 1: Provide the URL to the zip file
url = "https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBI_corpus.zip"

# Step 2: Set the extraction destination (relative to the current working directory)
extract_to = "./data"
os.makedirs(extract_to, exist_ok=True)

# Step 3: Download and extract
# requests has no default timeout; without one a stalled connection would
# block this cell forever.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    # The archive is held in memory and unpacked directly into `extract_to`.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted contents to: {extract_to}")
else:
    print(f"Failed to download file. Status code: {response.status_code}")

Extracted contents to: ./data


## 📄 Step 2: Load and Clean the Data

In [2]:
# Read the training split into a list of {"id", "text"} records.
# Each line is split on tabs: the first field is used as the document id and
# the remaining fields are joined with a space to form the annotated text.
annotations = []
# Relative path so it matches the "./data" directory the download cell extracts to.
with open("./data/NCBI_corpus_training.txt", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Skip blank lines; they would otherwise become records with an empty id.
            continue
        doc_id, *text_fields = line.split("\t")
        annotations.append({
            "id": doc_id.strip(),
            # Ensure correct XML format: give the bare `="..."` attribute a name.
            "text": " ".join(text_fields).strip().replace('<category="', '<category category="'),
        })

## 🧱 Step 3: Preprocess with Chisel
We define and connect the pipeline components from Chisel.

### 🔧 Setup

In [3]:
# 📦 Imports
from transformers import AutoTokenizer
from chisel.extraction.parsers.html_tag_parser import HTMLTagParser
from chisel.extraction.tokenizers.hf_tokenizer import HFTokenizer
from chisel.extraction.chunkers.fixed_length_chunker import FixedLengthTokenChunker
from chisel.extraction.span_aligners.token_span_aligner import TokenSpanAligner
from chisel.extraction.labelers.bio_labeler import BIOLabeler
from chisel.extraction.labelers.label_encoder import SimpleLabelEncoder
from chisel.extraction.validators.validators import DefaultParseValidator, HFTokenAlignmentValidator
from chisel.extraction.formatters.torch_formatter import TorchDatasetFormatter
from chisel.extraction.models.models import ChiselRecord
from chisel.extraction.models.models import EntitySpan
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


### 🔧 Initialize Chisel Components

In [8]:
# 🔍 Components

# Label vocabulary: "O" plus a B-/I- pair for each annotation category.
label_to_id = {
    "O": 0,
    "B-Modifier": 1,
    "I-Modifier": 2,
    "B-SpecificDisease": 3,
    "I-SpecificDisease": 4,
    "B-CompositeMention": 5,
    "I-CompositeMention": 6,
    "B-DiseaseClass": 7,
    "I-DiseaseClass": 8,
}

# Parsing: the entity label is read from the tag's `category` attribute.
parser = HTMLTagParser(
    label_strategy="attribute",
    attribute_name="category",
)

# Tokenization, span alignment and chunking.
tokenizer = HFTokenizer(model_name="bert-base-cased")
aligner = TokenSpanAligner()
chunker = FixedLengthTokenChunker(
    max_tokens=512,
    overlap=0,
)

# Labeling and label encoding.
labeler = BIOLabeler()
label_encoder = SimpleLabelEncoder(label_to_id=label_to_id)

# Validators raise on failure; the pipeline loop decides how to handle that.
parse_validators = [
    DefaultParseValidator(on_error="raise"),
]
label_validators = [
    # Receives the `.tokenizer` attribute of the HFTokenizer wrapper
    # (presumably the underlying Hugging Face tokenizer — TODO confirm).
    HFTokenAlignmentValidator(tokenizer=tokenizer.tokenizer, on_error="raise"),
]

# Output formatting.
formatter = TorchDatasetFormatter()

## 🔄 Step 4: Run the Pipeline

In [9]:
# Run every annotated document through the pipeline:
# parse -> validate spans -> tokenize -> align -> chunk -> label -> encode.
# One ChiselRecord is produced per chunk and collected in `processed_data`.
processed_data = []
dropped_parse_spans = 0  # spans rejected by the parse-level validators

# 🔁 Pipeline loop
for example in annotations:
    text, entities = parser.parse(example["text"])

    # 🧪 Per-span validation — skip bad spans
    valid_spans = []
    for span in entities:
        try:
            for validator in parse_validators:
                validator.validate(text, span)
        except ValueError:
            dropped_parse_spans += 1
            continue
        valid_spans.append(span)

    tokens = tokenizer.tokenize(text)
    # Align only the spans that passed parse validation (previously the
    # unfiltered `entities` were aligned and `valid_spans` was never used).
    token_entity_spans = aligner.align(valid_spans, tokens)

    # 🧪 Token-level validation — skip bad spans.
    # This depends only on the document's tokens and spans, so it runs once
    # per document instead of once per chunk (which repeated the same work
    # and printed the same errors for every chunk of a long document).
    valid_token_spans = []
    for span in token_entity_spans:
        try:
            for validator in label_validators:
                validator.validate(tokens, span)
        except ValueError as e:
            print(f"Validation error: {e}")  # Log the error for debugging
            continue
        valid_token_spans.append(span)
    document_entities = [tes.entity for tes in valid_token_spans]

    # NOTE: chunking and labeling still use every aligned span; token-level
    # validation only filters the `entities` stored on each record.
    token_chunks, entity_chunks = chunker.chunk(tokens, token_entity_spans)

    for chunk_id, (toks, ents) in enumerate(zip(token_chunks, entity_chunks)):
        labels = labeler.label(toks, ents)
        encoded_labels = label_encoder.encode(labels)
        input_ids = [token.id for token in toks]

        record = ChiselRecord(
            id=example["id"],
            chunk_id=chunk_id,
            text=tokenizer.tokenizer.decode(input_ids),
            tokens=toks,
            input_ids=input_ids,
            attention_mask=[1] * len(toks),
            # Document-level entities; copied so records do not share one list.
            entities=list(document_entities),
            bio_labels=labels,
            labels=encoded_labels,
        )
        processed_data.append(record)

if dropped_parse_spans:
    print(f"Dropped {dropped_parse_spans} span(s) that failed parse validation")

data = formatter.format(processed_data)

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


Validation error: Token span and entity span mismatch:
  Decoded actual: 'autosomal dominant'
  Decoded expected: 'autosomal dominant disorde'
Validation error: Token span and entity span mismatch:
  Decoded actual: 'absence of the seventh component of'
  Decoded expected: 'absence of the seventh component of complemen'


### ✅ Output
You now have a torch dataset ready for modelling!

In [8]:
# Show which fields the first formatted example contains.
data[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [14]:
# Preview the first 20 positions of the second formatted example:
# token id, attention-mask value and encoded label side by side.
sample = data[1]
preview_rows = zip(
    sample["input_ids"][:20],
    sample["attention_mask"][:20],
    sample["labels"][:20],
)
for token_id, mask_value, label_id in preview_rows:
    print(f"({token_id}): {mask_value}, {label_id}")

(138): 1, 0
(1887): 1, 0
(10978): 1, 0
(3048): 1, 0
(1477): 1, 0
(17895): 1, 0
(1107): 1, 0
(1483): 1, 0
(1105): 1, 0
(1456): 1, 0
(1237): 1, 0
(145): 1, 1
(14576): 1, 2
(12096): 1, 2
(2073): 1, 0
(131): 1, 0
(4247): 1, 0
(117): 1, 0
(185): 1, 0
(10436): 1, 0
