In [1]:
import os
import sys

# Make the local Chisel checkout importable. The location defaults to the
# original hard-coded path and can be overridden through the CHISEL_ROOT
# environment variable; the membership check keeps repeated runs of this cell
# from appending duplicate entries to sys.path.
CHISEL_ROOT = os.environ.get("CHISEL_ROOT", "/workspaces/chisel/")
if CHISEL_ROOT not in sys.path:
    sys.path.append(CHISEL_ROOT)

# 🧪 Example: Processing CoNLL NER Data with Chisel
This example demonstrates how to parse the CoNLL-2003 dataset into Chisel's internal ChiselRecord format, suitable for training transformer-based token classification models.

## 📥 Step 1: Download CoNLL Data
We use the version hosted by the [CrossWeigh](https://github.com/ZihanWangKi/CrossWeigh) repository.

In [2]:
import requests

url = "https://raw.githubusercontent.com/ZihanWangKi/CrossWeigh/refs/heads/master/data/conllpp_train.txt"

# Bound the wait on the network and surface HTTP errors immediately, so an
# error page is never split and parsed as if it were CoNLL data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Documents are delimited by the -DOCSTART- marker line; the split leaves empty
# chunks behind (e.g. before the first marker), which are dropped here.
docs = [doc for doc in response.text.split("-DOCSTART- -X- -X- O\n\n") if doc]

In [7]:
# Preview the first raw document; the slice yields nothing when `docs` is empty.
for first_doc in docs[:1]:
    print(first_doc)

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O

The DT B-NP O
European NNP I-NP B-ORG
Commission NNP I-NP I-ORG
said VBD B-VP O
on IN B-PP O
Thursday NNP B-NP O
it PRP B-NP O
disagreed VBD B-VP O
with IN B-PP O
German JJ B-NP B-MISC
advice NN I-NP O
to TO B-PP O
consumers NNS B-NP O
to TO B-VP O
shun VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
until IN B-SBAR O
scientists NNS B-NP O
determine VBP B-VP O
whether IN B-SBAR O
mad JJ B-NP O
cow NN I-NP O
disease NN I-NP O
can MD B-VP O
be VB I-VP O
transmitted VBN I-VP O
to TO B-PP O
sheep NN B-NP O
. . O O

Germany NNP B-NP B-LOC
's POS B-NP O
representative NN I-NP O
to TO B-PP O
the DT B-NP O
European NNP I-NP B-ORG
Union NNP I-NP I-ORG
's POS B-NP O
veterinary JJ I-NP O
committee NN I-NP O
Werner NNP I-NP B-PER
Zwingmann NNP I-NP I-PER
sa

## 🧩 Step 2: Define a Parser for CoNLL Format
Note: Whilst this may seem a bit complicated at first glance, with the help of generative AI and the validators Chisel provides, it should be fairly quick to write your own custom parsers.

In [8]:
from typing import Tuple, List
from chisel.extraction.base.protocols import Parser
from chisel.extraction.models.models import EntitySpan
import string


class ConllParser(Parser):
    """Parse one CoNLL-style document into plain text plus entity spans.

    Every non-blank line is split on single spaces; the first column is used
    as the token and the last column as its entity tag. Tags starting with
    ``B-`` open an entity, matching ``I-`` tags extend it, and every other tag
    is treated as a non-entity token.
    """

    def parse(self, doc: str) -> Tuple[str, List[EntitySpan]]:
        """Rebuild the document text and collect character-offset entity spans.

        Tokens are joined with a single space, except that a token consisting
        of exactly one punctuation character is attached directly to the text
        before it. Tokens inside an entity are always joined with a space.

        Args:
            doc: Raw document text, one token per line; blank lines are skipped.

        Returns:
            A tuple ``(text, spans)`` where ``text`` is the reconstructed
            document and ``spans`` holds one ``EntitySpan`` per entity, with
            ``start``/``end`` as character offsets into the text as built and
            ``label`` as the tag without its ``B-`` prefix.
        """
        tokens, labels = [], []
        for line in doc.strip().splitlines():
            if not line.strip():
                continue
            columns = line.strip().split(" ")
            tokens.append(columns[0])
            labels.append(columns[-1])

        text = ""
        spans = []
        i = 0

        while i < len(tokens):
            token = tokens[i]
            label = labels[i]

            # Exact single-character match: a plain `token in string.punctuation`
            # is a substring test and would also accept multi-character tokens
            # such as "()" that merely happen to be contiguous in that constant.
            is_punctuation = len(token) == 1 and token in string.punctuation
            if text and not is_punctuation:
                text += " "

            if label.startswith("B-"):
                ent_label = label[2:]
                ent_start = len(text)
                ent_tokens = [token]
                i += 1

                # Only continuation tags of the same entity type extend the span,
                # so a sequence like B-PER I-ORG is not merged into one PER entity.
                # A stray I- tag falls through to the non-entity branch below.
                continuation = "I-" + ent_label
                while i < len(tokens) and labels[i] == continuation:
                    ent_tokens.append(tokens[i])
                    i += 1

                ent_text = " ".join(ent_tokens)
                text += ent_text
                spans.append(
                    EntitySpan(
                        text=ent_text,
                        start=ent_start,
                        end=len(text),
                        label=ent_label,
                    )
                )
            else:
                text += token
                i += 1

        return text.strip(), spans

## 🔧 Step 3: Initialize Chisel Components

In [9]:
# 📦 Imports
from transformers import AutoTokenizer
from chisel.extraction.tokenizers.hf_tokenizer import HFTokenizer
from chisel.extraction.span_aligners.token_span_aligner import TokenSpanAligner
from chisel.extraction.labelers.bio_labeler import BIOLabeler
from chisel.extraction.labelers.label_encoder import SimpleLabelEncoder
from chisel.extraction.validators.validators import DefaultParseValidator, HFTokenAlignmentValidator
from chisel.extraction.formatters.hf_formatter import HFDatasetFormatter
from chisel.extraction.models.models import ChiselRecord
from chisel.extraction.models.models import EntitySpan
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


## 📦 Components

In [10]:
# Parsing and tokenization.
parser = ConllParser()
tokenizer = HFTokenizer(model_name="bert-base-cased")

# Span alignment and BIO tagging.
aligner = TokenSpanAligner()
labeler = BIOLabeler()

# Integer id for every BIO tag used by the label encoder.
label_encoder = SimpleLabelEncoder(
    label_to_id={
        "O": 0,
        "B-ORG": 1,
        "I-ORG": 2,
        "B-PER": 3,
        "I-PER": 4,
        "B-MISC": 5,
        "I-MISC": 6,
        "B-LOC": 7,
        "I-LOC": 8,
    }
)

# Validators live in lists so the pipeline can iterate over each group.
parse_validators = [DefaultParseValidator()]
label_validators = [
    HFTokenAlignmentValidator(tokenizer=tokenizer.tokenizer),
]

formatters = HFDatasetFormatter()

## 🔄 Step 4: Run the Pipeline

In [12]:
processed_data = []

# 🔁 Pipeline loop: parse → validate spans → tokenize → align → validate
# aligned spans → label → build record.
# NOTE: documents are processed whole (chunk_id is always 0); in the recorded
# run the tokenizer warned about a 550-token document against a 512 maximum.
for idx, example in enumerate(docs):
    text, entities = parser.parse(example)

    # 🧪 Per-span validation — keep only spans every parse validator accepts.
    valid_spans = []
    for span in entities:
        try:
            for validator in parse_validators:
                validator.validate(text, span)
        except ValueError:
            continue  # Optionally log or collect stats on dropped spans
        else:
            valid_spans.append(span)

    tokens = tokenizer.tokenize(text)
    # Align the validated spans only; passing the unfiltered `entities` here
    # would make the validation step above a no-op.
    token_entity_spans = aligner.align(valid_spans, tokens)

    # 🧪 Per-span validation — keep only aligned spans every label validator accepts.
    valid_token_spans = []
    for span in token_entity_spans:
        try:
            for validator in label_validators:
                validator.validate(tokens, span)
        except ValueError:
            continue  # Optionally log or collect stats on dropped spans
        else:
            valid_token_spans.append(span)

    # Label from the surviving spans so that bio_labels/labels agree with the
    # `entities` stored on the record.
    labels = labeler.label(tokens, valid_token_spans)
    encoded_labels = label_encoder.encode(labels)

    input_ids = [token.id for token in tokens]
    record = ChiselRecord(
        id=str(idx),
        chunk_id=0,
        text=tokenizer.tokenizer.decode(input_ids),
        tokens=tokens,
        input_ids=input_ids,
        attention_mask=[1] * len(tokens),
        entities=[tes.entity for tes in valid_token_spans],
        bio_labels=labels,
        labels=encoded_labels,
    )
    processed_data.append(record)

data = formatters.format(processed_data)

Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors


### ✅ Output
You now have a model-ready Hugging Face dataset.

In [13]:
# Show which fields the first formatted example exposes.
data[0].keys()

dict_keys(['id', 'chunk_id', 'tokens', 'input_ids', 'attention_mask', 'labels', 'bio_labels'])

In [25]:
# Print the first ten tokens of the first example next to their ids and labels.
preview_columns = [
    data[0][column][:10]
    for column in ("tokens", "input_ids", "labels", "bio_labels")
]
for token, idx, label, bio_label in zip(*preview_columns):
    print(f"Token ({idx}): {token}, {label}, {bio_label}")

Token (7270): EU, 1, B-ORG
Token (22961): rejects, 0, O
Token (1528): German, 5, B-MISC
Token (1840): call, 0, O
Token (1106): to, 0, O
Token (21423): boycott, 0, O
Token (1418): British, 5, B-MISC
Token (2495): la, 0, O
Token (12913): ##mb, 0, O
Token (119): ., 0, O
