In [1]:
import sys
sys.path.append("/workspaces/chisel/")

# 👗 Example: Processing Fashion Brand NER (JSON Format) with Chisel

This example shows how to preprocess the explosion/ner-fashion-brands dataset into ChiselRecord objects for training transformer-based NER models using BILO labeling.

## 📥 Step 1: Load the Dataset

In [2]:
from datasets import load_dataset
ds = load_dataset("explosion/ner-fashion-brands")

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the dataset since explosion/ner-fashion-brands couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/vscode/.cache/huggingface/datasets/explosion___ner-fashion-brands/default/0.0.0/3e49f04a58c644035071317efa1c3d6e4a52e6e6 (last modified on Mon Jun  9 06:26:22 2025).


In [7]:
ds['train'][0].keys()

dict_keys(['text', 'meta', '_input_hash', '_task_hash', 'tokens', 'spans', '_session_id', '_view_id', 'answer'])

## 🧩 Step 2: Implement a JSON Span Parser
The dataset provides character-level spans in a spans field. We write a parser that extracts these into Chisel's EntitySpan format.

In [8]:
from typing import Tuple, List
from chisel.extraction.base.protocols import Parser
from chisel.extraction.models.models import EntitySpan

class JSONSpanParser(Parser):
    def parse(self, doc: dict) -> Tuple[str, List[EntitySpan]]:
        text = doc["text"]
        entities = [
            EntitySpan(
                text=text[e["start"]:e["end"]],
                start=e["start"],
                end=e["end"],
                label=e["label"]
            )
            for e in doc.get("spans", [])
        ]
        return text, entities

## 🔧 Step 3: Initialize Chisel Components

In [10]:
from chisel.extraction.tokenizers.hf_tokenizer import HFTokenizer
from chisel.extraction.span_aligners.token_span_aligner import TokenSpanAligner
from chisel.extraction.labelers.bilo_labeler import BILOLabeler
from chisel.extraction.labelers.label_encoder import SimpleLabelEncoder
from chisel.extraction.validators.validators import DefaultParseValidator, HFTokenAlignmentValidator
from chisel.extraction.formatters.torch_formatter import TorchDatasetFormatter
from chisel.extraction.models.models import ChiselRecord

## Component Setup

In [11]:
parser = JSONSpanParser()
tokenizer = HFTokenizer(model_name="bert-base-cased")
aligner = TokenSpanAligner()
labeler = BILOLabeler()

label_encoder = SimpleLabelEncoder(label_to_id={
    'O': 0,
    'B-FASHION_BRAND': 1,
    'I-FASHION_BRAND': 2,
    'L-FASHION_BRAND': 3,
    'U-FASHION_BRAND': 4,
})

parse_validators = [DefaultParseValidator()]
label_validators = [HFTokenAlignmentValidator(tokenizer=tokenizer.tokenizer)]
formatter = TorchDatasetFormatter()

## 🔄 Step 4: Run the Preprocessing Pipeline

In [12]:
processed_data = []

for idx, example in enumerate(ds["train"]):
    text, entities = parser.parse(example)

    for validator in parse_validators:
        validator.validate(text, entities)

    tokens = tokenizer.tokenize(text)
    token_entity_spans = aligner.align(entities, tokens)

    labels = labeler.label(tokens, token_entity_spans)
    encoded_labels = label_encoder.encode(labels)

    for validator in label_validators:
        validator.validate(tokens, token_entity_spans)

    record = ChiselRecord(
        id=str(idx),
        chunk_id=0,
        text=tokenizer.tokenizer.decode([token.id for token in tokens]),
        tokens=tokens,
        input_ids=[token.id for token in tokens],
        attention_mask=[1] * len(tokens),
        entities=[tes.entity for tes in token_entity_spans],
        bio_labels=labels,
        labels=encoded_labels
    )
    processed_data.append(record)

data = formatter.format(processed_data)

Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors


### ✅ Output
You now have a torch dataset ready for training!

In [17]:
data[0]

{'input_ids': tensor([ 1135,   112,   188,  1155, 12629,  1111,  1134,  2736,  1618,   117,
          7572,   146,  1631,  1115,  1103,  1167,  2379,  1103,  1716,  2736,
          1103,  1618,  1103,  1947,   117,  1134,  1111,  1143,  2086,  1280,
          1114,   170, 22591,  1566,  3146,  1134,  2972,  1103,  1716,  1702,
          1112,  2379,  1112,  1936,  1229,  1253,  2355,  1122,  1107,  1282]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]),
 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0])}