In [1]:
from datasets import load_dataset

# Fashion-brand NER corpus, addressed by its dataset id. The saved output of
# this cell reports two splits being generated: "train" (1235 examples) and
# "eval" (500 examples).
ds = load_dataset(path="explosion/ner-fashion-brands")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 1235/1235 [00:00<00:00, 25938.09 examples/s]
Generating eval split: 100%|██████████| 500/500 [00:00<00:00, 70010.08 examples/s]


In [11]:
import os
import sys

# Root of the local chisel checkout, needed so the `chisel.*` imports resolve.
# The default is the original hard-coded location; set the CHISEL_ROOT
# environment variable to run the notebook from a different checkout.
CHISEL_ROOT = os.environ.get("CHISEL_ROOT", "/workspaces/chisel/")

# Guarded so that re-running this cell does not keep appending the same entry
# to sys.path.
if CHISEL_ROOT not in sys.path:
    sys.path.append(CHISEL_ROOT)

from chisel.extraction.parsers.json_span_parser import JSONSpanParser
import json

In [23]:
# 📦 Imports
from typing import Tuple, List
from chisel.extraction.base.protocols import Parser
from chisel.extraction.models.models import EntitySpan
from transformers import AutoTokenizer
from chisel.extraction.tokenizers.hf_tokenizer import HFTokenizer
from chisel.extraction.span_aligners.token_span_aligner import TokenSpanAligner
from chisel.extraction.labelers.bilo_labeler import BILOLabeler
from chisel.extraction.labelers.label_encoder import SimpleLabelEncoder
from chisel.extraction.validators.validators import DefaultParseValidator, HFTokenAlignmentValidator
from chisel.extraction.exporters.dataset_exporter import DatasetExporter
from chisel.extraction.models.models import ChiselRecord
from chisel.extraction.models.models import EntitySpan
from typing import List
import json

In [24]:
class JSONSpanParser(Parser):
    """Parser that turns one span-annotated document into text plus entity spans.

    Note: defining this class rebinds the name ``JSONSpanParser`` that an
    earlier cell imports from ``chisel.extraction.parsers.json_span_parser``;
    every later use in the notebook resolves to this notebook-local version.
    """

    # The annotation is quoted so it is never evaluated at runtime and needs no
    # additional typing imports.
    def parse(self, doc: "str | dict") -> Tuple[str, List[EntitySpan]]:
        """
        Parse a span-annotated document into plain text and entity spans.

        The document must contain:
        - "text": a string of the original text
        - "spans": a list of dicts with "start", "end", and "label"
          (optional; a missing or null value yields no entities)

        Parameters:
        ----------
        doc : str or dict
            Either a JSON string encoding a single document with
            character-level entity annotations, or the already-decoded
            mapping (the pipeline loop in this notebook passes dataset rows
            directly).

        Returns:
        -------
        Tuple[str, List[EntitySpan]]
            The original text and a list of extracted entity spans. Each
            span's text is the slice ``text[start:end]``.

        Raises:
        ------
        json.JSONDecodeError
            If ``doc`` is a string that is not valid JSON.
        KeyError
            If "text" is missing, or a span lacks "start", "end" or "label".
        """
        # Honour the documented string input: decode it before key access.
        if isinstance(doc, (str, bytes, bytearray)):
            doc = json.loads(doc)

        text = doc["text"]
        # `or []` also covers an explicit null/None value for "spans".
        raw_spans = doc.get("spans") or []
        entities = [
            EntitySpan(
                text=text[span["start"] : span["end"]],
                start=span["start"],
                end=span["end"],
                label=span["label"],
            )
            for span in raw_spans
        ]
        return text, entities


In [None]:
# 🔍 Components
# One object per pipeline stage: parse, tokenize, align spans to tokens, and
# assign per-token tags.
parser = JSONSpanParser()
tokenizer = HFTokenizer(model_name="bert-base-cased")
aligner = TokenSpanAligner()
labeler = BILOLabeler()

# Tag -> integer id mapping; each id is the tag's position in the tuple below
# ("O" is 0, followed by the B/I/L/U tags for the single FASHION_BRAND type).
label_encoder = SimpleLabelEncoder(
    label_to_id={
        tag: tag_id
        for tag_id, tag in enumerate(
            (
                "O",
                "B-FASHION_BRAND",
                "I-FASHION_BRAND",
                "L-FASHION_BRAND",
                "U-FASHION_BRAND",
            )
        )
    }
)

# Validators applied after parsing and after labeling, respectively. The
# alignment validator is handed the object exposed as `tokenizer.tokenizer`.
parse_validators = [DefaultParseValidator()]
label_validators = [HFTokenAlignmentValidator(tokenizer=tokenizer.tokenizer)]

# Destination for the processed records.
exporter = DatasetExporter(output_path="./data/fashion-brands-ner")

In [32]:
# Re-created on every run, so re-executing this cell never accumulates
# duplicate records.
processed_data = []

# 🔁 Pipeline loop
# Per training example: parse -> validate parse -> tokenize -> align spans to
# tokens -> label -> encode labels -> validate alignment -> build record.
# NOTE(review): the saved output of this cell includes a tokenizer warning
# about a 525-token sequence exceeding the model's 512-token maximum; nothing
# in this loop truncates or chunks such examples (chunk_id is always 0) --
# confirm this is handled before training.
for idx, example in enumerate(ds["train"]):
    text, entities = parser.parse(example)
    for validator in parse_validators:
        validator.validate(text, entities)

    tokens = tokenizer.tokenize(text)
    token_entity_spans = aligner.align(entities, tokens)

    labels = labeler.label(tokens, token_entity_spans)
    encoded_labels = label_encoder.encode(labels)

    for validator in label_validators:
        validator.validate(tokens, token_entity_spans)

    # Token ids are needed both for the decoded text and the model inputs, so
    # collect them once.
    input_ids = [token.id for token in tokens]
    record = ChiselRecord(
        id=str(idx),
        chunk_id=0,
        # Text is re-decoded from the token ids rather than copied from `text`.
        text=tokenizer.tokenizer.decode(input_ids),
        tokens=tokens,
        input_ids=input_ids,
        # No padding is applied here, so every position is attended to.
        attention_mask=[1] * len(tokens),
        entities=[aligned.entity for aligned in token_entity_spans],
        bio_labels=labels,
        labels=encoded_labels,
    )
    processed_data.append(record)

# export
exporter.export(processed_data)
print("✅ Export complete")

Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Saving the dataset (1/1 shards): 100%|██████████| 1235/1235 [00:00<00:00, 9064.38 examples/s]

✅ Export complete





IsADirectoryError: [Errno 21] Is a directory: './data/fashion-brands-ner.pkl'