## Umwandlung der annotierten Daten ins BIO-Format

In [5]:
import json
from nltk.tokenize import TreebankWordTokenizer

def convert_to_conll_span_tokenizer(json_data, tokenizer=None):
    """Convert character-offset entity annotations into token-level BIO tags.

    Every item of ``json_data`` must provide ``item['text']`` (the raw string)
    and ``item['entities']`` (an iterable of dicts with ``'start'``, ``'end'``
    and ``'label'`` keys; start/end are character offsets into the text and
    the end offset is exclusive).

    A token belongs to an entity when their character ranges overlap. The
    first token that overlaps an entity is tagged ``B-<label>``, every later
    token of the same entity ``I-<label>``, everything else ``O``. Deciding
    ``B-`` by "first overlapping token" instead of "token start equals entity
    start" keeps the output valid BIO even when an annotation begins on
    leading whitespace or in the middle of a token.

    When several entities overlap the same token, the entity listed first in
    ``item['entities']`` wins.

    Args:
        json_data: Iterable of annotated items as described above.
        tokenizer: Optional object with a ``span_tokenize(text)`` method that
            yields ``(start, end)`` character offsets. Defaults to a fresh
            ``TreebankWordTokenizer``.

    Returns:
        A list with one entry per item; each entry is a list of
        ``(token, tag)`` tuples in text order.
    """
    if tokenizer is None:
        tokenizer = TreebankWordTokenizer()
    dataset = []

    for item in json_data:
        text = item['text']
        text_len = len(text)

        # Keep only spans that are non-empty and lie fully inside the text;
        # negative offsets are rejected as well.
        spans = [
            (ent['start'], ent['end'], ent['label'])
            for ent in item['entities']
            if 0 <= ent['start'] < ent['end'] <= text_len
        ]

        # Indices of the spans that already received their B- token.
        opened = set()
        conll_sample = []

        for tok_start, tok_end in tokenizer.span_tokenize(text):
            tag = 'O'

            for idx, (ent_start, ent_end, label) in enumerate(spans):
                # Half-open interval overlap test.
                if tok_start < ent_end and tok_end > ent_start:
                    if idx in opened:
                        tag = f'I-{label}'
                    else:
                        opened.add(idx)
                        tag = f'B-{label}'
                    break

            conll_sample.append((text[tok_start:tok_end], tag))

        dataset.append(conll_sample)

    return dataset

def save_to_conll(dataset, filename):
    """Write BIO-tagged samples to ``filename`` in a CoNLL-style layout.

    Each ``(token, tag)`` pair becomes one ``"token tag"`` line, and every
    sample is followed by a single blank line. The file is written as UTF-8
    and an existing file is overwritten.

    Args:
        dataset: Iterable of samples, each an iterable of ``(token, tag)``
            pairs.
        filename: Path of the output file.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for sentence in dataset:
            out.writelines(f"{word} {label}\n" for word, label in sentence)
            # Blank line separates consecutive samples.
            out.write("\n")

# Load the annotated JSON, convert it to BIO-tagged samples and write them out
with open('../../data/data_annotated.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

conll_data = convert_to_conll_span_tokenizer(data)
save_to_conll(conll_data, '../../data/output_data.conll')
