In [12]:
import json
import os

from datasets import load_dataset

In [None]:
# NOTE(review): load_dataset is already imported in the first cell; this
# repeated import is harmless but redundant.
from datasets import load_dataset

# Load the "original" configuration of the rcds/swiss_citation_extraction dataset.
ds = load_dataset("rcds/swiss_citation_extraction", "original")
# Last expression of the cell: display the loaded object for inspection.
ds

In [2]:
def join_tokens(tokens):
    """Detokenize a sequence of string tokens into one readable string.

    Tokens are separated by a single space, except that the punctuation
    tokens listed below are attached directly to the text before them.
    Leading and trailing whitespace is stripped from the final result.

    Args:
        tokens: iterable of string tokens.

    Returns:
        The joined text as a single string ("" for an empty input).
    """
    # code from Gliner_Studio: https://colab.research.google.com/drive/1Kl3TrpiGBpMw569ek_AL6Ee3uqBK-Gfw?usp=sharing
    no_space_before = {",", ".", "!", "?", ":", ";", "..."}
    joined = ""
    for tok in tokens:
        if tok not in no_space_before:
            # Ordinary token: always preceded by a separating space.
            joined += " " + tok
            continue
        # Punctuation: drop any trailing whitespace and glue it on.
        joined = joined.rstrip() + tok
    return joined.strip()

In [3]:
# Pick one record from the train split to use as a running example.
example = ds['train'][42]
# Show its detokenized 'considerations' text next to its 'NER_labels'.
join_tokens(example['considerations']), example['NER_labels']

('in Erwägung, dass das Obergericht erwog, nach Einreichung einer Beschwerde mit ungebührlichem Inhalt sei die Beschwerdeführerin mit Verfügung vom 7. Oktober 2011 nach § 187 GOG i. V. m. Art. 132 Abs. 1 und 2 ZPO ( unter Androhung der Säumnisfolgen ) zur Einreichung einer Beschwerdeschrift ohne ungebührlichen Inhalt innert 5 Tagen aufgefordert worden, indessen sei auch die neue Eingabe ungebührlich, die erste Eingabe sei somit nicht verbessert worden, weshalb androhungsgemäss die Beschwerde als nicht erfolgt gelte und das Beschwerdeverfahren abzuschreiben sei unter Auferlegung der unnötigen Prozesskosten von Fr. 500. - - an die Beschwerdeführerin ( Art. 108 ZPO ), dass die Beschwerde nach Art. 72ff. BGG nebst einem Antrag eine Begründung zu enthalten hat, in welcher in gedrängter Form dargelegt wird, inwiefern der angefochtene Entscheid Recht ( Art. 95f. BGG ) verletzt ( Art. 42 Abs. 1 und 2 BGG ), ansonst auf die Beschwerde nicht eingetreten wird ( Art. 108 Abs. 1 lit. b BGG ), dass 

In [4]:
def convert_ner_labels(ner_labels):
    """Convert a per-token label sequence into ``[start, end, label]`` spans.

    A span opens on a begin tag (1 -> "citation", 3 -> "law") and is extended
    over every directly following token tagged 1 or 2 (for a citation) or
    3 or 4 (for a law). Tokens with any other tag, including a continuation
    tag (2 or 4) that has no opening tag before it, are skipped.

    Args:
        ner_labels: sequence of integer label ids, one per token.

    Returns:
        A list of ``[start, end, label]`` lists, in order of appearance.
        ``start`` and ``end`` are token indices and ``end`` is inclusive.
        An empty input yields an empty list.
    """
    ner = []
    total = len(ner_labels)
    i = 0
    while i < total:
        tag = ner_labels[i]
        if tag == 1:
            continuation, label = (1, 2), "citation"
        elif tag == 3:
            continuation, label = (3, 4), "law"
        else:
            i += 1
            continue

        start = i
        # Extend the span while the *next* token still belongs to it.
        while i < total - 1 and ner_labels[i + 1] in continuation:
            i += 1
        ner.append([start, i, label])
        # Step past the span's last token. Without this, a span that ends on
        # a begin tag (e.g. any single-token span) would be detected again
        # on the next iteration and the loop would never terminate.
        i += 1

    return ner

# Try the conversion on the sample example's labels and display the spans.
convert_ner_labels(example['NER_labels'])

[[35, 37, 'law'],
 [109, 112, 'law'],
 [119, 121, 'law'],
 [147, 149, 'law'],
 [155, 157, 'law'],
 [174, 183, 'law'],
 [222, 225, 'citation']]

In [5]:
def view(tokenized_text, ners):
    """Print the tokens covered by each span, one ``label: [tokens]`` line per span.

    Args:
        tokenized_text: sequence of tokens that the span indices refer to.
        ners: iterable of ``(start, end, label)`` triples; ``end`` is inclusive.
    """
    for first, last, tag in ners:
        # `last` is inclusive, so the slice has to run one index further.
        covered = tokenized_text[first:last + 1]
        print(f"{tag}: {covered}")

# Print the token slice behind every span extracted from the sample example.
view(example['considerations'], convert_ner_labels(example['NER_labels']))

law: ['Art', '.', '132']
law: ['Art', '.', '108', 'ZPO']
law: ['Art', '.', '72ff']
law: ['Art', '.', '95f']
law: ['Art', '.', '42']
law: ['Art', '.', '108', 'Abs', '.', '1', 'lit', '.', 'b', 'BGG']
citation: ['BGE', '133', 'IV', '286']


In [18]:
# merge train, validation, test into one dataset which is a list of dicts
# each dict has 2 keys 'tokenized_text' which is same as 'considerations' in dataset
# and 'ner' which is a list of lists where each list has 3 elements: start, end, label

def merge(ds):
    """Flatten every split of ``ds`` into a single list of records.

    Each record is a dict with two keys: 'tokenized_text' (the example's
    'considerations' value) and 'ner' (the spans produced by
    ``convert_ner_labels`` from the example's 'NER_labels').
    A progress counter is printed while each split is processed.

    Args:
        ds: mapping from split name to an iterable, sized collection of
            examples that have 'considerations' and 'NER_labels' keys.

    Returns:
        One list holding the records of all splits, in iteration order.
    """
    merged = []
    for split_name in ds:
        print("Processing", split_name)
        rows = ds[split_name]
        for position, row in enumerate(rows, start=1):
            # "\r" keeps the counter on a single, continuously updated line.
            print(f"Processing {position}/{len(rows)}", end="\r")
            merged.append({
                'tokenized_text': row['considerations'],
                'ner': convert_ner_labels(row['NER_labels']),
            })
        # Terminate the progress line before the next split starts.
        print()
    return merged

# Build the combined record list over every split in ds.
data = merge(ds)


Processing train
Processing 87760/87760
Processing validation
Processing 12359/12359
Processing test
Processing 27364/27364


In [8]:
def create_set(ds):
    """Convert one collection of examples into a list of records.

    Each record is a dict with two keys: 'tokenized_text' (the example's
    'considerations' value) and 'ner' (the spans produced by
    ``convert_ner_labels`` from the example's 'NER_labels').
    A progress counter is printed while the examples are processed.

    Args:
        ds: iterable, sized collection of examples that have
            'considerations' and 'NER_labels' keys.

    Returns:
        The list of records, in iteration order.
    """
    records = []
    for position, row in enumerate(ds, start=1):
        # "\r" keeps the counter on a single, continuously updated line.
        print(f"Processing {position}/{len(ds)}", end="\r")
        records.append({
            'tokenized_text': row['considerations'],
            'ner': convert_ner_labels(row['NER_labels']),
        })
    # Terminate the progress line.
    print()

    return records

In [None]:
# Convert the train split and save it as JSON.
citation_train = create_set(ds['train'])

# open(..., 'w') does not create parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("citation_data", exist_ok=True)
with open("citation_data/citation_train.json", 'w') as f:
    json.dump(citation_train, f)

Processing 87760/87760


In [None]:
# Convert the validation split and save it as JSON.
citation_val = create_set(ds['validation'])

# open(..., 'w') does not create parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("citation_data", exist_ok=True)
with open("citation_data/citation_validation.json", 'w') as f:
    json.dump(citation_val, f)
    

In [16]:
# Convert the test split and save it as JSON.
citation_test = create_set(ds['test'])

# open(..., 'w') does not create parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("citation_data", exist_ok=True)
with open("citation_data/citation_test.json", 'w') as f:
    json.dump(citation_test, f)

Processing 27364/27364


In [19]:
# Inspect the first record of the merged list.
data[0]

{'tokenized_text': ['Erwägungen',
  ':',
  '1',
  '.',
  '1',
  '.',
  '1',
  '.',
  'Die',
  'Beschwerde',
  'kann',
  'wegen',
  'Rechtsverletzung',
  'gemäss',
  'Art',
  '.',
  '95',
  'und',
  'Art',
  '.',
  '96',
  'BGG',
  'erhoben',
  'werden',
  '.',
  'Das',
  'Bundesgericht',
  'wendet',
  'das',
  'Recht',
  'von',
  'Amtes',
  'wegen',
  'an',
  '(',
  'Art',
  '.',
  '106',
  'Abs',
  '.',
  '1',
  'BGG',
  ')',
  '.',
  'Es',
  'ist',
  'somit',
  'weder',
  'an',
  'die',
  'in',
  'der',
  'Beschwerde',
  'geltend',
  'gemachten',
  'Argumente',
  'noch',
  'an',
  'die',
  'Erwägungen',
  'der',
  'Vorinstanz',
  'gebunden',
  ';',
  'es',
  'kann',
  'eine',
  'Beschwerde',
  'aus',
  'einem',
  'anderen',
  'als',
  'dem',
  'angerufenen',
  'Grund',
  'gutheissen',
  'und',
  'es',
  'kann',
  'sie',
  'mit',
  'einer',
  'von',
  'der',
  'Argumentation',
  'der',
  'Vorinstanz',
  'abweichenden',
  'Begründung',
  'abweisen',
  '(',
  'vgl',
  '.',
  'BGE',
  '1

In [None]:
import json

# Save the merged records (all splits) as a single JSON file.
# open(..., "w") does not create parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("citation_data", exist_ok=True)
with open("citation_data/swiss_citation_extraction.json", "w") as f:
    json.dump(data, f)