In [12]:
import json

In [13]:
with open('anon_data/complete_ner_training_data.json', 'r') as f:
    data = json.load(f)

In [25]:
def join_tokens(tokens):
    # code from Gliner_Studio: https://colab.research.google.com/drive/1Kl3TrpiGBpMw569ek_AL6Ee3uqBK-Gfw?usp=sharing
    # Joining tokens with space, but handling special characters correctly
    text = ""
    for token in tokens:
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()

join_tokens(data[110771]['tokenized_text'])

"Bundesgericht Tribunal fédéral Tribunale federale Tribunal federal 5A_541 / 2022 Urteil vom 15. Juli 2022 II. zivilrechtliche Abteilung Besetzung Bundesrichterin Escher, präsidierendes Mitglied, Gerichtsschreiber Möckli. Verfahrensbeteiligte Angela Leuenberger, Beschwerdeführer, gegen Familiengericht Muri, Seetalstrasse 8, 5630 Muri. Gegenstand Erweiterung einer Beistandschaft, Beschwerde gegen den Entscheid des Obergerichts des Kantons Aargau, Kammer für Kindes - und Erwachsenenschutz, vom 23. Juni 2022 ( XBE. 2022. 16 ). Sachverhalt: Die Vorgeschichte ist dem Bundesgericht aus einer Vielzahl von Verfahren bekannt. Der Beschwerdeführer leidet an gutachterlich festgestellten Wahnvorstellungen und befindet sich zur Zeit in Untersuchungshaft. Seit dem 2. Juni 2021 besteht eine Vertretungsbeistandschaft mit Vermögensverwaltung mit teilweisem Entzug der Handlungsfähigkeit. Am 29. Oktober 2021 wandte er sich an das Familiengericht Muri und beantragte die Sistierung der Wohnungsmiete für di

In [3]:
tokenized_texts = [data[i]['tokenized_text'] for i in range(len(data))]
text_lens = [len(tokenized_texts[i]) for i in range(len(tokenized_texts))]

In [4]:
# print summary statistics about text lengths
print('Number of texts:', len(tokenized_texts))
print('Mean text length:', sum(text_lens) / len(tokenized_texts))
print('Max text length:', max(text_lens))
print('Min text length:', min(text_lens))
print('Median text length:', sorted(text_lens)[len(text_lens) // 2])
print('Top 10 text lengths:', sorted(text_lens, reverse=True)[:10])

Number of texts: 124089
Mean text length: 1767.4813641821595
Max text length: 93973
Min text length: 0
Median text length: 855
Top 10 text lengths: [93973, 87855, 78326, 71245, 58304, 58178, 57889, 56923, 44227, 40641]


In [5]:
# proportion of text lenghts more than 32000
MAXLEN = 32000
print(f'Proportion of text lengths more than {MAXLEN}:', sum([1 for l in text_lens if l > MAXLEN]) / len(text_lens))

Proportion of text lengths more than 32000: 0.00013699844466471644


In [6]:
# filter for data points less than 32000 length and existing NER Label
filtered_data = [data[i] for i in range(len(data)) if text_lens[i] <= MAXLEN and text_lens[i] > 0 and len(data[i]['ner']) > 0]
print('Number of texts remaining:', len(filtered_data))

Number of texts remaining: 124089


In [7]:
# save the filtered data
with open('anon_data/filtered_ner_training_data.json', 'w') as f:
    json.dump(filtered_data, f)

In [8]:
filtered_data[0]

{'tokenized_text': ['Parteien',
  'Maëlle',
  'Meyer',
  ',',
  'Beschwerdeführer',
  ',',
  'vertreten',
  'durch',
  'Fürsprecher',
  'Roman',
  'Weber',
  ',',
  'gegen',
  'Verhöramt',
  'des',
  'Kantons',
  'Schwyz',
  ',',
  'Sicherheitsstützpunkt',
  'Biberbrugg',
  ',',
  'Postfach',
  '74',
  ',',
  '8836',
  'Bennau',
  ',',
  'Kantonsgerichtspräsident',
  'des',
  'Kantons',
  'Schwyz',
  ',',
  'Kollegiumstrasse',
  '28',
  ',',
  'Postfach',
  '2265',
  ',',
  '6431',
  'Schwyz',
  '.',
  'Gegenstand',
  'Untersuchungshaft',
  ',',
  'Beschwerde',
  'in',
  'Strafsachen',
  'gegen',
  'die',
  'Verfügung',
  'des',
  'Kantonsgerichtspräsidenten',
  'des',
  'Kantons',
  'Schwyz',
  'vom',
  '5',
  '.',
  'Januar',
  '2007',
  '.',
  'Sachverhalt',
  ':',
  'A',
  '.',
  'Die',
  'Kantonspolizei',
  'Schwyz',
  'verhaftete',
  'Maëlle',
  'Meyer',
  'am',
  '2',
  '.',
  'Januar',
  '2007',
  'wegen',
  'des',
  'Verdachts',
  ',',
  'Einbruchdiebstähle',
  'begangen',
  '

In [14]:

def chunk_data(sample, chunk_size, offset):
    tokenized_text = sample['tokenized_text']
    ners = sample['ner']
    new_data = []
    for i in range(0, len(tokenized_text), chunk_size):
        new_data_dict = {}
        start = i
        end = i + chunk_size + offset
        new_data_dict['tokenized_text'] = tokenized_text[start:end]
        new_data_dict['ner'] = []
        for ner_label in ners:
            if ner_label[0] >= i and ner_label[0] < end:
                new_data_dict['ner'].append([ner_label[0] - i, ner_label[1] - i, ner_label[2]])
        if len(new_data_dict['ner']) > 0:
            new_data.append(new_data_dict)
    return new_data

def chunk_dataset(data, chunk_size, offset):
    new_data = []
    for i, sample in enumerate(data):
        print(f"Processing sample {i + 1} of {len(data)}", end='\r')
        new_data.extend(chunk_data(sample, chunk_size, offset))
    print()
    return new_data

# After filtering the data, we divide it into chunks with an offset 
chunk_size = 256
offset = 10

chunked_data = chunk_dataset(filtered_data, chunk_size, offset)
print('Number of data points:', len(data))
print('Number of chunks:', len(chunked_data))


Processing sample 80220 of 80220
Number of data points: 124089
Number of chunks: 303794


In [15]:
chunked_data[12312]['ner']

[[6, 7, 'a_name'],
 [37, 38, 'a_name'],
 [245, 246, 'a_name'],
 [174, 175, 'a_name'],
 [177, 178, 'a_name'],
 [9, 10, 'a_name'],
 [26, 27, 'a_name']]

In [16]:
# save the data
with open('anon_data/chunked_ner_training_data.json', 'w') as f:
    json.dump(chunked_data, f)

In [19]:
chunked_data[12312]['tokenized_text'][5:39]

['de',
 'Ghulam',
 'Hess',
 'et',
 'Oliver',
 'Eichelberger',
 ',',
 'ainsi',
 'que',
 'd',
 "'",
 'une',
 'dénonciation',
 'émanant',
 'd',
 "'",
 'un',
 'tiers',
 ',',
 'le',
 'dénommé',
 'Carmen',
 'Thurnher',
 '.',
 'Dans',
 'sa',
 'plainte',
 'du',
 '5',
 'juillet',
 '2004',
 ',',
 'Ghulam',
 'Hess']

In [20]:
tokenized_texts = [chunked_data[i]['tokenized_text'] for i in range(len(chunked_data))]
text_lens = [len(tokenized_texts[i]) for i in range(len(tokenized_texts))]
print('Number of texts:', len(tokenized_texts))
print('Mean text length:', sum(text_lens) / len(tokenized_texts))
print('Max text length:', max(text_lens))
print('Min text length:', min(text_lens))
print('Median text length:', sorted(text_lens)[len(text_lens) // 2])
print('Top 10 text lengths:', sorted(text_lens, reverse=True)[:10])

Number of texts: 303794
Mean text length: 261.66258385616567
Max text length: 266
Min text length: 2
Median text length: 266
Top 10 text lengths: [266, 266, 266, 266, 266, 266, 266, 266, 266, 266]


In [10]:

# load test data and map ner labels to .split('_')[1]
import json
with open('anon_data/test_data_baseline.json', 'r') as f:
    test_data = json.load(f)

for i in range(len(test_data)):
    for j in range(len(test_data[i]['ner'])):
        nr = test_data[i]['ner'][j][2]
        if nr == 'person':
            test_data[i]['ner'][j][2] = 'a_name'
        if nr == 'organization':
            test_data[i]['ner'][j][2] = 'a_organisation'
        if nr == 'place':
            test_data[i]['ner'][j][2] = 'a_place'

test_data[0]['ner']

[[212, 213, 'a_name'], [230, 231, 'a_name'], [215, 216, 'a_name']]

In [11]:
# save the new test data in test_data_baseline.json
with open('anon_data/test.json', 'w') as f:
    json.dump(test_data, f)

In [6]:
with open('anon_data/chunked_ner_training_data.json', 'r') as f:
    chunked_data = json.load(f)

with open('anon_data/test_data_baseline.json', 'r') as f:
    test_data = json.load(f)

In [7]:
# count all labels in the test dataset
def count_labels(data):
    label_count = {}
    for example in data:
        ner_data = example.get("ner", [])
        for entity in ner_data:
            label = entity[2]  # Assuming the label is the third element in the entity list
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
    return label_count

label_count = count_labels(chunked_data)
label_count

{'a_name': 999522, 'a_organisation': 137546, 'a_place': 37682}