In [9]:
import json
# Input path: JSON file holding the extracted citation records.
file = 'citation_data/swiss_citation_extraction.json'
# Output path: JSON file that receives the filtered subset of those records.
filtered_file = 'citation_data/swiss_citation_extraction_filtered.json'

In [2]:
# Load the raw records from `file`. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default locale encoding; the
# records contain non-ASCII characters (e.g. German umlauts).
with open(file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Token list of every record, and the number of tokens in each list.
tokenized_texts = [record['tokenized_text'] for record in data]
text_lens = [len(tokens) for tokens in tokenized_texts]

In [4]:
# Summary statistics of the per-text token counts.
n_texts = len(tokenized_texts)
lens_ascending = sorted(text_lens)  # sorted once, reused for median and top 10
print('Number of texts:', n_texts)
print('Mean text length:', sum(text_lens) / n_texts)
print('Max text length:', max(text_lens))
print('Min text length:', min(text_lens))
# Element at index len // 2 of the sorted lengths (the upper of the two
# middle values when the count is even).
print('Median text length:', lens_ascending[len(text_lens) // 2])
print('Top 10 text lengths:', lens_ascending[::-1][:10])

Number of texts: 127483
Mean text length: 209.63097040389698
Max text length: 423
Min text length: 31
Median text length: 221
Top 10 text lengths: [423, 416, 382, 358, 354, 352, 347, 338, 338, 336]


In [6]:
# Share of texts whose token count exceeds the MAXLEN threshold.
MAXLEN = 32000
print(
    f'Proportion of text lengths more than {MAXLEN}:',
    sum(length > MAXLEN for length in text_lens) / len(text_lens),
)

Proportion of text lengths more than 32000: 0.0


In [7]:
# Keep records that have at least one NER annotation and whose token count
# lies in the range 1..MAXLEN (inclusive).
filtered_data = [
    record
    for record, n_tokens in zip(data, text_lens)
    if 0 < n_tokens <= MAXLEN and len(record['ner']) > 0
]
print('Number of texts remaining:', len(filtered_data))

Number of texts remaining: 113464


In [10]:
# Write the filtered records to `filtered_file` as JSON.
with open(filtered_file, mode='w') as out_handle:
    json.dump(filtered_data, out_handle)

In [11]:
# Display the first retained record to inspect its structure.
filtered_data[0]

{'tokenized_text': ['Erwägungen',
  ':',
  '1',
  '.',
  '1',
  '.',
  '1',
  '.',
  'Die',
  'Beschwerde',
  'kann',
  'wegen',
  'Rechtsverletzung',
  'gemäss',
  'Art',
  '.',
  '95',
  'und',
  'Art',
  '.',
  '96',
  'BGG',
  'erhoben',
  'werden',
  '.',
  'Das',
  'Bundesgericht',
  'wendet',
  'das',
  'Recht',
  'von',
  'Amtes',
  'wegen',
  'an',
  '(',
  'Art',
  '.',
  '106',
  'Abs',
  '.',
  '1',
  'BGG',
  ')',
  '.',
  'Es',
  'ist',
  'somit',
  'weder',
  'an',
  'die',
  'in',
  'der',
  'Beschwerde',
  'geltend',
  'gemachten',
  'Argumente',
  'noch',
  'an',
  'die',
  'Erwägungen',
  'der',
  'Vorinstanz',
  'gebunden',
  ';',
  'es',
  'kann',
  'eine',
  'Beschwerde',
  'aus',
  'einem',
  'anderen',
  'als',
  'dem',
  'angerufenen',
  'Grund',
  'gutheissen',
  'und',
  'es',
  'kann',
  'sie',
  'mit',
  'einer',
  'von',
  'der',
  'Argumentation',
  'der',
  'Vorinstanz',
  'abweichenden',
  'Begründung',
  'abweisen',
  '(',
  'vgl',
  '.',
  'BGE',
  '1

In [12]:
# Tally how often each NER label occurs across a collection of examples.
def count_labels(data):
    """Return a dict mapping each NER label to its number of occurrences.

    Each example's entities are read with ``example.get("ner", [])``, so an
    example without an "ner" key contributes nothing. The label is taken
    from index 2 of every entity. Keys appear in first-seen order.
    """
    label_count = {}
    # Flatten all entities of all examples into a single stream of labels.
    labels = (
        entity[2]
        for example in data
        for entity in example.get("ner", [])
    )
    for label in labels:
        label_count[label] = label_count.get(label, 0) + 1
    return label_count

# Count label occurrences over the filtered records and display the result.
label_count = count_labels(filtered_data)
label_count

{'law': 466729, 'citation': 134903}