In [23]:
import json
import pandas as pd
import numpy as np

from utils import join_tokens, tokenize_text

import random



In [11]:
#Hyperparameters

# Maximum number of tokens per chunk; passed as chunk_size when the documents are chunked.
CHUNKSIZE = 256
# Fraction of documents assigned to the training split.
_ptrain= .7
# Fraction of documents assigned to the validation split.
_pval = .15
# Fraction intended for the test split. The split boundaries are computed from
# _ptrain and _pval, so the three fractions should sum to 1.
_ptest = .15

In [10]:
# Load the annotated corpus. The encoding is pinned to UTF-8 (the JSON standard)
# so that reading does not depend on the platform's locale default, which would
# garble or reject non-ASCII characters on some systems.
with open("anon_data/ner_data_2.json", 'r', encoding="utf-8") as f:
    data = json.load(f)


In [69]:
len(data)

124089

In [16]:
# Inspect one hand-picked document (index 110771): first its text, then its
# entity annotations stored under 'ner'.
# NOTE(review): join_tokens comes from utils; it presumably rebuilds readable
# text from the token list -- confirm against utils.
print(join_tokens(data[110771]['tokenized_text']))
print(data[110771]['ner'])

Verfahrensbeteiligte Karin Marti, Beschwerdeführer, gegen Familiengericht Muri, Seetalstrasse 8, 5630 Muri. Gegenstand Erweiterung einer Beistandschaft, Beschwerde gegen den Entscheid des Obergerichts des Kantons Aargau, Kammer für Kindes - und Erwachsenenschutz, vom 23. Juni 2022 ( XBE. 2022. 16 ). Sachverhalt: Die Vorgeschichte ist dem Bundesgericht aus einer Vielzahl von Verfahren bekannt. Der Beschwerdeführer leidet an gutachterlich festgestellten Wahnvorstellungen und befindet sich zur Zeit in Untersuchungshaft. Seit dem 2. Juni 2021 besteht eine Vertretungsbeistandschaft mit Vermögensverwaltung mit teilweisem Entzug der Handlungsfähigkeit. Am 29. Oktober 2021 wandte er sich an das Familiengericht Muri und beantragte die Sistierung der Wohnungsmiete für die nächsten sechs Monate, damit er im Notfall seine Ex-Frau finanziell unterstützen könne. Mit Eingabe vom 3. November 2021 beantragte die Beiständin eine Ausweitung der Beistandschaft. Mit Entscheid vom 16. Februar 2022 weitete d

In [17]:
# Token list of every document, and the number of tokens in each of them.
tokenized_texts = [sample['tokenized_text'] for sample in data]
text_lens = list(map(len, tokenized_texts))

In [18]:
# Summary statistics of the document lengths (counted in tokens).
# The lengths are sorted once and the sorted list is reused for every
# order-based statistic.
lens_ascending = sorted(text_lens)
n_texts = len(tokenized_texts)
print('Number of texts:', n_texts)
print('Mean text length:', sum(text_lens) / n_texts)
print('Max text length:', lens_ascending[-1])
print('Min text length:', lens_ascending[0])
# Element at position len // 2 of the sorted lengths (upper middle for even counts).
print('Median text length:', lens_ascending[len(text_lens) // 2])
print('Top 10 text lengths:', lens_ascending[::-1][:10])

Number of texts: 124089
Mean text length: 2435.367913352513
Max text length: 93790
Min text length: 8
Median text length: 1845
Top 10 text lengths: [93790, 88627, 78435, 71139, 58708, 58119, 57739, 56936, 44997, 44226]


In [None]:
#chunk without offset into CHUNKSIZE, keep the document identity 

def chunk_data(sample, chunk_size):
    """Split one annotated document into consecutive, non-overlapping chunks.

    Parameters
    ----------
    sample : dict
        Must provide ``'tokenized_text'`` (a sequence of tokens) and ``'ner'``
        (an iterable of ``[start, end, label]`` entries whose token indices
        refer to the whole document). The end index is treated as inclusive:
        an entity is kept only if ``end`` is smaller than the exclusive end of
        the chunk that contains ``start``.
    chunk_size : int
        Maximum number of tokens per chunk; must be positive.

    Returns
    -------
    tuple
        ``(tokenized_texts, start_token_indices, end_token_indices, labels,
        contains_ner)``, five lists with one entry per chunk. The index lists
        hold positions relative to the start of their chunk, and
        ``contains_ner`` flags chunks with at least one entity.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.

    Notes
    -----
    Entities that cross a chunk boundary, or whose start lies outside every
    chunk, are dropped. Each entity is assigned directly to the chunk that
    contains its start, so the work is linear in chunks plus entities instead
    of rescanning every entity for every chunk.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    tokenized_text = sample['tokenized_text']
    ners = sample['ner']

    tokenized_texts = [tokenized_text[start:start + chunk_size]
                       for start in range(0, len(tokenized_text), chunk_size)]
    n_chunks = len(tokenized_texts)

    start_token_indices = [[] for _ in range(n_chunks)]
    end_token_indices = [[] for _ in range(n_chunks)]
    labels = [[] for _ in range(n_chunks)]

    # Iterating the entities in their original order keeps the per-chunk order
    # identical to a chunk-by-chunk scan of the entity list.
    for ner_label in ners:
        ner_start, ner_end = ner_label[0], ner_label[1]
        chunk_idx = int(ner_start // chunk_size)
        if not 0 <= chunk_idx < n_chunks:
            continue  # start lies outside every chunk
        offset = chunk_idx * chunk_size
        if ner_end >= offset + chunk_size:
            continue  # make sure the whole ner range fits into the chunk
        start_token_indices[chunk_idx].append(ner_start - offset)
        end_token_indices[chunk_idx].append(ner_end - offset)
        labels[chunk_idx].append(ner_label[2])

    contains_ner = [len(chunk_labels) > 0 for chunk_labels in labels]

    return tokenized_texts,start_token_indices,end_token_indices,labels,contains_ner

def chunk_dataset_into_df(data : list[dict], chunk_size : int) -> pd.DataFrame:
    """Chunk every document in ``data`` and gather the chunks in one DataFrame.

    Each document is passed to ``chunk_data`` with ``chunk_size``. The result
    has one row per chunk with the columns ``document_id`` (position of the
    source document in ``data``), ``tokenized_text``, ``start_token_indices``,
    ``end_token_indices``, ``labels`` and ``contains_ner``.

    A progress line is printed for every document and overwritten in place.
    """
    columns = {
        "document_id": [],
        "tokenized_text": [],
        "start_token_indices": [],
        "end_token_indices": [],
        "labels": [],
        "contains_ner": [],
    }
    for doc_id, sample in enumerate(data):
        print(f"Processing sample {doc_id + 1} of {len(data)}", end='\r')
        texts, starts, ends, tags, flags = chunk_data(sample, chunk_size)
        # Every chunk remembers which document it was cut from.
        columns["document_id"].extend([doc_id] * len(texts))
        columns["tokenized_text"].extend(texts)
        columns["start_token_indices"].extend(starts)
        columns["end_token_indices"].extend(ends)
        columns["labels"].extend(tags)
        columns["contains_ner"].extend(flags)

    return pd.DataFrame(columns)

# Chunk the whole corpus into pieces of at most CHUNKSIZE tokens (one DataFrame row per chunk).
chunked_data = chunk_dataset_into_df(data, CHUNKSIZE)

Number of data points: 12408924089
Number of chunks: 1242717


In [70]:
print('Number of data points:', len(data))
print('Number of chunks:', len(chunked_data))


Number of data points: 124089
Number of chunks: 1242717


In [24]:
chunked_data.sample(10,random_state=42)

Unnamed: 0,document_id,tokenized_text,start_token_indices,end_token_indices,labels,contains_ner
611381,67938,"[Partecipanti, al, procedimento, Elisabeth, Ra...","[3, 134, 209, 83, 93, 89]","[4, 135, 210, 84, 94, 89]","[person, person, person, person, person, locat...",True
155556,18721,"[Oktober, 2006, ein, Panvertebralsyndrom, bei,...","[20, 168, 70]","[21, 168, 72]","[person, location, location]",True
222753,26330,"[Jahren, nicht, mehr, tragbar, war, und, -, na...",[115],[116],[person],True
99499,12384,"[1015, ;, PETER, HÄNNI, ,, Planungs, -, Bau, -...",[],[],[],False
275143,32021,"[Sinne, von, Art, ., 98, BGG, der, Beschwerde,...",[],[],[],False
710148,77708,"[père, et, la, mère, bénéficiant, d, ', un, li...","[215, 98]","[216, 99]","[person, person]",True
760664,82462,"[., Aucun, fait, nouveau, ni, preuve, nouvelle...",[],[],[],False
24406,3515,"[im, Sinne, von, Art, ., 16, MWSTV, nur, dann,...",[],[],[],False
511792,57662,"[von, weiteren, Eingaben, und, Verfahren, in, ...",[],[],[],False
940525,99083,"[Verfahrensbeteiligte, Tonie, Hochstrasser, ,,...","[1, 68, 129]","[2, 69, 130]","[person, person, person]",True


In [26]:
#save the chunked data
chunked_data.to_parquet(f"anon_data/chunked_ner_data_2.parquet",engine='pyarrow')

In [12]:
# Count chunks that contain at least one entity (non-empty) versus chunks with
# none (empty), and report how many empty chunks there are per non-empty one.
n_nempty = chunked_data['contains_ner'].sum()
n_empty = len(chunked_data)-n_nempty
print(f"number of non-empty chunks : {n_nempty}")
print(f"number of emtpy chunks : {n_empty}")
print(f"ratio empty to non-empty : {n_empty/n_nempty}")


number of non-empty chunks : 434666
number of emtpy chunks : 808051
ratio empty to non-empty : 1.859015888061178


## Document-based split into train, validation, and test sets

In [5]:
chunked_data = pd.read_parquet(f"anon_data/chunked_ner_data_2.parquet",engine='pyarrow')

In [13]:
chunked_data.sample(10,random_state=42)

Unnamed: 0,document_id,tokenized_text,start_token_indices,end_token_indices,labels,contains_ner
611381,67938,"[Partecipanti, al, procedimento, Elisabeth, Ra...","[3, 134, 209, 83, 93, 89]","[4, 135, 210, 84, 94, 89]","[person, person, person, person, person, locat...",True
155556,18721,"[Oktober, 2006, ein, Panvertebralsyndrom, bei,...","[20, 168, 70]","[21, 168, 72]","[person, location, location]",True
222753,26330,"[Jahren, nicht, mehr, tragbar, war, und, -, na...",[115],[116],[person],True
99499,12384,"[1015, ;, PETER, HÄNNI, ,, Planungs, -, Bau, -...",[],[],[],False
275143,32021,"[Sinne, von, Art, ., 98, BGG, der, Beschwerde,...",[],[],[],False
710148,77708,"[père, et, la, mère, bénéficiant, d, ', un, li...","[215, 98]","[216, 99]","[person, person]",True
760664,82462,"[., Aucun, fait, nouveau, ni, preuve, nouvelle...",[],[],[],False
24406,3515,"[im, Sinne, von, Art, ., 16, MWSTV, nur, dann,...",[],[],[],False
511792,57662,"[von, weiteren, Eingaben, und, Verfahren, in, ...",[],[],[],False
940525,99083,"[Verfahrensbeteiligte, Tonie, Hochstrasser, ,,...","[1, 68, 129]","[2, 69, 130]","[person, person, person]",True


In [14]:
import random
import numpy as np

# Document ids were assigned with enumerate() while chunking, so they are
# expected to run from 0 up to the maximum id.
max_doc_id = chunked_data["document_id"].max()
print(f"max document id : {max_doc_id}")
n_docs = max_doc_id + 1
print(f"number of documents : {n_docs}")

# The split boundaries below are derived from _ptrain and _pval only; the test
# split receives whatever remains. Check that the configured fractions are
# consistent so a mistyped _ptest cannot be silently ignored.
if abs(_ptrain + _pval + _ptest - 1.0) > 1e-9:
    raise ValueError(
        f"split fractions must sum to 1, got {_ptrain} + {_pval} + {_ptest}"
    )

#split doc_ids into _ptrain,_pval,_ptest split
# Splitting by document id (not by chunk) keeps all chunks of a document in
# the same split.

shuffled_ids = list(range(n_docs))
random.seed(42)  # fixed seed so the split is reproducible
random.shuffle(shuffled_ids)

train_end = int(n_docs*_ptrain)
val_end = int(n_docs*(_ptrain+_pval))
ids_train, ids_val, ids_test = np.split(shuffled_ids,[train_end,val_end])

print(f"number of training docs : {len(ids_train)}")
print(f"number of validation docs : {len(ids_val)}")
print(f"number of test docs : {len(ids_test)}")

max document id : 124088
number of documents : 124089
number of training docs : 86862
number of validation docs : 18613
number of test docs : 18614


In [None]:
#save training data
chunked_data[chunked_data['document_id'].isin(ids_train)].reset_index(drop=True).to_parquet(f"anon_data/chunked_ner_data_train.parquet")

In [20]:
#save validation data
chunked_data[chunked_data['document_id'].isin(ids_val)].reset_index(drop=True).to_parquet(f"anon_data/chunked_ner_data_val.parquet")

In [21]:
#save test data
chunked_data[chunked_data['document_id'].isin(ids_test)].reset_index(drop=True).to_parquet(f"anon_data/chunked_ner_data_test.parquet")

## Split each set into empty and non-empty samples

In [105]:
def df_to_json_non_empty(data: pd.DataFrame) -> list[dict]:
    """Turn chunk rows into JSON-serialisable records with their entity spans.

    Reads the columns ``tokenized_text``, ``start_token_indices``,
    ``end_token_indices`` and ``labels``. Each row becomes
    ``{"tokenized_text": [...], "ner": [[start, end, label], ...]}`` where the
    indices are converted to ``int`` and the label to ``str``, so the result
    can be passed to ``json.dump``.
    """
    records = []
    rows = zip(data['tokenized_text'],
               data["start_token_indices"],
               data["end_token_indices"],
               data["labels"])
    for tokens, starts, ends, tags in rows:
        spans = [[int(first), int(last), str(tag)]
                 for first, last, tag in zip(starts, ends, tags)]
        records.append({"tokenized_text" : list(tokens),
                        "ner" : spans})
    return records

    

In [106]:
def df_to_json_empty(data: pd.DataFrame,labels) -> list[dict]:
    """Turn entity-free chunk rows into JSON-serialisable records.

    Only the ``tokenized_text`` column is read. Every record gets an empty
    ``"ner"`` list and its own copy of ``labels`` under the key ``"label"``.
    """
    records = []
    for tokens in data['tokenized_text']:
        record = {"tokenized_text" : list(tokens),
                  "ner" : [],
                  "label" : list(labels)} #see training with empty data : https://github.com/urchade/GLiNER/issues/139
        records.append(record)
    return records


In [108]:
def df_to_json(data,split_empty = True,labels = ['person','organization','location','law','citation']):
    """Convert a chunk DataFrame into JSON-serialisable training records.

    Parameters
    ----------
    data : pd.DataFrame
        Chunk rows; must contain the boolean column ``contains_ner`` plus the
        columns read by ``df_to_json_empty`` / ``df_to_json_non_empty``.
    split_empty : bool
        If true (default), return the entity-free and the entity-bearing
        chunks as two separate lists. If false, return one list covering all
        rows in their original order.
    labels : list of str
        Entity types attached to every entity-free record when splitting. The
        default list is only read, never mutated.

    Returns
    -------
    tuple[list[dict], list[dict]] or list[dict]
        ``(empty_records, non_empty_records)`` when ``split_empty`` is true,
        otherwise a single list of records.
    """
    if split_empty:
        has_ner = data['contains_ner']
        return df_to_json_empty(data[~has_ner],labels=labels), df_to_json_non_empty(data[has_ner])

    # Without this branch the function fell off the end and returned None for
    # split_empty=False. In the unsplit form every row is converted the same
    # way, so entity-free chunks get an empty "ner" list and no "label" key.
    return df_to_json_non_empty(data)

### Training set

In [109]:
chunked_train = pd.read_parquet(f"anon_data/chunked_ner_data_train.parquet")

In [111]:
chunked_train.head()

Unnamed: 0,document_id,tokenized_text,start_token_indices,end_token_indices,labels,contains_ner
0,0,"[Parteien, Maëlle, Meyer, ,, Beschwerdeführer,...","[1, 69, 131, 175]","[2, 70, 132, 176]","[person, person, person, person]",True
1,0,"[dem, Antrag, des, Untersuchungsrichters, -, d...","[53, 224, 249]","[54, 225, 250]","[person, person, person]",True
2,0,"[Verteidigung, und, hält, in, seiner, Replik, ...",[],[],[],False
3,0,"[Dazu, ist, er, befugt, ., Die, Nichtigkeit, e...",[],[],[],False
4,0,"[schriftlicher, Einwilligung, der, inhaftierte...",[],[],[],False


In [112]:
n_train = len(chunked_train)

In [113]:
train_json_empty,train_json_non_empty = df_to_json(chunked_train)

In [114]:
# Sanity check: each JSON list must contain as many records as there are
# matching rows in the training frame (the printed numbers should agree in pairs).
print(len(chunked_train[chunked_train['contains_ner']]))
print(len(train_json_non_empty))
print(len(chunked_train[~chunked_train['contains_ner']]))
print(len(train_json_empty))

# Together the two lists must cover every training chunk (expected: True).
print(n_train == len(train_json_empty) + len(train_json_non_empty))

303458
303458
564961
564961
True


In [115]:
#Save data

with open(f"anon_data/train/train_non_empty.json",'w') as f:
    json.dump(train_json_non_empty,f)


In [116]:
#Save data
with open(f"anon_data/train/train_empty.json",'w') as f:
    json.dump(train_json_empty,f)

In [117]:
#clear memory
del chunked_train


In [118]:
del train_json_non_empty,train_json_empty

### Validation set

In [119]:
chunked_val = pd.read_parquet(f"anon_data/chunked_ner_data_val.parquet")

In [120]:
val_json_empty,val_json_non_empty = df_to_json(chunked_val)

In [121]:
#Save data
with open(f"anon_data/train/validation_empty.json",'w') as f:
    json.dump(val_json_empty,f)

#Save data
with open(f"anon_data/train/validation_non_empty.json",'w') as f:
    json.dump(val_json_non_empty,f)

In [122]:
#release memory
del chunked_val,val_json_empty,val_json_non_empty

### Test set

In [123]:
chunked_test = pd.read_parquet(f"anon_data/chunked_ner_data_test.parquet")

In [124]:
test_json_empty,test_json_non_empty = df_to_json(chunked_test)

In [125]:
#Save data
with open(f"anon_data/test/test_empty.json",'w') as f:
    json.dump(test_json_empty,f)

#Save data
with open(f"anon_data/test/test_non_empty.json",'w') as f:
    json.dump(test_json_non_empty,f)