# Load

In [1]:
from gatenlp import Document
from gatenlp.corpora import ListCorpus
import json
import os
import sys
from pathlib import Path
import json


In [2]:
# Locate the training split relative to the directory the notebook runs in.
script_dir = os.getcwd()
input_dir = f"{script_dir}/input/annotated-json/train"


In [3]:
def load_json_corpus(folder_path: "str | Path") -> list:
    """Parse every file in ``folder_path`` as JSON and return the results.

    Args:
        folder_path: Directory to read. May be a ``str`` or a path-like object.

    Returns:
        A list holding the parsed content of each file that loaded
        successfully, ordered by file name.

    Entries that cannot be opened or parsed are reported on stdout and
    skipped, so one bad file does not abort the whole load.
    """
    corpus = []

    # os.listdir() makes no ordering guarantee; sorting keeps the corpus order
    # (and therefore anything derived from it) reproducible across machines.
    for file_name in sorted(os.listdir(folder_path)):
        # os.path.join uses the platform separator and accepts str and Path
        # alike, unlike manual concatenation with a hard-coded backslash.
        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: unreadable entry (e.g. a sub-directory);
            # ValueError: invalid JSON or invalid UTF-8.
            print(f"Error loading {file_name}: {e}")
            continue

        # Add the document to the corpus
        corpus.append(data)
        print(f"Loaded {file_name} into corpus")

    print(f"Loaded {len(corpus)} documents into the corpus.")
    return corpus

In [4]:
# Training split: one parsed JSON document per file in input_dir.
train = load_json_corpus(folder_path=input_dir)

Loaded CASE OF DOKTOROV v. BULGARIA.json into corpus
Loaded CASE OF EGILL EINARSSON v. ICELAND (No. 2).json into corpus
Loaded CASE OF HOINESS v. NORWAY.json into corpus
Loaded CASE OF KOSAITE - CYPIENE AND OTHERS v. LITHUANIA.json into corpus
Loaded CASE OF LOZOVYYE v. RUSSIA.json into corpus
Loaded CASE OF M.T. v. UKRAINE.json into corpus
Loaded CASE OF MOSKALEV v. RUSSIA.json into corpus
Loaded CASE OF MURUZHEVA v. RUSSIA.json into corpus
Loaded CASE OF NODI v. HUNGARY.json into corpus
Loaded CASE OF O.C.I. AND OTHERS v. ROMANIA.json into corpus
Loaded CASE OF OTGON v. THE REPUBLIC OF MOLDOVA.json into corpus
Loaded CASE OF PAKHTUSOV v. RUSSIA.json into corpus
Loaded CASE OF PANYUSHKINY v. RUSSIA.json into corpus
Loaded CASE OF RESIN v. RUSSIA.json into corpus
Loaded CASE OF S.N. v. RUSSIA.json into corpus
Loaded CASE OF S.V. v. ITALY.json into corpus
Loaded CASE OF SHVIDKIYE v. RUSSIA.json into corpus
Loaded CASE OF SIDOROVA v. RUSSIA.json into corpus
Loaded CASE OF SOLCAN v. ROMAN

In [5]:
# Development split, loaded the same way as the training split.
dev_dir = f"{script_dir}/input/annotated-json/dev"
dev = load_json_corpus(folder_path=dev_dir)

Loaded CASE OF ALTAY v. TURKEY (No. 2).json into corpus
Loaded CASE OF BELYAYEV AND OTHERS v. UKRAINE.json into corpus
Loaded CASE OF BIGUN v. UKRAINE.json into corpus
Loaded 3 documents into the corpus.


In [6]:
# Held-out test split, loaded the same way as the training split.
test_dir = f"{script_dir}/input/annotated-json/test"
test = load_json_corpus(folder_path=test_dir)

Loaded CASE OF CABUCAK v. GERMANY.json into corpus
Loaded CASE OF CAN v. TURKEY.json into corpus
Loaded CASE OF CRISTIAN CATALIN UNGUREANU v. ROMANIA.json into corpus
Loaded 3 documents into the corpus.


In [7]:
# Inspect the dev split: a list of dicts parsed from the JSON files.
# NOTE(review): this displays the entire corpus; consider showing a slice
# instead to keep the saved output small.
dev

[{'tokenized_text': ['ECHR',
   '\n',
   '24',
   '\t',
   'ALTAY',
   'v.',
   'TURKEY',
   '(',
   'No',
   '.',
   '2',
   ')',
   'JUDGMENT',
   '\n\t',
   'ALTAY',
   'v.',
   'TURKEY',
   '(',
   'No',
   '.',
   '2',
   ')',
   'JUDGMENT',
   '\t',
   '1',
   '\n',
   'SECOND',
   'SECTION',
   '\n',
   'CASE',
   'OF',
   'ALTAY',
   'v.',
   'TURKEY',
   '(',
   'No.2',
   ')',
   '\n',
   '(',
   'Application',
   'no',
   '.',
   '11236/09',
   ')',
   '\n',
   'JUDGMENT',
   '\n',
   'STRASBOURG',
   '\n',
   '9',
   'April',
   '2019',
   '\n',
   'FINAL',
   '\n',
   '09/07/2019',
   '\n',
   'This',
   'judgment',
   'has',
   'become',
   'final',
   'under',
   'Article',
   '44',
   '§',
   '2',
   'of',
   'the',
   'Convention',
   '.',
   'It',
   'may',
   'be',
   'subject',
   'to',
   'editorial',
   'revision',
   '.',
   '\n',
   'In',
   'the',
   'case',
   'of',
   'Altay',
   'v.',
   'Turkey',
   '(',
   'no',
   '.',
   '2',
   ')',
   ',',
   '\n',
   

# Training

In [8]:
import torch
from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding, DataCollator
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset

In [9]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Base checkpoint to fine-tune (alternative name left by the author: gliner_large-v2.1).
model = GLiNER.from_pretrained(
    "urchade/gliner_small",
    local_dir_use_symlinks=False,
)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



In [10]:
# Collator built from the model's own config and data processor; it is handed to the Trainer.
data_collator = DataCollator(
    model.config,
    data_processor=model.data_processor,
    prepare_labels=True,
)

In [11]:
# Move the model onto the selected device before training.
model.to(device)
# Plain marker so the cell output shows the transfer finished.
print("done")

done


In [None]:
# Derive the number of epochs from a target number of optimisation steps.
num_steps = 200
batch_size = 8
data_size = len(train)
# Clamp to 1: with fewer than `batch_size` training documents the floor
# division yields 0, which would raise ZeroDivisionError on the next line.
num_batches = max(1, data_size // batch_size)
num_epochs = max(1, num_steps // num_batches)

# Hyperparameters for the fine-tuning run; checkpoints are written to "models".
training_args = TrainingArguments(
    output_dir="models",
    learning_rate=5e-6,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="linear", #cosine
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    eval_strategy="steps",
    save_steps = 100,
    save_total_limit=10,
    dataloader_num_workers = 0,
    use_cpu = False,
    report_to="none",
    )

# Train on the training split and evaluate on the dev split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=dev,
    tokenizer=model.data_processor.transformer_tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss


In [None]:
# Persist the fine-tuned weights, then reload them from disk so the
# evaluation below runs against exactly what was saved.
saved_model_dir = "gliner_Vienna_NER"
model.save_pretrained(saved_model_dir)

trained_model = GLiNER.from_pretrained(saved_model_dir, load_tokenizer=True)

# Test

In [None]:
# Entity types to evaluate.
# Author's note: for v2.1 use capital case for better performance.
labels = [
    "Event",
    "Event_when",
    "Event_what",
    "Event_who",
]


In [None]:
# Score the reloaded model on the test split, restricted to the entity types in `labels`.
evaluation_results = trained_model.evaluate(
    test, entity_types=labels
)

In [None]:
# Show whatever trained_model.evaluate() returned for the test split.
print(evaluation_results)

# OLD: legacy GATE XML loading and gold-standard JSON export

In [2]:
def loadCorpus(base_dir="input/annotated"):
    """Load every ``.xml`` file found under ``base_dir`` into a ListCorpus.

    Args:
        base_dir: Root directory to search recursively. Defaults to
            ``"input/annotated"``, the location the notebook originally
            hard-coded.

    Returns:
        A ``ListCorpus`` with one document per ``.xml`` file, each loaded
        with ``Document.load(..., fmt="gatexml")``.
    """
    # Create a new corpus with an empty list
    corpus = ListCorpus([])

    # Walk through the directory and load each XML file
    for root, dirs, files in os.walk(base_dir):
        # Sorting dirs in place fixes the order os.walk descends in; together
        # with sorted(files) this makes the corpus order deterministic.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".xml"):
                file_path = os.path.join(root, file)
                doc = Document.load(file_path, fmt="gatexml")
                # Add the document to the corpus
                corpus.append(doc)
                print(f"Loaded {file_path} into corpus")

    print("All documents loaded into the corpus.")
    return corpus

In [3]:
def _first_contained_text(doc, anns, start, end):
    """Return the text of the first annotation in ``anns`` lying fully inside [start, end], else ""."""
    for ann in anns:
        # Containment only: an annotation that merely overlaps the span is not matched.
        if ann.start >= start and ann.end <= end:
            return doc.text[ann.start:ann.end]
    return ""


def create_gold_standard_json(corpus=None, output_path="gold_standard_events.json"):
    """Export the "consensus" event annotations of a corpus to a JSON file.

    For every ``Event`` annotation, the first ``Event_who``, ``Event_what``
    and ``Event_when`` annotation fully contained in the event's span is
    recorded. A missing part is stored as an empty string.

    Args:
        corpus: Iterable of documents to export. When ``None`` (the default)
            the corpus is loaded with ``loadCorpus()``.
        output_path: File the JSON is written to. Defaults to
            ``"gold_standard_events.json"``.

    Returns:
        None. The result is written to ``output_path``.
    """
    if corpus is None:
        corpus = loadCorpus()

    results = []
    for doc in corpus:
        annotations = doc.annset("consensus")
        who_annotations = annotations.with_type("Event_who")
        what_annotations = annotations.with_type("Event_what")
        when_annotations = annotations.with_type("Event_when")

        events = []
        for event_ann in annotations.with_type("Event"):
            start, end = event_ann.start, event_ann.end
            events.append({
                "event": doc.text[start:end],
                "event_who": _first_contained_text(doc, who_annotations, start, end),
                "event_when": _first_contained_text(doc, when_annotations, start, end),
                "event_what": _first_contained_text(doc, what_annotations, start, end),
                # The event type is read from the Event annotation's own "type" feature.
                "event_type": "event_" + event_ann.features.get("type", "")
            })

        results.append({
            "Document": doc.features.get("gate.SourceURL"),
            "annotations": {
                "model_name": "gold_standard",
                "events": events
            }
        })

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

In [4]:
from gatenlp.visualization import CorpusViewer

# Load the GATE XML corpus and browse it document by document in gatenlp's viewer widget.
corpus = loadCorpus()
viewer = CorpusViewer(corpus)
viewer.show()

HBox(children=(Button(icon='arrow-left', layout=Layout(width='5em'), style=ButtonStyle()), IntSlider(value=0, …

In [5]:
# Write gold_standard_events.json; the function loads the corpus itself via loadCorpus().
create_gold_standard_json()

Loaded input/annotated\dev\CASE OF ALTAY v. TURKEY (No. 2).xml into corpus
Loaded input/annotated\dev\CASE OF BELYAYEV AND OTHERS v. UKRAINE.xml into corpus
Loaded input/annotated\dev\CASE OF BIGUN v. UKRAINE.xml into corpus
Loaded input/annotated\test\CASE OF CABUCAK v. GERMANY.xml into corpus
Loaded input/annotated\test\CASE OF CAN v. TURKEY.xml into corpus
Loaded input/annotated\test\CASE OF CRISTIAN CATALIN UNGUREANU v. ROMANIA.xml into corpus
Loaded input/annotated\train\CASE OF DOKTOROV v. BULGARIA.xml into corpus
Loaded input/annotated\train\CASE OF EGILL EINARSSON v. ICELAND (No. 2).xml into corpus
Loaded input/annotated\train\CASE OF HOINESS v. NORWAY.xml into corpus
Loaded input/annotated\train\CASE OF KOSAITE - CYPIENE AND OTHERS v. LITHUANIA.xml into corpus
Loaded input/annotated\train\CASE OF LOZOVYYE v. RUSSIA.xml into corpus
Loaded input/annotated\train\CASE OF M.T. v. UKRAINE.xml into corpus
Loaded input/annotated\train\CASE OF MOSKALEV v. RUSSIA.xml into corpus
Loaded 