In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Processing Label Studio Exports for Spacy
This notebook processes exports from Label Studio for use in Spacy.

In [1]:
import os
from pprint import pprint
from pathlib import Path

from processor import Processor

# Set the root directory of the project
ROOT_DIR = os.path.abspath(
    os.path.join(os.path.dirname("model_trainig.ipynb"), os.pardir)
)  # This file is the root of the project
DATA_PATH = os.path.join(ROOT_DIR, "data")

# Determine which assignment component to run
COMPONENT: int = 1 # 1 or 2

# Define all general paths
NER_MODEL: Path = os.path.join(ROOT_DIR, 'model', 'ner_model')
REL_MODEL: Path = os.path.join(ROOT_DIR, 'model', 'rel_model')

processor = Processor(ROOT_DIR)

In [32]:
# Define all component specific paths
if COMPONENT == 1:
    ANNOTATIONS: Path = 'final_assignment_1.json'
    GROUND_TRUTHS: Path = 'label_studio_ground_truth_task1.json'
    NER_DEV: Path = os.path.join(NER_MODEL, "assets", "dev_1.json")
    NER_TRAIN: Path = os.path.join(NER_MODEL, "assets", "train_1.json")
    NER_DEV_CORP: Path = os.path.join(NER_MODEL, "corpus", "dev_1.spacy")
    NER_TRAIN_CORP: Path = os.path.join(NER_MODEL, "corpus", "train_1.spacy")
    NER_OUTPUT: Path = os.path.join(NER_MODEL, "training_1")
    REL_ASSETS_TRAIN: Path = os.path.join(REL_MODEL, "assets", "annotations_1_train.jsonl")
    REL_ASSETS_DEV: Path = os.path.join(REL_MODEL, "assets", "annotations_1_dev.jsonl")
    REL_OUTPUT: Path = os.path.join(REL_MODEL, "training_1")
elif COMPONENT == 2:
    ANNOTATIONS: Path = ''
    GROUND_TRUTHS: Path = ''
    NER_DEV: Path = os.path.join(NER_MODEL, "assets", "dev_2.json")
    NER_TRAIN: Path = os.path.join(NER_MODEL, "assets", "train_2.json")
    NER_DEV_CORP: Path = os.path.join(NER_MODEL, "corpus", "dev_2.spacy")
    NER_TRAIN_CORP: Path = os.path.join(NER_MODEL, "corpus", "train_2.spacy")
    NER_OUTPUT: Path = os.path.join(NER_MODEL, "training_2")
    REL_ASSETS_TRAIN: Path = os.path.join(REL_MODEL, "assets", "annotations_2_train.jsonl")
    REL_ASSETS_DEV: Path = os.path.join(REL_MODEL, "assets", "annotations_2_dev.jsonl")
    REL_OUTPUT: Path = os.path.join(REL_MODEL, "training_2")
else:
    raise ValueError("COMPONENT must be 1 or 2")

## 1. Loading the Data
### 1.1. Loading JSON export from Label Studio
The annotations need to be loaded into the notebook. This is done via the preprocessor Python class. The training_data_export variable will store the item if this item is not used in the ground_truths. Additionally the ground_truth file is read and stored in the ground_truths variable.

In [33]:
export_data = processor.loadFile(ANNOTATIONS)

# Filter out annotations for which a ground truth exists (drop other annotations for this article as well)
training_data_export = [
    item
    for item in export_data
    if all(annotation["ground_truth"] is False for annotation in item["annotations"])
]
ground_truth_export = processor.loadFile(GROUND_TRUTHS)

#### 1.1.1 Check the training_data_export
The exported data constists of all the annotations from a specific text of all the users. The first annotated text in the exported data file looks something like the following:

In [34]:
print("Length of training data: ", len(training_data_export))
pprint(training_data_export[0])

Length of training data:  102
{'agreement': 100.0,
 'annotations': [{'completed_by': {'email': 'n.p.g.t.v.beuningen@student.tue.nl',
                                   'first_name': '',
                                   'id': 12634,
                                   'last_name': ''},
                  'created_at': '2023-10-12T19:16:05.761058Z',
                  'draft_created_at': '2023-10-12T19:06:11.353924Z',
                  'ground_truth': False,
                  'id': 23190706,
                  'import_id': None,
                  'last_action': 'submitted',
                  'last_created_by': 12634,
                  'lead_time': 611.387,
                  'parent_annotation': None,
                  'parent_prediction': None,
                  'prediction': {},
                  'project': 41784,
                  'result': [{'from_name': 'label',
                              'id': 'fLVAxBL9tN',
                              'origin': 'manual',
                         

### 1.2 Converting to Spacy training format:
To provide custom labels to Spacy, we need to convert the data to the following format:

```python
training_data = [
  ("Tokyo Tower is 333m tall.", [(0, 11, "BUILDING"), (20, 23, "HEIGHT")]),
]
```

The `process_export` function from the preprocessor can fix this for us, it retrieves the training data in the given format, in combination with the labels and their relationship

In [35]:
training_data, training_relations = processor.process_export_sentences(training_data_export)
validation_data, validation_relations = processor.process_export_sentences(ground_truth_export, ground_truth = True)

#### 1.2.1 Checking results
Now it is time to check what the results are from the process_export_sentences. The training data and training_relations will look differently and both are shown below. For all examples only the first text is used. Specifically only the first entity is shown for the entities that will be used to train the NER model. For the REL model one whole text is shown.

In [36]:
print("Training data info item 1 \ntext:")
print(training_data[0][0])
print("Labels:")
print(*training_data[0][1]["entities"], sep = "\n")

print("\n Validation data info item 1 \ntext:")
print(validation_data[0][0])
print("Labels:")
print(*validation_data[0][1]["entities"], sep = "\n")

Training data info item 1 
text:
Engelsberg Ironworks () is an ironworks in "ngelsberg", a village in Fagersta Municipality in Vstmanland, Sweden.
Labels:
[0, 20, 'landmark_name']

 Validation data info item 1 
text:
 Ephesus (; ; ; may ultimately derive from ) was a city in Ancient Greece on the coast of Ionia, southwest of present-day Seluk in zmir Province, Turkey.
Labels:
[1, 8, 'landmark_name']


In [37]:
pprint(training_relations[0])

{'answer': 'accept',
 'meta': {'source': 'Engelsberg Ironworks'},
 'relations': [{'child': 6,
                'child_span': {'end': 39,
                               'label': 'type',
                               'start': 30,
                               'token_end': 6,
                               'token_start': 6},
                'head': 1,
                'head_span': {'end': 20,
                              'label': 'landmark_name',
                              'start': 0,
                              'token_end': 1,
                              'token_start': 0},
                'label': 'org:is_type'},
               {'child': 9,
                'child_span': {'end': 53,
                               'label': 'location',
                               'start': 44,
                               'token_end': 9,
                               'token_start': 9},
                'head': 1,
                'head_span': {'end': 20,
                              'label': 'la

### 1.3 Preparing the data for Spacy
The data is now in the correct format, so it can be processed and saved as Spacy training file using the `preprocess_spacy` function from the `Preprocessor` class.

In [38]:
processor.preprocess_json(training_data = training_data, validation_data = validation_data, train_path = NER_TRAIN, dev_path = NER_DEV)

In [39]:
processor.preprocess_json_rel(relational_annotations_train = training_relations, relational_annotations_val = validation_relations, save_path_train=REL_ASSETS_TRAIN, save_path_dev=REL_ASSETS_DEV)

# 2 Training the Spacy Model on on the training file

For training Spacy is used. This is a good way to implement our own data into a specific model. Spacy works out of the box, but not with our specific labels and/or relations. The default behaviour of spacy looks like the following:

### 2.1 Spacy before training with custom labels

In [40]:
import spacy
from spacy import displacy

example_text = training_data[0][0]

nlp = spacy.load("en_core_web_sm")
doc = nlp(example_text)

displacy.render(doc, style="ent")

### 2.2 Training a spacy NER model
Training of a spacy model usually is done via the command line. This is the reason for the following no so understandable lines of code. There are a few steps in the training process:
1. The spacy model needs a config file and all necessary files are in the spacy folder
2. The model needs training data, which was exported in this file above to the spacy folder
3. After training the model is evaluated and the results are printed for training and evaluation

In [41]:
from ner_model.scripts.convert import convert as ner_convert

ner_convert("en", NER_TRAIN, NER_TRAIN_CORP)
ner_convert("en", NER_DEV, NER_DEV_CORP)

Start converting NER data...



'It is listed as a UNESCO World Heritage Site since 1993.Name.Engelsberg Ironworks is named after Englika.'


'In total, the site consists of seven component parts  Kintrishi-Mtirala and Ispani in Adjara, Grigoleti and Imnati in Guria, and Pitshora, Nabada, and Churia in Samegrelo-Zemo Svaneti.'


'They represent the work of the Pskov School that drew from the Byzantine and Novgorod traditions, fused them with the local vernacular tradition, and adjusted the architecture to the use of local resources.'


'It was established in 1973 as the Royal Chitwan National Park and was granted the status of a World Heritage Site in 1984.'


'The coherent protected area of represents the "Tiger Conservation Unit (TCU) Chitwan-Parsa-Valmiki", which covers a huge block of alluvial grasslands and subtropical moist deciduous forests.'


"The Capital Cities and Tombs of the Ancient Koguryo Kingdom is an UNESCO World Heritage Site which includes a number of archaeological sites currently in Ji'an, Jilin

Finished convertin NER data
Start converting NER data...
Finished convertin NER data



'The site includes Durham Castle, Durham Cathedral, Durham University, Palace Green and University College, Durham.'


'It is named for the hundreds of paintings of hands stenciled, in multiple collages, on the rock walls.'


'The art was created in several waves between 7,300 BC and 700 AD, during the Archaic period of pre-Columbian South America.'


'The age of the paintings was calculated from the remains of bone pipes used for spraying the paint on the wall of the cave to create the artwork, radiocarbon dating of the artwork, and stratigraphic dating.'


'The park was named a World Heritage Site by UNESCO.History.Canaima National Park was established on 12 June 1962.As early as 1990, the countries that participate in the Amazonian Cooperation Treaty had recommended expanding the Canaima National Park southward to connect it with Monte Roraima National Park in Brazil, with coordinated management of tourism, research and conservation.'


'The cathedral is the seat of the Bishop of A

In [42]:
from spacy.cli.train import train

train("ner_model/configs/config.cfg", output_path=NER_OUTPUT, overrides={"paths.train": NER_TRAIN_CORP, "paths.dev": NER_DEV_CORP})

ℹ Saving to output directory: ner_model\training_1
ℹ Using CPU
[1m




✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer',
'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  LEMMA_ACC  ENTS_F  ENTS_P  ENTS_R  SPEED   SCORE 
---  ------  ------------  -----------  -----------  --------  ---------  ------  ------  ------  ------  ------
  0       0          0.00         0.00         0.00     23.18       0.00    0.00    0.00    0.00  3191.80    0.00
  0     100          0.00         0.00         0.00    595.55       0.00    0.00    0.00    0.00  5440.95    0.00
  1     200          0.00         0.00         0.00    799.97       0.00    0.00    0.00    0.00  5732.81    0.00
  2     300          0.00         0.00         0.00    631.50       0.00   26.80   54.17   17.81  5791.21    0.06
  3     400          0.00         0.00         0.00    588.15       0.00   33.66   60.71   23.29  5875.18    0.08
  4     500          0.00         0.00         0.00    617.02   

In [43]:
from spacy.cli.evaluate import evaluate

evaluate(os.path.join(NER_OUTPUT, "model-best"), NER_DEV_CORP, output=os.path.join(NER_OUTPUT, "metrics.json"))

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'tag_acc': None,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'lemma_acc': None,
 'ents_p': 0.5434782608695652,
 'ents_r': 0.3424657534246575,
 'ents_f': 0.4201680672268907,
 'ents_per_type': {'landmark_name': {'p': 0.6785714285714286,
   'r': 0.76,
   'f': 0.7169811320754718},
  'date': {'p': 0.6, 'r': 0.3333333333333333, 'f': 0.42857142857142855},
  'number': {'p': 0.3333333333333333,
   'r': 0.3333333333333333,
   'f': 0.3333333333333333},
  'people': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'component': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'location': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'type': {'p': 0.25, 'r': 0.14285714285714285, 'f': 0.18181818181818182},
  'animal': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'condition': {'p

### 2.2 Visualizing the results
The model is now trained. This model can be loaded into spacy and an example text can be visualized with the NER.

In [44]:
options = {
    "colors": {"location": "lightyellow",
               "person_name": "lightgreen",
               "landmark_name": "lightred",
               "condition": "lightblue"}
}

# Now test teh newly created spacy model on a sample text and visualize it using spacy
nlp = spacy.load(os.path.join(NER_OUTPUT, "model-best"))

example_text = str([text[0] for i, text in enumerate(training_data[:20]) if text != ""])
doc = nlp(example_text)

displacy.render(doc, style="ent", jupyter=True, options=options)

# Show the tokens, their labels and their entities
for ent in doc.ents:
    print(ent.text, ent.label_)

Emas National Park landmark_name
park type
17 number


### 2.3 Training the Spacy Custom REL component

In [45]:
import subprocess
if spacy.prefer_gpu():
    output = subprocess.run(f"spacy project run all_{COMPONENT}_gpu", cwd="rel_model", capture_output=True)
else:
    output = subprocess.run(f"spacy project run all_{COMPONENT}", cwd="rel_model", capture_output=True)

print(output.stdout.decode("utf-8"))

## 2.4 Visualizing the results
 

In [None]:
# make the factory work
from rel_model.scripts.rel_pipe import make_relation_extractor

# make the config work
from rel_model.scripts.rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors

# Now test teh newly created spacy model on a sample text and visualize it using spacy
nlp2 = spacy.load(os.path.join(REL_OUTPUT, "model-best"))

doc = nlp2(doc.text) # doc is the output from the NER model (nlp)

displacy.render(doc, style="ent", jupyter=True, options=options)

# Show the tokens, their labels and their entities
for rel in doc._.rel:
    print(rel)

ℹ Could not determine any instances in doc - returning doc as is.


