In [131]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Processing Label Studio Exports for Spacy
This notebook processes exports from Label Studio for use in Spacy.

In [132]:
import preprocessor
import os
from pprint import pprint
from pathlib import Path

# Determine which assignment component to run
COMPONENT: int = 1 # 1 or 2

# Define all general paths
ROOT_DIR: Path = preprocessor.ROOT_DIR
DATA_PATH: Path = preprocessor.DATA_PATH
NER_MODEL: Path = os.path.join(ROOT_DIR, 'ner_model')
REL_MODEL: Path = os.path.join(ROOT_DIR, 'rel_model')

# Create preprocessor object
preprocessor = preprocessor.Preprocessor()

In [133]:
# Define all component specific paths
if COMPONENT == 1:
    ANNOTATIONS: Path = 'final_assignment_1.json'
    GROUND_TRUTHS: Path = 'label_studio_ground_truth_task1.json'
    NER_DEV: Path = os.path.join(NER_MODEL, "assets", "dev_1.json")
    NER_TRAIN: Path = os.path.join(NER_MODEL, "assets", "train_1.json")
    NER_DEV_CORP: Path = os.path.join(NER_MODEL, "corpus", "dev_1.spacy")
    NER_TRAIN_CORP: Path = os.path.join(NER_MODEL, "corpus", "train_1.spacy")
    NER_OUTPUT: Path = os.path.join(NER_MODEL, "training_1")
    REL_ASSETS: Path = os.path.join(REL_MODEL, "assets", "annotations_1.jsonl")
    REL_OUTPUT: Path = os.path.join(REL_MODEL, "training_1")
elif COMPONENT == 2:
    ANNOTATIONS: Path = ''
    GROUND_TRUTHS: Path = ''
    NER_DEV: Path = os.path.join(NER_MODEL, "assets", "dev_2.json")
    NER_TRAIN: Path = os.path.join(NER_MODEL, "assets", "train_2.json")
    NER_DEV_CORP: Path = os.path.join(NER_MODEL, "corpus", "dev_2.spacy")
    NER_TRAIN_CORP: Path = os.path.join(NER_MODEL, "corpus", "train_2.spacy")
    NER_OUTPUT: Path = os.path.join(NER_MODEL, "training_1")
    REL_ASSETS: Path = os.path.join(REL_MODEL, "assets", "annotations_2.jsonl")
    REL_OUTPUT: Path = os.path.join(REL_MODEL, "training_2")
else:
    raise ValueError("COMPONENT must be 1 or 2")

## 1. Loading the Data
### 1.1. Loading JSON export from Label Studio
The annotations need to be loaded into the notebook. This is done via the preprocessor Python class. The training_data_export variable will store the item if this item is not used in the ground_truths. Additionally the ground_truth file is read and stored in the ground_truths variable.

In [134]:
export_data = preprocessor.loadFile(ANNOTATIONS)

# Filter out annotations for which a ground truth exists (drop other annotations for this article as well)
training_data_export = [
    item
    for item in export_data
    if any(annotation["ground_truth"] is False for annotation in item["annotations"])
]
ground_truth_export = preprocessor.loadFile(GROUND_TRUTHS)

#### 1.1.1 Check the training_data_export
The exported data constists of all the annotations from a specific text of all the users. The first annotated text in the exported data file looks something like the following:

In [135]:
print("Length of training data: ", len(training_data_export))
pprint(training_data_export[0])

Length of training data:  122
{'agreement': 65.94881669230801,
 'annotations': [{'completed_by': {'email': 'f.a.ensink.op.kemma@student.tue.nl',
                                   'first_name': '',
                                   'id': 12716,
                                   'last_name': ''},
                  'created_at': '2023-09-29T14:37:21.934927Z',
                  'draft_created_at': '2023-09-29T14:33:09.743687Z',
                  'ground_truth': True,
                  'id': 22591087,
                  'import_id': None,
                  'last_action': 'updated',
                  'last_created_by': 12716,
                  'lead_time': 1477.8770000000002,
                  'parent_annotation': None,
                  'parent_prediction': None,
                  'prediction': {},
                  'project': 41784,
                  'result': [{'from_name': 'label',
                              'id': 'a-cF8klU4-',
                              'origin': 'manual',
     

### 1.2 Converting to Spacy training format:
To provide custom labels to Spacy, we need to convert the data to the following format:

```python
training_data = [
  ("Tokyo Tower is 333m tall.", [(0, 11, "BUILDING"), (20, 23, "HEIGHT")]),
]
```

The `process_export` function from the preprocessor can fix this for us, it retrieves the training data in the given format, in combination with the labels and their relationship

In [136]:
training_data, training_relations = preprocessor.process_export_sentences(training_data_export)
validation_data, validation_relations = preprocessor.process_export_sentences(ground_truth_export, ground_truth = True)

#### 1.2.1 Checking results
Now it is time to check what the results are from the process_export_sentences. The training data and training_relations will look differently and both are shown below. For all examples only the first text is used. Specifically only the first entity is shown for the entities that will be used to train the NER model. For the REL model one whole text is shown.

In [137]:
print("Training data info item 1 \ntext:")
print(training_data[0][0])
print("Labels:")
print(*training_data[0][1]["entities"], sep = "\n")

print("\n Validation data info item 1 \ntext:")
print(validation_data[0][0])
print("Labels:")
print(*validation_data[0][1]["entities"], sep = "\n")

Training data info item 1 
text:
 Ephesus (; ; ; may ultimately derive from ) was a city in Ancient Greece on the coast of Ionia, southwest of present-day Seluk in zmir Province, Turkey.
Labels:
[1, 9, 'landmark_name']

 Validation data info item 1 
text:
 Ephesus (; ; ; may ultimately derive from ) was a city in Ancient Greece on the coast of Ionia, southwest of present-day Seluk in zmir Province, Turkey.
Labels:
[1, 8, 'landmark_name']


In [138]:
pprint(training_relations[0])

{'answer': 'accept',
 'meta': {'source': 'Ephesus'},
 'relations': [{'child': 15,
                'child_span': {'end': 73,
                               'label': 'location',
                               'start': 59,
                               'token_end': 15,
                               'token_start': 14},
                'head': 1,
                'head_span': {'end': 8,
                              'label': 'landmark_name',
                              'start': 1,
                              'token_end': 1,
                              'token_start': 1},
                'label': 'org:located_in'},
               {'child': 27,
                'child_span': {'end': 127,
                               'label': 'location',
                               'start': 122,
                               'token_end': 27,
                               'token_start': 27},
                'head': 1,
                'head_span': {'end': 8,
                              'label': 'la

### 1.3 Preparing the data for Spacy
The data is now in the correct format, so it can be processed and saved as Spacy training file using the `preprocess_spacy` function from the `Preprocessor` class.

In [139]:
preprocessor.preprocess_json(training_data = training_data, validation_data = validation_data, train_path = NER_TRAIN, dev_path = NER_DEV)

In [140]:
preprocessor.preprocess_json_rel(relational_annotations = training_relations, save_path=REL_ASSETS)

# 2 Training the Spacy Model on on the training file

For training Spacy is used. This is a good way to implement our own data into a specific model. Spacy works out of the box, but not with our specific labels and/or relations. The default behaviour of spacy looks like the following:

### 2.1 Spacy before training with custom labels

In [141]:
import spacy
from spacy import displacy

example_text = training_data[0][0]

nlp = spacy.load("en_core_web_sm")
doc = nlp(example_text)

displacy.render(doc, style="ent")

### 2.2 Training a spacy NER model
Training of a spacy model usually is done via the command line. This is the reason for the following no so understandable lines of code. There are a few steps in the training process:
1. The spacy model needs a config file and all necessary files are in the spacy folder
2. The model needs training data, which was exported in this file above to the spacy folder
3. After training the model is evaluated and the results are printed for training and evaluation

In [142]:
from ner_model.scripts.convert import convert as ner_convert

ner_convert("en", NER_TRAIN, NER_TRAIN_CORP)
ner_convert("en", NER_DEV, NER_DEV_CORP)

Start converting NER data...



' Ephesus (; ; ; may ultimately derive from ) was a city in Ancient Greece on the coast of Ionia, southwest of present-day Seluk in zmir Province, Turkey.'

  span = doc.char_span(start, end, label=label)

'It is listed as a UNESCO World Heritage Site since 1993.Name.Engelsberg Ironworks is named after Englika.'

  span = doc.char_span(start, end, label=label)

'The site includes Durham Castle, Durham Cathedral, Durham University, Palace Green and University College, Durham.'

  span = doc.char_span(start, end, label=label)

'It is named for the hundreds of paintings of hands stenciled, in multiple collages, on the rock walls.'

  span = doc.char_span(start, end, label=label)

'The art was created in several waves between 7,300 BC and 700 AD, during the Archaic period of pre-Columbian South America.'

  span = doc.char_span(start, end, label=label)

'The age of the paintings was calculated from the remains of bone pipes used for spraying the paint on the wall of the cave to create the

Finished convertin NER data
Start converting NER data...
Finished convertin NER data



'The Monastery of Saint John of Rila, also known as Rila Monastery "Sveti Ivan Rilski" (), is the largest and most famous Eastern Orthodox monastery in Bulgaria.'

  span = doc.char_span(start, end, label=label)

'The Villa Romana del Casale (Sicilian: "Villa Rumana d Casali") is a large and elaborate Roman villa or palace located about 3km from the town of Piazza Armerina, Sicily.'

  span = doc.char_span(start, end, label=label)


In [143]:
from spacy.cli.train import train

train("ner_model/configs/config.cfg", output_path=NER_OUTPUT, overrides={"paths.train": NER_TRAIN_CORP, "paths.dev": NER_DEV_CORP})

ℹ Saving to output directory: ner_model\training_1
ℹ Using CPU
[1m




✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer',
'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  LEMMA_ACC  ENTS_F  ENTS_P  ENTS_R  SPEED   SCORE 
---  ------  ------------  -----------  -----------  --------  ---------  ------  ------  ------  ------  ------
  0       0          0.00         0.00         0.00     22.09       0.00    0.00    0.00    0.00  7263.95    0.00
  0     100          0.00         0.00         0.00    590.47       0.00    0.00    0.00    0.00  9682.47    0.00
  1     200          0.00         0.00         0.00    622.22       0.00    9.76   44.44    5.48  8527.25    0.02
  1     300          0.00         0.00         0.00    751.82       0.00   25.81   60.00   16.44  7249.81    0.06
  2     400          0.00         0.00         0.00    701.86       0.00   29.41   51.72   20.55  8907.69    0.07
  3     500          0.00         0.00         0.00    618.35   

In [144]:
from spacy.cli.evaluate import evaluate

evaluate(os.path.join(NER_OUTPUT, "model-best"), NER_DEV_CORP, output=os.path.join(NER_OUTPUT, "metrics.json"))

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'tag_acc': None,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'lemma_acc': None,
 'ents_p': 0.7090909090909091,
 'ents_r': 0.5342465753424658,
 'ents_f': 0.6093750000000001,
 'ents_per_type': {'landmark_name': {'p': 0.8148148148148148,
   'r': 0.88,
   'f': 0.8461538461538461},
  'condition': {'p': 0.5, 'r': 0.2, 'f': 0.28571428571428575},
  'date': {'p': 0.625, 'r': 0.5555555555555556, 'f': 0.5882352941176471},
  'number': {'p': 0.6666666666666666,
   'r': 0.6666666666666666,
   'f': 0.6666666666666666},
  'people': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'component': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'location': {'p': 1.0, 'r': 0.375, 'f': 0.5454545454545454},
  'type': {'p': 0.5454545454545454,
   'r': 0.4285714285

### 2.2 Visualizing the results
The model is now trained. This model can be loaded into spacy and an example text can be visualized with the NER.

In [145]:
options = {
    "colors": {"location": "lightyellow",
               "person_name": "lightgreen",
               "landmark_name": "lightred",
               "condition": "lightblue"}
}

# Now test teh newly created spacy model on a sample text and visualize it using spacy
nlp = spacy.load(os.path.join(NER_OUTPUT, "model-best"))

example_text = str([text[0] for i, text in enumerate(training_data[:20]) if text != ""])
doc = nlp(example_text)

displacy.render(doc, style="ent", jupyter=True, options=options)

# Show the tokens, their labels and their entities
for ent in doc.ents:
    print(ent.text, ent.label_)

built condition
17 number
ancient India location
ancient India location


### 2.3 Training the Spacy Custom REL component

In [146]:
import subprocess
if spacy.prefer_gpu():
    output = subprocess.run(f"spacy project run all_{COMPONENT}_gpu", cwd="rel_model", capture_output=True)
else:
    output = subprocess.run(f"spacy project run all_{COMPONENT}", cwd="rel_model", capture_output=True)

print(output.stdout.decode("utf-8"))

ℹ Running workflow 'all_1'
[1m
Running command: 'C:\Users\20182640\.virtualenvs\Text-Mining-xR8YyNgY\Scripts\python.exe' ./scripts/parse_data.py assets/annotations_1.jsonl data/train_1.spacy data/dev_1.spacy 1
for  Ephesus , skipped:  2832 found:  460  from true relations:  3292
for  Engelsberg Ironworks , skipped:  2845 found:  447  from true relations:  3292
for  Emas National Park , skipped:  3153 found:  139  from true relations:  3292
for  Ellora Caves , skipped:  2918 found:  374  from true relations:  3292
for  Elephanta Caves , skipped:  3201 found:  91  from true relations:  3292
for  East Rennell , skipped:  3023 found:  269  from true relations:  3292
for  Durham Castle and Cathedral , skipped:  2905 found:  387  from true relations:  3292
for  Doñana National Park , skipped:  3002 found:  290  from true relations:  3292
for  Djoudj National Bird Sanctuary , skipped:  3119 found:  173  from true relations:  3292
for  Djémila , skipped:  2996 found:  296  from true relations

## 2.4 Visualizing the results
 

In [148]:
# make the factory work
from rel_model.scripts.rel_pipe import make_relation_extractor

# make the config work
from rel_model.scripts.rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors

# Now test teh newly created spacy model on a sample text and visualize it using spacy
nlp2 = spacy.load(os.path.join(REL_OUTPUT, "model-best"))

example_text = str([text[0] for i, text in enumerate(training_data[:20]) if text != ""])
doc = nlp2(example_text)

displacy.render(doc, style="ent", jupyter=True, options=options)

# Show the tokens, their labels and their entities
for rel in doc._.rel:
    print(rel)

ℹ Could not determine any instances in doc - returning doc as is.


