In [1]:
%load_ext autoreload
%autoreload 2

# Processing Label Studio Exports for Spacy
This notebook processes exports from Label Studio for use in Spacy.

In [2]:
import os
from pprint import pprint
from pathlib import Path

from processor import Processor

# Set the root directory of the project
ROOT_DIR = os.path.abspath(
    os.path.join(os.path.dirname("model_trainig.ipynb"), os.pardir)
)  # This file is the root of the project
DATA_PATH = os.path.join(ROOT_DIR, "data")

# Determine which assignment component to run
COMPONENT: int = 2 # 1 or 2

# Define all general paths
NER_MODEL: Path = os.path.join(ROOT_DIR, 'model', 'ner_model')
REL_MODEL: Path = os.path.join(ROOT_DIR, 'model', 'rel_model')

processor = Processor(ROOT_DIR)

Root directory: c:\Users\diede\Documents\GitHub\Text-Mining


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Define all component specific paths
if COMPONENT == 1:
    ANNOTATIONS: Path = 'final_assignment_1.json'
    GROUND_TRUTHS: Path = 'label_studio_ground_truth_task1.json'
    NER_DEV: Path = os.path.join(NER_MODEL, "assets", "dev_1.json")
    NER_TRAIN: Path = os.path.join(NER_MODEL, "assets", "train_1.json")
    NER_DEV_CORP: Path = os.path.join(NER_MODEL, "corpus", "dev_1.spacy")
    NER_TRAIN_CORP: Path = os.path.join(NER_MODEL, "corpus", "train_1.spacy")
    NER_OUTPUT: Path = os.path.join(NER_MODEL, "training_1")
    REL_ASSETS_TRAIN: Path = os.path.join(REL_MODEL, "assets", "annotations_1_train.jsonl")
    REL_ASSETS_DEV: Path = os.path.join(REL_MODEL, "assets", "annotations_1_dev.jsonl")
    REL_OUTPUT: Path = os.path.join(REL_MODEL, "training_1")
elif COMPONENT == 2:
    ANNOTATIONS: Path = 'final_assignment_2.json'
    GROUND_TRUTHS: Path = 'label_studio_ground_truth_task2.json'
    NER_DEV: Path = os.path.join(NER_MODEL, "assets", "dev_2.json")
    NER_TRAIN: Path = os.path.join(NER_MODEL, "assets", "train_2.json")
    NER_DEV_CORP: Path = os.path.join(NER_MODEL, "corpus", "dev_2.spacy")
    NER_TRAIN_CORP: Path = os.path.join(NER_MODEL, "corpus", "train_2.spacy")
    NER_OUTPUT: Path = os.path.join(NER_MODEL, "training_2")
    REL_ASSETS_TRAIN: Path = os.path.join(REL_MODEL, "assets", "annotations_2_train.jsonl")
    REL_ASSETS_DEV: Path = os.path.join(REL_MODEL, "assets", "annotations_2_dev.jsonl")
    REL_OUTPUT: Path = os.path.join(REL_MODEL, "training_2")
else:
    raise ValueError("COMPONENT must be 1 or 2")

## 1. Loading the Data
### 1.1. Loading JSON export from Label Studio
The annotations need to be loaded into the notebook. This is done via the preprocessor Python class. The training_data_export variable will store the item if this item is not used in the ground_truths. Additionally the ground_truth file is read and stored in the ground_truths variable.

In [4]:
export_data = processor.loadFile(ANNOTATIONS)

# Filter out annotations for which a ground truth exists (drop other annotations for this article as well)
training_data_export = [
    item
    for item in export_data
    if all(annotation["ground_truth"] is False for annotation in item["annotations"])
]
ground_truth_export = processor.loadFile(GROUND_TRUTHS)

#### 1.1.1 Check the training_data_export
The exported data constists of all the annotations from a specific text of all the users. The first annotated text in the exported data file looks something like the following:

In [5]:
print("Length of training data: ", len(training_data_export))
pprint(training_data_export[0])

Length of training data:  58
{'agreement': 100.0,
 'annotations': [{'completed_by': {'email': 'l.r.siecker@student.tue.nl',
                                   'first_name': 'Luc',
                                   'id': 12485,
                                   'last_name': 'Siecker'},
                  'created_at': '2023-10-18T09:14:20.056013Z',
                  'draft_created_at': '2023-10-18T08:43:28.496908Z',
                  'ground_truth': False,
                  'id': 24574363,
                  'import_id': None,
                  'last_action': 'updated',
                  'last_created_by': 12485,
                  'lead_time': 3107.605,
                  'parent_annotation': None,
                  'parent_prediction': None,
                  'prediction': {},
                  'project': 43850,
                  'result': [{'from_name': 'label',
                              'id': '6s2CenzJex',
                              'origin': 'manual',
                         

### 1.2 Converting to Spacy training format:
To provide custom labels to Spacy, we need to convert the data to the following format:

```python
training_data = [
  ("Tokyo Tower is 333m tall.", [(0, 11, "BUILDING"), (20, 23, "HEIGHT")]),
]
```

The `process_export` function from the preprocessor can fix this for us, it retrieves the training data in the given format, in combination with the labels and their relationship

In [6]:
training_data, training_relations = processor.process_export_sentences(training_data_export, component=COMPONENT)
validation_data, validation_relations = processor.process_export_sentences(ground_truth_export, ground_truth = True, component=COMPONENT)

#### 1.2.1 Checking results
Now it is time to check what the results are from the process_export_sentences. The training data and training_relations will look differently and both are shown below. For all examples only the first text is used. Specifically only the first entity is shown for the entities that will be used to train the NER model. For the REL model one whole text is shown.

In [7]:
print("Training data info item 1 \ntext:")
print(training_data[0][0])
print("Labels:")
print(*training_data[0][1]["entities"], sep = "\n")

print("\n Validation data info item 1 \ntext:")
print(validation_data[0][0])
print("Labels:")
print(*validation_data[0][1]["entities"], sep = "\n")

Training data info item 1 
text:
On December 6, 2021, at approximately 1203 Mountain Standard Time, Unit 3 reactor automatically tripped due to receipt of a low departure from nucleate boiling ratio trip signal.
Labels:
[3, 19, 'Datetime']

 Validation data info item 1 
text:
On July 6, 1987, with the plant in Mode 2 at 5 E-6 amps reactor power, a reactor scram on high main coolant pressure occurred during a main turbine overspeed trip test.
Labels:
[3, 15, 'Datetime']


In [8]:
pprint(training_relations[0])

{'answer': 'accept',
 'meta': {'source': '72964643'},
 'relations': [{'child': 11,
                'child_span': {'end': 65,
                               'label': 'Datetime',
                               'start': 38,
                               'token_end': 11,
                               'token_start': 8},
                'head': 17,
                'head_span': {'end': 103,
                              'label': 'Event',
                              'start': 82,
                              'token_end': 17,
                              'token_start': 16},
                'label': 'org:happened_at'},
               {'child': 4,
                'child_span': {'end': 19,
                               'label': 'Datetime',
                               'start': 3,
                               'token_end': 4,
                               'token_start': 1},
                'head': 17,
                'head_span': {'end': 103,
                              'label': 'Event'

### 1.3 Preparing the data for Spacy
The data is now in the correct format, so it can be processed and saved as Spacy training file using the `preprocess_spacy` function from the `Preprocessor` class.

In [9]:
processor.preprocess_json(training_data = training_data, validation_data = validation_data, train_path = NER_TRAIN, dev_path = NER_DEV)

In [10]:
processor.preprocess_json_rel(relational_annotations_train = training_relations, relational_annotations_val = validation_relations, save_path_train=REL_ASSETS_TRAIN, save_path_dev=REL_ASSETS_DEV)

# 2 Training the Spacy Model on on the training file

For training Spacy is used. This is a good way to implement our own data into a specific model. Spacy works out of the box, but not with our specific labels and/or relations. The default behaviour of spacy looks like the following:

### 2.1 Spacy before training with custom labels

In [11]:
import spacy
from spacy import displacy

example_text = training_data[0][0]

nlp = spacy.load("en_core_web_sm")
doc = nlp(example_text)

displacy.render(doc, style="ent")

### 2.2 Training a spacy NER model
Training of a spacy model usually is done via the command line. This is the reason for the following no so understandable lines of code. There are a few steps in the training process:
1. The spacy model needs a config file and all necessary files are in the spacy folder
2. The model needs training data, which was exported in this file above to the spacy folder
3. After training the model is evaluated and the results are printed for training and evaluation

In [12]:
from ner_model.scripts.convert import convert as ner_convert

ner_convert("en", NER_TRAIN, NER_TRAIN_CORP)
ner_convert("en", NER_DEV, NER_DEV_CORP)

Start converting NER data...
Finished convertin NER data
Start converting NER data...
Finished convertin NER data



"At approximately 1028 a.m. on August 11, 1994, Braidwood Unit 1 experienced a spurious Train 'A' Main Steam Line Isolation followed by an automatic reactor trip from Low Low water level on the 1C Steam Generator."



In [13]:
from spacy.cli.train import train

train("ner_model/configs/config.cfg", output_path=NER_OUTPUT, overrides={"paths.train": NER_TRAIN_CORP, "paths.dev": NER_DEV_CORP})

[38;5;4mℹ Saving to output directory:
c:\Users\diede\Documents\GitHub\Text-Mining\model\ner_model\training_2[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler',
'lemmatizer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SPEED   SCORE 
---  ------  ------------  -----------  -----------  --------  ------  ------  ------  ------  ------
  0       0          0.00         0.00         0.00     15.82    0.00    0.00    0.00  4787.31    0.00
  0     100          0.00         0.00         0.00    934.84    0.00    0.00    0.00  6584.81    0.00
  2     200          0.00         0.00         0.00    851.10   15.97   44.68    9.72  6682.47    0.21
  3     300          0.00         0.00         0.00    787.52   17.36   34.72   11.57  6405.34    0.20
  4     400          0.00         0.00         0.00    907.04

In [None]:
from spacy.cli.evaluate import evaluate

evaluate(os.path.join(NER_OUTPUT, "model-best"), NER_DEV_CORP, code="evaluate.py", output=os.path.join(NER_OUTPUT, "metrics.json"))

TypeError: evaluate() got an unexpected keyword argument 'code'

In [None]:
options = {
    "colors": {"location": "lightyellow",
               "person_name": "lightgreen",
               "landmark_name": "lightred",
               "condition": "lightblue"}
}

# Now test teh newly created spacy model on a sample text and visualize it using spacy
nlp = spacy.load(os.path.join(NER_OUTPUT, "model-best"))

example_text = str([text[0] for i, text in enumerate(training_data[:20]) if text != ""])
doc = nlp(example_text)

displacy.render(doc, style="ent", jupyter=True, options=options)

# Show the tokens, their labels and their entities
for ent in doc.ents:
    print(ent.text, ent.label_)

December 6, 2021 Datetime
control element assembly alignment was taking place Activity
reactor Location
all control element assemblies inserted fully into the reactor core Event
Unit 3 reactor Location
electrical short was identified Activity
lowered voltage caused current to go below minimum required holding current and resulted in a slipped control element assembly. Cause
PVNGS Units 1 and 2 Location
2/18/87 Datetime
operators Attribute
0003 hours Datetime
0101 hours Datetime
procedure inadequacy. Cause
concluded that a manual load reduction Activity


### 2.3 Training the Spacy Custom REL component

In [None]:
import subprocess
if spacy.prefer_gpu():
    output = subprocess.run(f"spacy project run all_{COMPONENT}_gpu", cwd="rel_model", capture_output=True)
else:
    output = subprocess.run(f"spacy project run all_{COMPONENT}", cwd="rel_model", capture_output=True)

print(output.stdout.decode("utf-8"))

[38;5;4mℹ Running workflow 'all_2'[0m
[1m
[38;5;4mℹ Skipping 'data2': nothing changed[0m
[1m
Running command: 'C:\Users\diede\anaconda3\envs\tm\python.exe' -m spacy train configs/rel_tok2vec.cfg --output training_2 --paths.train data/train_2.spacy --paths.dev data/dev_2.spacy -c ./scripts/custom_functions.py
[38;5;2m✔ Created output directory: training_2[0m
[38;5;4mℹ Saving to output directory: training_2[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'relation_extractor'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS RELAT...  REL_MICRO_P  REL_MICRO_R  REL_MICRO_F  SCORE 
---  ------  ------------  -------------  -----------  -----------  -----------  ------
  0       0          0.09           2.23         0.00         0.00         0.00    0.00
  8     500          0.66          47.66         0.00         0.00         0.00    0.00
 17    1000          0.17          37.51         0.00  

## 2.4 Visualizing the results
 

In [None]:
# make the custom rel component work, this MUST be imported
from rel_model.scripts.rel_pipe import make_relation_extractor
from rel_model.scripts.rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors

# Now test teh newly created spacy model on a sample text and visualize it using spacy
nlp2 = spacy.load(os.path.join(REL_OUTPUT, "model-best"))

doc = nlp2(doc.text) # doc is the output from the NER model (nlp)

displacy.render(doc, style="ent", jupyter=True, options=options)

# Show the tokens, their labels and their entities
for rel in doc._.rel:
    print(rel)

[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m


