Merge pull request #193 from nestauk/prodigy_ner
Add prodigy folder for tagging more skills data using prodigy
lizgzil committed Aug 18, 2023
2 parents a852d8e + d2fe5e6 commit f24eb5c
Showing 10 changed files with 491 additions and 32 deletions.
14 changes: 14 additions & 0 deletions ojd_daps_skills/getters/data_getters.py
@@ -116,6 +116,20 @@ def load_s3_json(s3, bucket_name, file_name):
    return json.loads(file)


def load_prodigy_jsonl_s3_data(s3, bucket_name, file_name):
    """
    Load Prodigy JSONL-formatted data from an S3 location.
    s3: S3 boto3 resource
    bucket_name: The S3 bucket name
    file_name: S3 key to load
    Returns a list of dicts, one per JSONL line.
    """
    obj = s3.Object(bucket_name, file_name)
    if fnmatch(file_name, "*.jsonl"):
        file = obj.get()["Body"].read().decode()
        return [json.loads(item) for item in file.strip().split("\n")]


def load_s3_data(s3, bucket_name, file_name):
    """
    Load data from S3 location.
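For context, a minimal sketch of how the new getter might be called; it assumes the `get_s3_resource` helper from the same module and the package-level `bucket_name`, with the Prodigy dataset key taken from `combine_labels.py`:

```python
from ojd_daps_skills import bucket_name
from ojd_daps_skills.getters.data_getters import (
    get_s3_resource,
    load_prodigy_jsonl_s3_data,
)

s3 = get_s3_resource()
records = load_prodigy_jsonl_s3_data(
    s3,
    bucket_name,
    "escoe_extension/outputs/labelled_job_adverts/prodigy/labelled_dataset_skills_080823.jsonl",
)
# Each record is one Prodigy annotation task, e.g. records[0]["answer"]
```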
29 changes: 20 additions & 9 deletions ojd_daps_skills/pipeline/skill_ner/README.md
@@ -96,27 +96,38 @@ Using the combined labelled data, we can fine-tune a Spacy model to extract skills
The model can be trained by running:

```
-python ojd_daps_skills/pipeline/skill_ner/ner_spacy.py --labelled_date_filename "escoe_extension/outputs/labelled_job_adverts/combined_labels_20220824.json" --convert_multiskill --train_prop 0.8 --drop_out 0.1 --learn_rate 0.001 --num_its 100
+python -m spacy download en_core_web_lg
```

-This will save out the model in a time stamped folder, e.g. `outputs/models/ner_model/20220825/`, it also saves out the evaluation results and some general information about the model training in the file `outputs/models/ner_model/20220825/train_details.json`.
+and then

-By default this won't sync the newly trained model to S3, but by adding `--save_s3` it will sync the `outputs/models/ner_model/20220825/` to S3.
+```
+python ojd_daps_skills/pipeline/skill_ner/ner_spacy.py --labelled_date_filename "escoe_extension/outputs/labelled_job_adverts/combined_labels_20230808.json" --convert_multiskill --train_prop 0.8 --drop_out 0.1 --learn_rate 0.001 --num_its 100 --save_s3
+```
+
+This will save the model in a time-stamped folder, e.g. `outputs/models/ner_model/20230808/`; it also saves the evaluation results and some general information about the model training in `outputs/models/ner_model/20230808/train_details.json`.
+
+By default this won't sync the newly trained model to S3, but adding `--save_s3` will sync `outputs/models/ner_model/20230808/` to S3.

-This model can be used by running:
+A trained model can be used by running:

```python
>>> from ojd_daps_skills.pipeline.skill_ner.ner_spacy import JobNER
>>> job_ner = JobNER()
->>> nlp = job_ner.load_model('outputs/models/ner_model/20220825/', s3_download=True)
->>> text = "We want someone with good communication and maths skills"
+>>> nlp = job_ner.load_model('outputs/models/ner_model/20230808/', s3_download=True)
+>>> text = "We want someone with good communication and maths skills. There are job benefits such as a pension and cycle to work scheme. We would like someone with experience in marketing."
>>> pred_ents = job_ner.predict(text)
>>> pred_ents
-[{'label': 'SKILL', 'start': 21, 'end': 39}, {'label': 'SKILL', 'start': 44, 'end': 56}]
+[{'label': 'SKILL', 'start': 26, 'end': 39},
+ {'label': 'SKILL', 'start': 44, 'end': 56},
+ {'label': 'BENEFIT', 'start': 103, 'end': 123},
+ {'label': 'EXPERIENCE', 'start': 152, 'end': 175}]
>>> for ent in pred_ents:
>>> print(text[ent['start']:ent['end']])
-good communication
+communication
maths skills
+cycle to work scheme
+experience in marketing
```

The `s3_download=True` argument means the model will first be downloaded from S3, so you don't need to have it locally to begin with.
@@ -126,7 +137,7 @@
Running

```
-python ojd_daps_skills/pipeline/skill_ner/get_skills.py --model_path outputs/models/ner_model/20220825/ --output_file_dir escoe_extension/outputs/data/skill_ner/skill_predictions/ --job_adverts_filename escoe_extension/inputs/data/skill_ner/data_sample/20220622_sampled_job_ads.json
+python ojd_daps_skills/pipeline/skill_ner/get_skills.py --model_path outputs/models/ner_model/20230808/ --output_file_dir escoe_extension/outputs/data/skill_ner/skill_predictions/ --job_adverts_filename escoe_extension/inputs/data/skill_ner/data_sample/20220622_sampled_job_ads.json
```

will make skill predictions on the data in `job_adverts_filename` (an output of `create_data_sample.py`) using the model loaded from `model_path`. By default this will look for the model on S3, but if you want to load a locally stored model just add `--use_local_model`.
91 changes: 90 additions & 1 deletion ojd_daps_skills/pipeline/skill_ner/combine_labels.py
@@ -13,20 +13,24 @@
    load_s3_json,
    load_s3_data,
    save_to_s3,
    load_prodigy_jsonl_s3_data,
)

from ojd_daps_skills import bucket_name

s3 = get_s3_resource()

-# The labelling outputs and the metadata files relevant for their inputs
+# The Label-Studio labelling outputs and the metadata files relevant for their inputs
labelled_data_s3_folders = {
    "escoe_extension/outputs/skill_span_labels/": "escoe_extension/outputs/data/skill_ner/label_chunks/20220624_0_sample_labelling_metadata.json",
    "escoe_extension/outputs/labelled_job_adverts/LIZ_skill_spans/": "escoe_extension/outputs/data/skill_ner/label_chunks/20220819_3_sample_labelling_metadata.json",
    "escoe_extension/outputs/labelled_job_adverts/INDIA_skill_spans/": "escoe_extension/outputs/data/skill_ner/label_chunks/20220819_1_sample_labelling_metadata.json",
    "escoe_extension/outputs/labelled_job_adverts/CATH_skill_spans/": "escoe_extension/outputs/data/skill_ner/label_chunks/20220819_0_sample_labelling_metadata.json",
}

# The Prodigy labelled data
prodigy_labelled_data_s3_folder = "escoe_extension/outputs/labelled_job_adverts/prodigy/labelled_dataset_skills_080823.jsonl"


def load_original_metadata(labelled_data_s3_folders):
    metadata_jobids = {}
@@ -112,11 +116,89 @@ def combine_results(labelled_data_s3_folders, keep_id_dict, metadata_jobids):
        job_labels[job_id] = {
            "text": job_advert_labels["task"]["data"]["text"],
            "labels": job_advert_labels["result"],
            "type": "label-studio",
        }

    return job_labels


def load_format_prodigy(prodigy_labelled_data_s3_folder):
    """
    Load all the Prodigy labels.
    Since these were labelled in 5-sentence chunks, sort them into a nested dict
    keyed by job advert id and sentence chunk number.
    """
    s3 = get_s3_resource()
    prodigy_data_chunks = defaultdict(dict)
    prodigy_data = load_prodigy_jsonl_s3_data(
        s3, bucket_name, prodigy_labelled_data_s3_folder
    )
    for p in prodigy_data:
        if p["answer"] == "accept":
            prodigy_data_chunks[str(p["meta"]["id"])][p["meta"]["chunk"]] = p
    return prodigy_data_chunks
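# For reference, each accepted Prodigy record is assumed to look roughly like:
# {
#     "answer": "accept",
#     "text": "...the 5-sentence chunk...",
#     "meta": {"id": "<job advert id>", "chunk": 0},
#     "spans": [{"start": 5, "end": 18, "label": "SKILL"}],
#     "_input_hash": ..., "_task_hash": ..., "_annotator_id": "...",
# }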


def combine_prodigy_spans(prodigy_data_chunks):
    """
    Since the Prodigy data was labelled in 5-sentence chunks, we need to merge
    these chunks for each advert, updating the span start and end characters
    to fit the merged text.
    """

    not_equal_spans_count = 0
    prodigy_job_labels = {}
    for job_id, job_adv_labels in prodigy_data_chunks.items():
        # Make sure the sentence chunks are in the correct order
        job_adv_labels = {k: job_adv_labels[k] for k in sorted(job_adv_labels)}

        # Combine texts and spans for each job advert
        full_text = []
        all_labels = []
        total_chars = 0
        for chunk_labels in job_adv_labels.values():
            full_text.append(chunk_labels["text"])
            for spans_info in chunk_labels["spans"]:
                all_labels.append(
                    {
                        "value": {
                            "start": spans_info["start"] + total_chars,
                            "end": spans_info["end"] + total_chars,
                            "text": chunk_labels["text"][
                                spans_info["start"] : spans_info["end"]
                            ],
                            "labels": [spans_info["label"]],
                        },
                        "id": (chunk_labels["_input_hash"], chunk_labels["_task_hash"]),
                        "origin": chunk_labels["_annotator_id"],
                    }
                )
            total_chars += (
                len(chunk_labels["text"]) + 2
            )  # plus two since the sentence chunks are joined together with ". "

        full_text = ". ".join(full_text)

        # Checks
        for v in all_labels:
            if v["value"]["text"] != full_text[v["value"]["start"] : v["value"]["end"]]:
                not_equal_spans_count += 1
        if not_equal_spans_count != 0:
            print(
                f"There were {not_equal_spans_count} issues with merging these spans. Please investigate"
            )

        # Final output
        prodigy_job_labels[job_id] = {
            "text": full_text,
            "labels": all_labels,
            "type": "prodigy",
        }

    return prodigy_job_labels
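# Worked example of the offset arithmetic above (hypothetical chunks):
#   chunk 0 text: "Good communication needed" (25 characters)
#   chunk 1 text: "Maths skills essential", with a span at (0, 12)
#   merged text:  "Good communication needed. Maths skills essential"
#   After chunk 0, total_chars = 25 + 2 = 27 (the + 2 accounts for ". "),
#   so the chunk 1 span becomes (0 + 27, 12 + 27) = (27, 39), i.e. "Maths skills".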


if __name__ == "__main__":

    metadata_jobids = load_original_metadata(labelled_data_s3_folders)
@@ -128,6 +210,13 @@ def combine_results(labelled_data_s3_folders, keep_id_dict, metadata_jobids):
    job_labels = combine_results(
        labelled_data_s3_folders, keep_id_dict, metadata_jobids
    )

    prodigy_data = load_format_prodigy(prodigy_labelled_data_s3_folder)
    prodigy_job_labels = combine_prodigy_spans(prodigy_data)

    # Merge the Label-Studio and Prodigy labels
    job_labels.update(prodigy_job_labels)
    print(f"We will be using data from {len(job_labels)} job adverts")

    from datetime import datetime as date

    date_stamp = str(date.today().date()).replace("-", "")
37 changes: 19 additions & 18 deletions ojd_daps_skills/pipeline/skill_ner/ner_spacy.py
@@ -36,7 +36,7 @@
from argparse import ArgumentParser
import pickle

-from spacy.util import minibatch, compounding
+from spacy.util import minibatch, compounding, fix_random_seed
from spacy.training.example import Example
import spacy
from spacy import displacy
@@ -152,7 +152,9 @@ def process_data(self, job_advert_labels, all_labels):
        # character order
        ent_list.sort(key=lambda y: y[0])

-        text, ent_list = clean_entities_text(text, ent_list)
+        if job_advert_labels.get("type") == "label-studio":
+            # Label-Studio-specific cleaning; won't work (and isn't needed) for Prodigy
+            text, ent_list = clean_entities_text(text, ent_list)

        return text, ent_list, all_labels

@@ -184,6 +186,7 @@ def load_data(self):
            text, ent_list, self.all_labels = self.process_data(
                label_data, self.all_labels
            )

            data.append(
                (
                    text,
@@ -193,7 +196,6 @@
                    },
                )
            )
        return data

    def get_test_train(self, data):
@@ -234,26 +236,21 @@ def prepare_model(self):
"""
Prepare a Spacy model to have it's NER component trained
"""
self.nlp = spacy.blank("en")
self.nlp.add_pipe("ner")
self.nlp.begin_training()
fix_random_seed(0)

# Use a new model
# self.nlp = spacy.blank("en")
# self.nlp.add_pipe("ner")
# self.nlp.begin_training()

# self.nlp = spacy.load("en_core_web_sm")
# Use a pre-trained model
self.nlp = spacy.load("en_core_web_lg")

# Getting the ner component
ner = self.nlp.get_pipe("ner")

# Add the new labels to ner (don't train the MULTISKILL)
self.train_labels = self.all_labels.copy()
if self.convert_multiskill:
self.train_labels.remove("MULTISKILL")

for label in self.train_labels:
ner.add_label(label)

# Resume training
self.optimizer = self.nlp.resume_training()
move_names = list(ner.move_names)
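        # Note: nlp.resume_training() (rather than nlp.begin_training()) keeps
        # the pre-trained en_core_web_lg weights intact and returns an optimizer
        # for further updates, which is what fine-tuning the NER component needs.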

    def train_multiskill_classifier(self, train_data, test_data):
        """
@@ -335,6 +332,7 @@ def train(
        self.drop_out = drop_out
        self.num_its = num_its
        self.learn_rate = learn_rate

        # List of pipes you want to train
        pipe_exceptions = ["ner"]
        # List of pipes which should remain unaffected in training
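        # A sketch of the standard spaCy pattern the elided training loop is
        # assumed to follow: disable every pipe except "ner" so only the NER
        # weights are updated, e.g.
        # other_pipes = [p for p in self.nlp.pipe_names if p not in pipe_exceptions]
        # with self.nlp.select_pipes(disable=other_pipes):
        #     ...update loop using minibatch and self.optimizer...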
@@ -450,7 +448,8 @@ def score(self, results_summary):

    def save_model(self, output_folder, save_s3=False):

-        output_folder = os.path.join(str(PROJECT_DIR), output_folder)
+        if not save_s3:
+            output_folder = os.path.join(str(PROJECT_DIR), output_folder)

        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
@@ -482,7 +481,6 @@ def save_model(self, output_folder, save_s3=False):
"ms_classifier_train_evaluation": self.ms_classifier_train_evaluation,
"ms_classifier_test_evaluation": self.ms_classifier_test_evaluation,
"seen_job_ids": self.seen_job_ids,
"losses": self.all_losses,
}
)
save_json_dict(
@@ -579,7 +577,9 @@ def parse_arguments(parser):
        convert_multiskill=args.convert_multiskill,
        train_prop=float(args.train_prop),
    )

    data = job_ner.load_data()

    train_data, test_data = job_ner.get_test_train(data)

    job_ner.prepare_model()
@@ -597,4 +597,5 @@
    date_stamp = str(date.today().date()).replace("-", "")
    output_folder = f"outputs/models/ner_model/{date_stamp}"
    results = job_ner.evaluate(test_data)

    job_ner.save_model(output_folder, args.save_s3)
7 changes: 4 additions & 3 deletions ojd_daps_skills/pipeline/skill_ner/ner_spacy_utils.py
@@ -102,9 +102,10 @@ def fix_entity_annotations(text, ents):

        # If the char before the start of this span is not a space,
        # then update from this ent onwards
-        if text[b - 1] != " ":
-            ent_additions[i:] = [ea + 1 for ea in ent_additions[i:]]
-            insert_index_space.append(b)
+        if b != 0:
+            if text[b - 1] != " ":
+                ent_additions[i:] = [ea + 1 for ea in ent_additions[i:]]
+                insert_index_space.append(b)
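        # Why the b != 0 guard: in Python, text[b - 1] with b == 0 evaluates to
        # text[-1], the last character of the string, so a span starting at
        # position 0 could previously trigger a spurious space insertion.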

        # If the next char after this span is not a space,
        # then update the start and endings of all entities after this
