Merge pull request #193 from nestauk/prodigy_ner
Add prodigy folder for tagging more skills data using prodigy
lizgzil committed Aug 18, 2023
2 parents a852d8e + d2fe5e6 commit f24eb5c
Showing 10 changed files with 491 additions and 32 deletions.
14 changes: 14 additions & 0 deletions ojd_daps_skills/getters/data_getters.py
@@ -116,6 +116,20 @@ def load_s3_json(s3, bucket_name, file_name):
    return json.loads(file)


def load_prodigy_jsonl_s3_data(s3, bucket_name, file_name):
    """
    Load Prodigy JSONL-formatted data from an S3 location.
    s3: S3 boto3 resource
    bucket_name: The S3 bucket name
    file_name: S3 key to load
    Returns a list of dicts, one per JSONL line.
    """
    obj = s3.Object(bucket_name, file_name)
    if fnmatch(file_name, "*.jsonl"):
        file = obj.get()["Body"].read().decode()
        return [json.loads(item) for item in file.strip().split("\n")]


def load_s3_data(s3, bucket_name, file_name):
    """
    Load data from S3 location.
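For context, a minimal sketch of how the new getter might be called; it assumes the `get_s3_resource` helper from the same module and the package-level `bucket_name`, with the Prodigy dataset key taken from `combine_labels.py`:

```python
from ojd_daps_skills import bucket_name
from ojd_daps_skills.getters.data_getters import (
    get_s3_resource,
    load_prodigy_jsonl_s3_data,
)

s3 = get_s3_resource()
records = load_prodigy_jsonl_s3_data(
    s3,
    bucket_name,
    "escoe_extension/outputs/labelled_job_adverts/prodigy/labelled_dataset_skills_080823.jsonl",
)
# Each record is one Prodigy annotation task, e.g. records[0]["answer"]
```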
29 changes: 20 additions & 9 deletions ojd_daps_skills/pipeline/skill_ner/README.md
@@ -96,27 +96,38 @@ Using the combined labelled data, we can fine-tune a Spacy model to extract skills
The model can be trained by running:

```
-python ojd_daps_skills/pipeline/skill_ner/ner_spacy.py --labelled_date_filename "escoe_extension/outputs/labelled_job_adverts/combined_labels_20220824.json" --convert_multiskill --train_prop 0.8 --drop_out 0.1 --learn_rate 0.001 --num_its 100
+python -m spacy download en_core_web_lg
```

-This will save out the model in a time stamped folder, e.g. `outputs/models/ner_model/20220825/`, it also saves out the evaluation results and some general information about the model training in the file `outputs/models/ner_model/20220825/train_details.json`.
+and then

-By default this won't sync the newly trained model to S3, but by adding `--save_s3` it will sync the `outputs/models/ner_model/20220825/` to S3.
+```
+python ojd_daps_skills/pipeline/skill_ner/ner_spacy.py --labelled_date_filename "escoe_extension/outputs/labelled_job_adverts/combined_labels_20230808.json" --convert_multiskill --train_prop 0.8 --drop_out 0.1 --learn_rate 0.001 --num_its 100 --save_s3
+```
+
+This will save the model in a time-stamped folder, e.g. `outputs/models/ner_model/20230808/`; it also saves the evaluation results and some general information about the model training in `outputs/models/ner_model/20230808/train_details.json`.
+
+By default this won't sync the newly trained model to S3, but adding `--save_s3` will sync `outputs/models/ner_model/20230808/` to S3.

-This model can be used by running:
+A trained model can be used by running:

```python
>>> from ojd_daps_skills.pipeline.skill_ner.ner_spacy import JobNER
>>> job_ner = JobNER()
->>> nlp = job_ner.load_model('outputs/models/ner_model/20220825/', s3_download=True)
->>> text = "We want someone with good communication and maths skills"
+>>> nlp = job_ner.load_model('outputs/models/ner_model/20230808/', s3_download=True)
+>>> text = "We want someone with good communication and maths skills. There are job benefits such as a pension and cycle to work scheme. We would like someone with experience in marketing."
>>> pred_ents = job_ner.predict(text)
>>> pred_ents
-[{'label': 'SKILL', 'start': 21, 'end': 39}, {'label': 'SKILL', 'start': 44, 'end': 56}]
+[{'label': 'SKILL', 'start': 26, 'end': 39},
+ {'label': 'SKILL', 'start': 44, 'end': 56},
+ {'label': 'BENEFIT', 'start': 103, 'end': 123},
+ {'label': 'EXPERIENCE', 'start': 152, 'end': 175}]
>>> for ent in pred_ents:
>>> print(text[ent['start']:ent['end']])
-good communication
+communication
maths skills
+cycle to work scheme
+experience in marketing
```

The `s3_download=True` argument means the model will first be downloaded from S3, so you don't need to have it locally to begin with.
@@ -126,7 +137,7 @@
Running

```
-python ojd_daps_skills/pipeline/skill_ner/get_skills.py --model_path outputs/models/ner_model/20220825/ --output_file_dir escoe_extension/outputs/data/skill_ner/skill_predictions/ --job_adverts_filename escoe_extension/inputs/data/skill_ner/data_sample/20220622_sampled_job_ads.json
+python ojd_daps_skills/pipeline/skill_ner/get_skills.py --model_path outputs/models/ner_model/20230808/ --output_file_dir escoe_extension/outputs/data/skill_ner/skill_predictions/ --job_adverts_filename escoe_extension/inputs/data/skill_ner/data_sample/20220622_sampled_job_ads.json
```

will make skill predictions on the data in `job_adverts_filename` (an output of `create_data_sample.py`) using the model loaded from `model_path`. By default this will look for the model on S3, but if you want to load a locally stored model just add `--use_local_model`.
91 changes: 90 additions & 1 deletion ojd_daps_skills/pipeline/skill_ner/combine_labels.py
@@ -13,20 +13,24 @@
    load_s3_json,
    load_s3_data,
    save_to_s3,
    load_prodigy_jsonl_s3_data,
)

from ojd_daps_skills import bucket_name

s3 = get_s3_resource()

-# The labelling outputs and the metadata files relevant for their inputs
+# The Label-Studio labelling outputs and the metadata files relevant for their inputs
labelled_data_s3_folders = {
    "escoe_extension/outputs/skill_span_labels/": "escoe_extension/outputs/data/skill_ner/label_chunks/20220624_0_sample_labelling_metadata.json",
    "escoe_extension/outputs/labelled_job_adverts/LIZ_skill_spans/": "escoe_extension/outputs/data/skill_ner/label_chunks/20220819_3_sample_labelling_metadata.json",
    "escoe_extension/outputs/labelled_job_adverts/INDIA_skill_spans/": "escoe_extension/outputs/data/skill_ner/label_chunks/20220819_1_sample_labelling_metadata.json",
    "escoe_extension/outputs/labelled_job_adverts/CATH_skill_spans/": "escoe_extension/outputs/data/skill_ner/label_chunks/20220819_0_sample_labelling_metadata.json",
}

# The Prodigy labelled data
prodigy_labelled_data_s3_folder = "escoe_extension/outputs/labelled_job_adverts/prodigy/labelled_dataset_skills_080823.jsonl"


def load_original_metadata(labelled_data_s3_folders):
    metadata_jobids = {}
@@ -112,11 +116,89 @@ def combine_results(labelled_data_s3_folders, keep_id_dict, metadata_jobids):
        job_labels[job_id] = {
            "text": job_advert_labels["task"]["data"]["text"],
            "labels": job_advert_labels["result"],
            "type": "label-studio",
        }

    return job_labels


def load_format_prodigy(prodigy_labelled_data_s3_folder):
    """
    Load all the Prodigy labels.
    Since these were labelled in 5-sentence chunks, sort them into a nested dict
    keyed by job advert id and sentence chunk number.
    """
    s3 = get_s3_resource()
    prodigy_data_chunks = defaultdict(dict)
    prodigy_data = load_prodigy_jsonl_s3_data(
        s3, bucket_name, prodigy_labelled_data_s3_folder
    )
    for p in prodigy_data:
        if p["answer"] == "accept":
            prodigy_data_chunks[str(p["meta"]["id"])][p["meta"]["chunk"]] = p
    return prodigy_data_chunks
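# For reference, each accepted Prodigy record is assumed to look roughly like:
# {
#     "answer": "accept",
#     "text": "...the 5-sentence chunk...",
#     "meta": {"id": "<job advert id>", "chunk": 0},
#     "spans": [{"start": 5, "end": 18, "label": "SKILL"}],
#     "_input_hash": ..., "_task_hash": ..., "_annotator_id": "...",
# }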


def combine_prodigy_spans(prodigy_data_chunks):
    """
    Since the Prodigy data was labelled in 5-sentence chunks, we need to merge
    these chunks for each advert, updating the span start and end characters
    to fit the merged text.
    """

    not_equal_spans_count = 0
    prodigy_job_labels = {}
    for job_id, job_adv_labels in prodigy_data_chunks.items():
        # Make sure the sentence chunks are in the correct order
        job_adv_labels = {k: job_adv_labels[k] for k in sorted(job_adv_labels)}

        # Combine texts and spans for each job advert
        full_text = []
        all_labels = []
        total_chars = 0
        for chunk_labels in job_adv_labels.values():
            full_text.append(chunk_labels["text"])
            for spans_info in chunk_labels["spans"]:
                all_labels.append(
                    {
                        "value": {
                            "start": spans_info["start"] + total_chars,
                            "end": spans_info["end"] + total_chars,
                            "text": chunk_labels["text"][
                                spans_info["start"] : spans_info["end"]
                            ],
                            "labels": [spans_info["label"]],
                        },
                        "id": (chunk_labels["_input_hash"], chunk_labels["_task_hash"]),
                        "origin": chunk_labels["_annotator_id"],
                    }
                )
            total_chars += (
                len(chunk_labels["text"]) + 2
            )  # plus two since the sentence chunks are joined together with ". "

        full_text = ". ".join(full_text)

        # Checks
        for v in all_labels:
            if v["value"]["text"] != full_text[v["value"]["start"] : v["value"]["end"]]:
                not_equal_spans_count += 1
        if not_equal_spans_count != 0:
            print(
                f"There were {not_equal_spans_count} issues with merging these spans. Please investigate"
            )

        # Final output
        prodigy_job_labels[job_id] = {
            "text": full_text,
            "labels": all_labels,
            "type": "prodigy",
        }

    return prodigy_job_labels
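# Worked example of the offset arithmetic above (hypothetical chunks):
#   chunk 0 text: "Good communication needed" (25 characters)
#   chunk 1 text: "Maths skills essential", with a span at (0, 12)
#   merged text:  "Good communication needed. Maths skills essential"
#   After chunk 0, total_chars = 25 + 2 = 27 (the + 2 accounts for ". "),
#   so the chunk 1 span becomes (0 + 27, 12 + 27) = (27, 39), i.e. "Maths skills".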


if __name__ == "__main__":

    metadata_jobids = load_original_metadata(labelled_data_s3_folders)
@@ -128,6 +210,13 @@ def combine_results(labelled_data_s3_folders, keep_id_dict, metadata_jobids):
    job_labels = combine_results(
        labelled_data_s3_folders, keep_id_dict, metadata_jobids
    )

    prodigy_data = load_format_prodigy(prodigy_labelled_data_s3_folder)
    prodigy_job_labels = combine_prodigy_spans(prodigy_data)

    # Merge the Label-Studio and Prodigy labels
    job_labels.update(prodigy_job_labels)
    print(f"We will be using data from {len(job_labels)} job adverts")

    from datetime import datetime as date

    date_stamp = str(date.today().date()).replace("-", "")
37 changes: 19 additions & 18 deletions ojd_daps_skills/pipeline/skill_ner/ner_spacy.py
@@ -36,7 +36,7 @@
from argparse import ArgumentParser
import pickle

-from spacy.util import minibatch, compounding
+from spacy.util import minibatch, compounding, fix_random_seed
from spacy.training.example import Example
import spacy
from spacy import displacy
@@ -152,7 +152,9 @@ def process_data(self, job_advert_labels, all_labels):
        # character order
        ent_list.sort(key=lambda y: y[0])

-        text, ent_list = clean_entities_text(text, ent_list)
+        if job_advert_labels.get("type") == "label-studio":
+            # Label-Studio-specific cleaning; won't work (and isn't needed) for Prodigy
+            text, ent_list = clean_entities_text(text, ent_list)

        return text, ent_list, all_labels

@@ -184,6 +186,7 @@ def load_data(self):
            text, ent_list, self.all_labels = self.process_data(
                label_data, self.all_labels
            )

            data.append(
                (
                    text,
@@ -193,7 +196,6 @@
                    },
                )
            )
        return data

    def get_test_train(self, data):
@@ -234,26 +236,21 @@ def prepare_model(self):
"""
Prepare a Spacy model to have it's NER component trained
"""
self.nlp = spacy.blank("en")
self.nlp.add_pipe("ner")
self.nlp.begin_training()
fix_random_seed(0)

# Use a new model
# self.nlp = spacy.blank("en")
# self.nlp.add_pipe("ner")
# self.nlp.begin_training()

# self.nlp = spacy.load("en_core_web_sm")
# Use a pre-trained model
self.nlp = spacy.load("en_core_web_lg")

# Getting the ner component
ner = self.nlp.get_pipe("ner")

# Add the new labels to ner (don't train the MULTISKILL)
self.train_labels = self.all_labels.copy()
if self.convert_multiskill:
self.train_labels.remove("MULTISKILL")

for label in self.train_labels:
ner.add_label(label)

# Resume training
self.optimizer = self.nlp.resume_training()
move_names = list(ner.move_names)
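        # Note: nlp.resume_training() (rather than nlp.begin_training()) keeps
        # the pre-trained en_core_web_lg weights intact and returns an optimizer
        # for further updates, which is what fine-tuning the NER component needs.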

    def train_multiskill_classifier(self, train_data, test_data):
        """
@@ -335,6 +332,7 @@ def train(
        self.drop_out = drop_out
        self.num_its = num_its
        self.learn_rate = learn_rate

        # List of pipes you want to train
        pipe_exceptions = ["ner"]
        # List of pipes which should remain unaffected in training
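        # A sketch of the standard spaCy pattern the elided training loop is
        # assumed to follow: disable every pipe except "ner" so only the NER
        # weights are updated, e.g.
        # other_pipes = [p for p in self.nlp.pipe_names if p not in pipe_exceptions]
        # with self.nlp.select_pipes(disable=other_pipes):
        #     ...update loop using minibatch and self.optimizer...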
@@ -450,7 +448,8 @@ def score(self, results_summary):

    def save_model(self, output_folder, save_s3=False):

-        output_folder = os.path.join(str(PROJECT_DIR), output_folder)
+        if not save_s3:
+            output_folder = os.path.join(str(PROJECT_DIR), output_folder)

        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
@@ -482,7 +481,6 @@ def save_model(self, output_folder, save_s3=False):
"ms_classifier_train_evaluation": self.ms_classifier_train_evaluation,
"ms_classifier_test_evaluation": self.ms_classifier_test_evaluation,
"seen_job_ids": self.seen_job_ids,
"losses": self.all_losses,
}
)
save_json_dict(
@@ -579,7 +577,9 @@ def parse_arguments(parser):
        convert_multiskill=args.convert_multiskill,
        train_prop=float(args.train_prop),
    )

    data = job_ner.load_data()

    train_data, test_data = job_ner.get_test_train(data)

    job_ner.prepare_model()
@@ -597,4 +597,5 @@
    date_stamp = str(date.today().date()).replace("-", "")
    output_folder = f"outputs/models/ner_model/{date_stamp}"
    results = job_ner.evaluate(test_data)

    job_ner.save_model(output_folder, args.save_s3)
7 changes: 4 additions & 3 deletions ojd_daps_skills/pipeline/skill_ner/ner_spacy_utils.py
@@ -102,9 +102,10 @@ def fix_entity_annotations(text, ents):

        # If the char before the start of this span is not a space,
        # then update from this ent onwards
-        if text[b - 1] != " ":
-            ent_additions[i:] = [ea + 1 for ea in ent_additions[i:]]
-            insert_index_space.append(b)
+        if b != 0:
+            if text[b - 1] != " ":
+                ent_additions[i:] = [ea + 1 for ea in ent_additions[i:]]
+                insert_index_space.append(b)
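        # Why the b != 0 guard: in Python, text[b - 1] with b == 0 evaluates to
        # text[-1], the last character of the string, so a span starting at
        # position 0 could previously trigger a spurious space insertion.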

        # If the next char after this span is not a space,
        # then update the start and endings of all entities after this
