In [None]:
import spacy
import yaml
from pathlib import Path
import pandas as pd
import json

# Without GPU this is quite slow. Further info on using a GCP server on GPU can be found in GCP.md
# Ask spaCy to prefer the GPU for the pipelines loaded later in this notebook.
spacy.prefer_gpu()

In [None]:
# Load the pre-processed ESCO dump and build one row per skill.
# NOTE(review): the file presumably holds one record per (skill, altLabel) pair
# exported from SPARQL -- confirm against the export script.
df = pd.read_json("../esco/esco.json.gz", orient="records")
df.index = df.s
skills = df.groupby(df.s).agg(
    {
        # De-duplicate the alternative labels and sort them: iterating a set of
        # strings has no stable order across kernel restarts, which would make the
        # derived text and the generated pattern files differ from run to run.
        # Missing labels are dropped so that the string operations below do not fail.
        "altLabel": lambda x: sorted(set(x.dropna())),
        # The remaining fields are taken from the first row of each skill.
        "label": lambda x: x.iloc[0],
        "description": lambda x: x.iloc[0],
        "skillType": lambda x: x.iloc[0],
    }
)
# Add a lowercase text field for semantic search.
skills["text"] = skills.apply(
    lambda x: "; ".join([x.label] + x.altLabel + [x.description]).lower(), axis=1
)
# .. and a set of all the labels for each skill.
skills["allLabel"] = skills.apply(
    lambda x: {t.lower() for t in x.altLabel} | {x.label.lower()}, axis=1
)



In [None]:
# Smoke test: alternative labels of every skill whose preferred label mentions "Python".
skills.loc[skills.label.str.contains("Python"), "altLabel"].tolist()

In [None]:
def _label_pattern(text: str) -> list:
    """Turn one label into a token pattern, one token spec per whitespace-separated word."""
    words = text.split()
    if len(text.strip()) <= 3:
        # Very short labels (e.g. "C", "SQL") are matched case-sensitively.
        return [{"TEXT": word} for word in words]
    return [{"LOWER": word.lower()} for word in words]


def make_pattern(kn: dict):
    """Given an ESCO skill entry in the dataframe, create the patterns for the matcher.

    The entry has the following fields:
    - label: the preferred label
    - altLabel: a list of alternative labels (a single string is also accepted)
    - skillType: e.g. knowledge, skill, ability

    Heuristic used for every label (preferred and alternative):
    - labels of up to 3 characters are matched on the exact text (``TEXT``);
    - longer labels are matched case-insensitively (``LOWER``);
    - labels are split on whitespace so that each word becomes its own token spec.
      A token spec describes exactly one token, so a multi-word label kept in a
      single spec could never match.

    NOTE(review): whitespace splitting only approximates the spaCy tokenizer; labels
    with punctuation attached to a word (e.g. parentheses) may still not match.

    Returns a tuple ``(pattern_identifier, patterns)`` where ``pattern_identifier`` is
    built from the first two letters of the skill type and the preferred label, and
    ``patterns`` is a list of token patterns without duplicates (the preferred label
    always comes first).
    """
    label = kn["label"]
    patterns = [_label_pattern(label)]

    alt_labels = kn["altLabel"]
    if alt_labels is None:
        alt_labels = []
    elif isinstance(alt_labels, str):
        alt_labels = [alt_labels]

    for alt in alt_labels:
        if not isinstance(alt, str):
            # Missing values (None/NaN) carry no label to match.
            continue
        candidate = _label_pattern(alt)
        # Skip blank labels and labels that yield an already known pattern.
        if not candidate or candidate in patterns:
            continue
        patterns.append(candidate)

    # e.g. ("knowledge", "Python (computer programming)") -> "KN_PYTHON_COMPUTER_PROGRAMMING"
    pattern_identifier = f"{kn['skillType'][:2]}_{label.replace(' ', '_')}".upper().translate(
        str.maketrans("", "", "()")
    )
    return pattern_identifier, patterns

# Build the matcher table: pattern identifier -> list of token patterns.
# When two skills yield the same identifier, the later one wins.
m = {
    identifier: skill_patterns
    for identifier, skill_patterns in map(make_pattern, skills.to_dict(orient="records"))
}


In [None]:
# Use spacy matcher with a blank model to validate the patterns.
# If this doesn't work, spacy will raise an error.
import spacy
from spacy.matcher import Matcher
# A blank English pipeline is used here only to provide the vocab the Matcher needs.
nlp_test = spacy.blank("en")
m1 = Matcher(nlp_test.vocab, validate=True)
# Register every identifier with its patterns; m1 is a throw-away matcher used only for this check.
for pid, patterns in m.items():
    m1.add(pid, patterns) 


In [None]:
# Persist the matcher table (pattern identifier -> list of token patterns).
# A context manager guarantees the file is flushed and closed.
with open("../generated/esco_matchers.json", "w") as f:
    json.dump(m, f)

# Flatten the table into entity-ruler entries: one {"label", "pattern"} dict per
# token pattern, all sharing the ESCO entity label.
esco_p = [{"label": "ESCO", "pattern": pattern} for p in m.values() for pattern in p]

# Save the patterns to a json
with open("../generated/esco_patterns.json", "w") as f:
    json.dump(esco_p, f)

# Show the first 3 patterns (kept as the last expression so the notebook displays it).
list(m.items())[:3]

## Create an ESCO spacy entity recognizer model

This entity recognizer reuses the `en_core_web_trf` model, which is quite good at identifying PRODUCT entities.
We add a new entity label, ESCO, whose patterns are built from the ESCO preferred and alternative labels, to identify further entities.

The ESCO entity label is added to the pipeline after the NER component, so that the NER component can identify the entities that are already in the en_core_web_trf model, and the ESCO component can add the ESCO entities.

In [None]:
# This model is quite good at recognizing ICT entities like products.
import spacy
from spacy import displacy  # Load a viewer.
nlp_e = spacy.load("en_core_web_trf")
# Add a rule-based entity ruler right after the statistical NER component
# and feed it the ESCO patterns built above.
ruler = nlp_e.add_pipe("entity_ruler", after="ner")
ruler.add_patterns(esco_p)
# Save the extended pipeline to disk so it can be reloaded from this path later.
nlp_e.to_disk("../generated/en_core_web_trf_esco_ner")


In [None]:
import sys
from hashlib import sha256
# Test fixtures directory, resolved relative to the parent of sys.path[0].
# NOTE(review): this assumes sys.path[0] is the notebook directory -- confirm for your kernel.
DATADIR = Path(sys.path[0]).parent / "tests" / "data"
# Decode explicitly as UTF-8 (as done for curricula.yaml) instead of relying on
# the platform default encoding.
text = (DATADIR / "rpolli.txt").read_text(encoding="utf-8")


def get_stats(doc):
    """Summarise the entities found in a processed document.

    Args:
        doc: an object exposing ``text`` (str) and ``ents`` (an iterable of entities
            with ``text`` and ``label_`` attributes), e.g. a spaCy ``Doc``.

    Returns:
        A JSON-serialisable dict with:
        - doc_id: SHA-256 hex digest of the document text (UTF-8 encoded);
        - ent_count: number of entities, duplicates included;
        - ent_unique / ent_unique_count: distinct (text, label) pairs;
        - ent_text_unique / ent_text_unique_count: distinct entity texts;
        - ent_skills: distinct (text, label) pairs labelled ESCO or PRODUCT;
        - ent_skills_text_unique: number of distinct texts among ``ent_skills``.

        The list values are sorted so that the output is deterministic; plain
        set iteration order changes between interpreter runs.
    """
    doc_id = sha256(doc.text.encode("utf-8")).hexdigest()
    all_entities = [(ent.text, ent.label_) for ent in doc.ents]
    ent_unique = set(all_entities)
    ent_text_unique = {ent_text for ent_text, _ in ent_unique}
    # Only these two labels are counted as skills.
    ent_skills = {
        (ent_text, label) for ent_text, label in ent_unique if label in ("ESCO", "PRODUCT")
    }
    return {
        "doc_id": doc_id,
        "ent_count": len(all_entities),
        "ent_unique": sorted(ent_unique),
        "ent_unique_count": len(ent_unique),
        "ent_text_unique": sorted(ent_text_unique),
        "ent_text_unique_count": len(ent_text_unique),
        "ent_skills": sorted(ent_skills),
        "ent_skills_text_unique": len({ent_text for ent_text, _ in ent_skills}),
    }

# Run the ESCO-enabled pipeline on the sample text.
doc = nlp_e(text)
# Show some stats
get_stats(doc)


In [None]:
# Visualize the result.
# style="ent" selects displaCy's entity view; jupyter=True requests inline notebook rendering.
displacy.render(doc, style="ent", jupyter=True)

# Model testing

The recognizer is tested by processing a set of CVs and collecting the entities found in each CV.


In [None]:
# Same fixtures directory as used for rpolli.txt elsewhere in this notebook.
# `Path(".").parent` is still ".", so the previous expression pointed at ./tests/data
# instead of the tests/data directory next to the notebook folder.
DATADIR = Path(sys.path[0]).parent / "tests" / "data"
# safe_load only builds plain Python objects from the YAML file.
cvs = yaml.safe_load((DATADIR / 'curricula.yaml').read_text("utf-8"))


In [None]:
# Prepare the data for the test.


def model_factory(model_name, patterns=None, config=None):
    """Load a spaCy pipeline and optionally extend it with an entity ruler.

    Args:
        model_name: name or path passed to ``spacy.load``.
        patterns: entity-ruler patterns to add; when empty or None the pipeline
            is returned unchanged.
        config: keyword arguments forwarded to ``add_pipe`` for the entity ruler,
            e.g. ``{"before": "ner"}`` or ``{"after": "ner"}``. Ignored when no
            patterns are given.

    Returns:
        The loaded (and possibly extended) pipeline.
    """
    model = spacy.load(model_name)
    if patterns:
        ruler = model.add_pipe("entity_ruler", **(config or {}))
        # Use the patterns supplied by the caller rather than a notebook-level global.
        ruler.add_patterns(patterns)
    return model


# Pipelines under comparison: a medium baseline, the plain transformer model, and
# the transformer model with the ESCO ruler placed before / after the NER component.
testcases = dict(
    base=model_factory("en_core_web_md"),
    trf=model_factory("en_core_web_trf"),
    trf_pre=model_factory("en_core_web_trf", esco_p, config=dict(before="ner")),
    trf_post=model_factory("en_core_web_trf", esco_p, config=dict(after="ner")),
)


In [None]:
# Run the testcases and save each result asap, since the test can take a long time.
# The output is a JSON array: records are separated by ",\n" with no trailing comma,
# and the file is flushed after every record so finished results reach the disk
# even if a later model fails.
results_out = Path("results.out")
cv_batch = [(cv["text"], cv) for cv in cvs]
with results_out.open("wb") as fh:
    fh.write(b"[")
    first_record = True
    for model_name, nlp_model in testcases.items():
        # Dedicated loop names keep the notebook-level `doc` from being overwritten.
        for cv_doc, _cv in nlp_model.pipe(cv_batch, as_tuples=True):
            stats = get_stats(cv_doc)
            stats["model"] = model_name
            if not first_record:
                fh.write(b",\n")
            fh.write(json.dumps(stats).encode())
            fh.flush()
            first_record = False
    fh.write(b"]")

In [None]:
# The results file is parsed as YAML rather than JSON: it may end with a trailing
# comma before the closing bracket, which YAML flow sequences accept.
results = Path("results.out").read_text(encoding="utf-8")
data = yaml.safe_load(results)
df = pd.DataFrame(data)
# Replace the model name with an integer id in order of first appearance
# (0 for the first model run, 1 for the second, ...). This is derived from the
# recorded model name, so it stays correct for any number of CVs per model.
df["model"] = pd.factorize(df["model"])[0]
# aggregate the dataframe above by doc_id, using the couple (model, ent_.._count) as columns
# and the doc_id as the index
results = df.groupby(["doc_id", "model"]).agg(list).unstack()


# Test saved model

In [None]:
import spacy
import sys
import re

# Test fixtures directory, resolved relative to the parent of sys.path[0].
# NOTE(review): assumes sys.path[0] is the notebook directory -- confirm for your kernel.
DATADIR = Path(sys.path[0]).parent / "tests" / "data"

# Reload the pipeline from the directory it was saved to earlier in this notebook.
nlp_esco = spacy.load("../generated/en_core_web_trf_esco_ner/")

In [None]:
# Decode explicitly as UTF-8 (as done for curricula.yaml) instead of relying on
# the platform default encoding.
text_raw = (DATADIR / "rpolli.txt").read_text(encoding="utf-8")
# Collapse runs of newlines into a single newline before feeding the model.
text = re.sub('\n+','\n',text_raw)


In [None]:

# Process the normalised text with the reloaded pipeline and render its entities inline.
doc = nlp_esco(text)
displacy.render(doc, style="ent", jupyter=True)