# Identify ESCO skills in CVs using neural search

This notebook uses neural search to identify ESCO skills where `esco:skillType=skill`.
Unlike skills with `esco:skillType=knowledge`, these skills are expressed in natural language using multiple words and do not usually contain acronyms.

Acronyms can be a problem for some neural search models, as they can be confused with other words.



In [2]:
import esco
import pandas as pd
import yaml
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Load the ESCO skills through the esco package (source="json").
# Later cells read the `text`, `label` and `skillType` fields of this object.
skills = esco.load_skills(source="json")

In [19]:
# Clean up data files. For example, use an LLM to add structure to a text CV.

# Raw CV text. Decode explicitly as UTF-8 so the result does not depend on the
# platform's default encoding (the CV contains non-ASCII characters).
text = Path("rpolli.txt").read_text(encoding="utf-8")
# ... ask openai to Split the following text in consistent blocks. Replace non-ascii characters. Return a json list...
# The structured CV produced by that step is stored in rpolli.json; it is parsed
# here with the YAML loader.
text_js = yaml.safe_load(Path("rpolli.json").read_text(encoding="utf-8"))

In [20]:
def tokenize(text_js):
    """Yield one text chunk per CV entry from a structured CV.

    Args:
        text_js: iterable of mappings (one per CV section). Each section's
            optional "content" value is turned into text chunks.

    Yields:
        str: for a section whose content is a list/tuple starting with a dict,
        the ``str()`` of every item in that sequence; for any other non-empty
        content, the ``str()`` of the whole content. Sections with missing or
        empty content are skipped.
    """
    for section in text_js:
        content = section.get("content")
        if not content:
            continue
        # Only probe content[0] on real sequences: a bare mapping (or any other
        # non-indexable value) would otherwise raise instead of being emitted
        # as a single chunk.
        if isinstance(content, (list, tuple)) and isinstance(content[0], dict):
            yield from (str(entry) for entry in content)
            continue
        yield str(content)

import tiktoken

# Flatten the structured CV into the list of text chunks used by the later cells.
cv_tokens = list(tokenize(text_js))

# Token statistics: (token count, chunk) pairs, longest chunk first.
# NOTE(review): the counts come from tiktoken's "gpt-3.5-turbo" encoding, which is
# not the tokenizer of the SentenceTransformer model loaded later, so they are
# presumably only a rough proxy for that model's sequence length - confirm.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
sorted(
    [(len(enc.encode(chunk)), chunk) for chunk in cv_tokens],
    reverse=True,
)

[(102,
  "{'years': '2005–2007', 'position': 'C developer and system integrator', 'company': 'Babel srl, Rome area', 'details': 'Integrating different communication infrastructures (mail,voip,sms,chat) enhancing various opensource software; from architectural design to software implementation. Maintainer of Caldav4j Libraries. Customization of opensource software (courier-imap, postﬁx, openssl, rrdjtool ..).'}"),
 (97,
  '{\'degree\': \'Laurea (Master Degree)\', \'years\': \'1996-2002\', \'institution\': "University of Rome \'La Sapienza\'", \'grades\': \'110/110\', \'major\': \'Mathematics\', \'skills\': \'Geometry, Algebra, Analysis, Physics (Mechanics, Electromagnetism)\', \'thesis\': \'Arithmetic-Geometric Mean and Algebraic Curves\', \'advisor\': \'Kieran O’Grady\'}'),
 (96,
  "{'years': '2012–2022', 'role': 'EuroPython Speaker/Trainer', 'details': 'I started loving python in 2010 and contributed to various python project, including openstack-shade, openshift-on-openstack template

In [None]:
# Load the sentence-embedding model. No device is forced: when `device` is left
# unset, sentence-transformers uses a GPU if one is available and the CPU
# otherwise, so the notebook also runs on machines without CUDA.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
print("Max Sequence Length:", model.max_seq_length)

# Embed sentences in skills['embeddings'].
# The texts are passed as a plain list and the result is requested as NumPy, then
# split into one vector per row: the similarity cell iterates over
# skills.embeddings row by row and hands each vector to scikit-learn, which needs
# host-side arrays rather than a (possibly GPU-resident) tensor.
skills['embeddings'] = list(
    model.encode(skills['text'].tolist(), show_progress_bar=True, convert_to_numpy=True)
)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Minimum cosine similarity for a (CV section, skill) pair to be kept.
query_threshold = 0.7

# Compute embeddings for the skills and for the CV sections as numpy arrays,
# so that they can be handed directly to scikit-learn.
embeddings = model.encode(skills.text.values, show_progress_bar=True, convert_to_numpy=True)
cv_embeddings = model.encode(cv_tokens, convert_to_numpy=True)

# Score every CV section against every skill in a single call
# (rows: CV sections, columns: skills) instead of one call per pair.
# With no CV sections or no skills there is nothing to score.
if len(cv_embeddings) and len(embeddings):
    similarities = cosine_similarity(cv_embeddings, embeddings)
else:
    similarities = []

# For each CV section, keep every non-knowledge skill whose similarity reaches the threshold.
result = []
for cv_section, section_scores in zip(cv_tokens, similarities):
    for skill, skillType, sim in zip(skills.label, skills.skillType, section_scores):
        if skillType.endswith("knowledge"):
            continue
        if sim < query_threshold:
            continue
        result.append({"text": cv_section, "skill": skill, "score": float(sim)})

# Save the results. The context manager closes (and flushes) the file even on error.
with open("result.yaml", "wt") as result_file:
    yaml.dump(result, result_file)

In [11]:
# Reload the matches from result.yaml (the file written by the similarity cell),
# so the analysis below can start from the saved results.
result = yaml.safe_load(Path("result.yaml").read_text())

In [14]:
from collections import Counter

# Count how many saved matches refer to each skill label.
ret = Counter(match["skill"] for match in result)

In [18]:
# Display the 40 most frequently matched skills as (skill label, match count) pairs.
ret.most_common(40)

[('manage digital documents', 9),
 ('use spreadsheets software', 7),
 ('manage standard enterprise resource planning system', 7),
 ('manage ICT virtualisation environments', 6),
 ('provide ICT support', 6),
 ('use presentation software', 5),
 ('use online tools to collaborate', 5),
 ('use personal organization software', 5),
 ('use databases', 4),
 ('deploy cloud resource', 3),
 ('develop with cloud services', 3),
 ('deploy ICT systems', 3),
 ('manage digital archives', 3),
 ('collaborate through digital technologies', 3),
 ('use markup languages', 3),
 ('use object-oriented programming', 3),
 ('use scripting programming', 3),
 ('use access control software', 3),
 ('evaluate information services using metrics', 3),
 ('implement data warehousing techniques', 3),
 ('manage ICT semantic integration', 3),
 ('plan migration to cloud', 2),
 ('monitor ICT research', 2),
 ('analyse pipeline database information', 2),
 ('define database physical structure', 2),
 ('use logic programming', 2),
 (