In [None]:
import spacy
from collections import Counter
import esco


# Load the spaCy pipeline "en_core_web_trf"; `nlp` is the parser used by the
# cells below for both the skill labels and the texts to be matched.
nlp = spacy.load("en_core_web_trf")
# Instantiate the ESCO local database; `db.skills` is queried in the next cell.
# NOTE(review): presumably a table of ESCO skills exposing `label` and
# `allLabel` columns — confirm against the `esco` package.
db = esco.LocalDB()

In [None]:
# Lowercase labels of the skills selected for this experiment.
skills_labels = [
    "collaborate with engineers",
    "deploy cloud resource",
    "design cloud architecture",
    "design cloud networks",
    "plan migration to cloud",
    "automate cloud tasks",
    "coordinate engineering teams",
    "design database in the cloud",
    "design for organisational complexity",
    "develop with cloud services",
    "do cloud refactoring",
]
# Keep only the rows whose lowercased label is one of the selected labels.
skills = db.skills[
    db.skills.label.str.lower().isin(skills_labels)
]
# Flatten the per-skill `allLabel` collections into one flat list of labels.
labels = [name for skill_labels in skills.allLabel for name in skill_labels]

In [None]:
def get_verb_obj_compound_from_label(label, nlp):
    """
    Parse a skill label and extract the tokens used to build a matcher pattern.

    The label is parsed as-is; when no verbal root is found it is parsed a
    second time as ``"to <label>"`` so that the parser gets another chance to
    read the first word as a verb.

    Args:
        label: The skill label text.
        nlp: Callable turning a string into a parsed spaCy ``Doc``.

    Returns:
        A ``(root, dobj, compounds)`` tuple where

        * ``root`` is the token returned by ``find_root`` (ROOT tagged VERB);
        * ``dobj`` is the token returned by ``find_dobj(root)``, or ``None``;
        * ``compounds`` is the list of tokens whose dependency is
          ``"compound"``. When an object was found, only the compounds that
          modify that object (directly or through a chain of compounds) are
          kept, so a compound belonging to another noun (e.g. "cloud" in
          "write code with cloud services") is not attached to the object.
          When no object was found, every compound of the doc is returned.

        ``(None, None, [])`` is returned when neither parse has a verbal root.
    """
    doc = nlp(label)
    root = find_root(doc)
    if root is None:
        doc = nlp(f"to {label}")
        root = find_root(doc)
    if root is None:
        return None, None, []

    dobj = find_dobj(root)

    compounds = [token for token in doc if token.dep_ == "compound"]
    if dobj is not None:
        # Drop compounds that belong to a different noun than the object.
        compounds = [
            token for token in compounds if _compound_modifies(token, dobj)
        ]

    return root, dobj, compounds


def _compound_modifies(token, target):
    """Return True when following compound heads from ``token`` reaches ``target``."""
    current = token
    # Stop on a token that is its own head so the walk always terminates.
    while current.dep_ == "compound" and current.head.i != current.i:
        current = current.head
        if current.i == target.i:
            return True
    return False

def find_root(doc):
    """
    Return the first token of ``doc`` that is the dependency ROOT and is
    tagged as a VERB, or ``None`` when there is no such token.
    """
    verbal_roots = (
        tok for tok in doc if tok.dep_ == "ROOT" and tok.pos_ == "VERB"
    )
    return next(verbal_roots, None)

def find_dobj(token):
    """
    Find the object governed by ``token``.

    The children of ``token`` are scanned in order. A child whose dependency
    is ``"dobj"`` or ``"pobj"`` is returned immediately; a child whose
    dependency is ``"prep"`` or ``"adp"`` is searched recursively.

    Unlike a plain ``return find_dobj(child)``, a prepositional child that
    yields no object does not end the search: the remaining children are
    still inspected.

    Args:
        token: A token-like object exposing ``children`` and ``dep_``.

    Returns:
        The first object token found, or ``None`` when there is none.
    """
    for child in token.children:
        if child.dep_ in ("dobj", "pobj"):
            return child
        # NOTE(review): "adp" is kept from the original code; it is normally
        # a part-of-speech tag rather than a dependency label — confirm.
        if child.dep_ in ("prep", "adp"):
            nested = find_dobj(child)
            if nested is not None:
                return nested
    return None

In [None]:
def generate_pattern(label):
    """
    Build a spaCy ``Matcher`` token pattern for a skill label.

    The pattern is made of, in this order: the root verb (matched by lemma),
    the compound tokens (matched by lowercase text) and the object (matched
    by lemma), as returned by ``get_verb_obj_compound_from_label``.

    When none of these could be extracted, the label itself is used as a
    literal pattern with one ``LOWER`` entry per token. A Matcher pattern
    describes one token per dict, so a single entry holding the whole
    multi-word label would never match.

    Args:
        label: The skill label text.

    Returns:
        A list of token-attribute dicts suitable for ``Matcher.add``.
    """
    root, obj, compounds = get_verb_obj_compound_from_label(label, nlp)
    pattern = []

    # Add the root verb when one was found.
    if root is not None:
        pattern.append({"LEMMA": root.lemma_, "POS": "VERB"})

    # Add the compounds to the pattern.
    pattern.extend(
        {"LOWER": compound.text.lower(), "DEP": "compound"}
        for compound in compounds
    )

    # Add the object when one was found.
    if obj is not None:
        pattern.append({"LEMMA": obj.lemma_, "POS": "NOUN"})

    # If no pattern was generated, fall back to the original label, split
    # with the pipeline's own tokenizer so it lines up with matched docs.
    if not pattern:
        pattern = [
            {"LOWER": token.text.lower()} for token in nlp.make_doc(label)
        ]
        if not pattern:
            # Label without any token (e.g. empty string): keep the original
            # single-entry fallback rather than returning an empty pattern.
            pattern = [{"LOWER": label.lower()}]

    return pattern

In [None]:
import spacy
from spacy.matcher import Matcher

# Create a matcher bound to the pipeline vocabulary.
matcher = Matcher(nlp.vocab)
for label in labels:
    print(label)

# Generate and print the pattern of every label, registering each one in the
# matcher under the label itself; `c` counts the processed labels.
c = 0
for c, label in enumerate(labels, start=1):
    print(f"Label: {label}")
    pattern = generate_pattern(label)
    print(f"Pattern: {pattern}")
    print()
    # Add the pattern to the matcher.
    matcher.add(label, [pattern])
print(c)
# Example of how the matcher is used
def find_matches(text):
    """
    Run the module-level ``matcher`` over ``text``.

    Every match is printed together with the key of the pattern that
    produced it.

    Args:
        text: The text to analyse.

    Returns:
        The set of matched span texts (duplicates collapse into one entry).
    """
    doc = nlp(text)
    found = set()
    for match_id, start, end in matcher(doc):
        # Resolve the numeric match id back to the key used in `matcher.add`.
        string_id = nlp.vocab.strings[match_id]
        span = doc[start:end]
        print(f"Matched '{string_id}': {span.text}")
        found.add(span.text)
    return found

# Matcher smoke test.
# Longer sample text, currently disabled.
# NOTE(review): the expected matches listed in the evaluation cell appear to
# come from this text rather than from `test_text` — confirm.
#test_text_1 = "I am an experienced IT professional with a strong background in cloud computing and software development. I specialize in planning and executing cloud migrations, designing cloud architectures, and automating cloud tasks. My skills include creating cloud network, managing cloud resource, and developing cloud applications. I am also adept at designing cloud environments for large organizations, ensuring they are scalable and efficient."

# Three phrasings that mention cloud architectures: verb + object, a nominal
# form ("the design of ...") and a sentence with a different verb.
test_text = "I design cloud architectures, I contributed to the design of cloud architectures,I worked on multi-cluster cloud architectures,"
print("Testing matcher with:", test_text)
find_matches(test_text)


Results obtained through the matcher implementation. See matcher_skill_fixbugs.ipynb.

```
Label: plan migration to cloud
Pattern: [{'LEMMA': 'plan', 'POS': 'VERB'}, {'LEMMA': 'migration', 'POS': 'NOUN'}]

Label: cloud migration planning
Pattern: [{'LOWER': 'cloud migration planning'}]

Label: plan refactoring
Pattern: [{'LEMMA': 'plan', 'POS': 'VERB'}, {'LEMMA': 'refactoring', 'POS': 'NOUN'}]

Label: design cloud architecture
Pattern: [{'LEMMA': 'design', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'architecture', 'POS': 'NOUN'}]

Label: create cloud architecture
Pattern: [{'LEMMA': 'create', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'architecture', 'POS': 'NOUN'}]

Label: engineer cloud architecture
Pattern: [{'LEMMA': 'engineer', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'architecture', 'POS': 'NOUN'}]

Label: design multi-tier cloud architecture
Pattern: [{'LEMMA': 'design', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'architecture', 'POS': 'NOUN'}]

Label: refactoring
Pattern: [{'LOWER': 'refactoring'}]

Label: do cloud refactoring
Pattern: [{'LEMMA': 'do', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}]

Label: implement cloud network
Pattern: [{'LEMMA': 'implement', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'network', 'POS': 'NOUN'}]

Label: design cloud networks
Pattern: [{'LEMMA': 'design', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'network', 'POS': 'NOUN'}]

Label: create cloud network
Pattern: [{'LEMMA': 'create', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'network', 'POS': 'NOUN'}]

Label: deployment and provisioning
Pattern: [{'LOWER': 'deployment and provisioning'}]

Label: deploy cloud resource
Pattern: [{'LEMMA': 'deploy', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'resource', 'POS': 'NOUN'}]

Label: cloud deployment
Pattern: [{'LOWER': 'cloud deployment'}]

Label: provision cloud resources
Pattern: [{'LEMMA': 'provision', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'resource', 'POS': 'NOUN'}]

Label: develop cloud applications
Pattern: [{'LEMMA': 'develop', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'application', 'POS': 'NOUN'}]

Label: write code with cloud services
Pattern: [{'LEMMA': 'write', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'code', 'POS': 'NOUN'}]

Label: code with cloud services
Pattern: [{'LEMMA': 'code', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'service', 'POS': 'NOUN'}]

Label: develop with cloud services
Pattern: [{'LEMMA': 'develop', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'service', 'POS': 'NOUN'}]

Label: design cloud data architecture
Pattern: [{'LEMMA': 'design', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LOWER': 'data', 'DEP': 'compound'}, {'LEMMA': 'architecture', 'POS': 'NOUN'}]

Label: design database in the cloud
Pattern: [{'LEMMA': 'design', 'POS': 'VERB'}, {'LEMMA': 'database', 'POS': 'NOUN'}]

Label: develop cloud database design
Pattern: [{'LEMMA': 'develop', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LOWER': 'database', 'DEP': 'compound'}, {'LEMMA': 'design', 'POS': 'NOUN'}]

Label: automate cloud tasks
Pattern: [{'LEMMA': 'automate', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'task', 'POS': 'NOUN'}]

Label: automation of cloud tasks
Pattern: [{'LOWER': 'automation of cloud tasks'}]

Label: design cloud environment for complex organisations
Pattern: [{'LEMMA': 'design', 'POS': 'VERB'}, {'LOWER': 'cloud', 'DEP': 'compound'}, {'LEMMA': 'environment', 'POS': 'NOUN'}]

Label: design for organisational complexity
Pattern: [{'LEMMA': 'design', 'POS': 'VERB'}, {'LEMMA': 'complexity', 'POS': 'NOUN'}]

27
Testing matcher with: I design cloud architectures, I contributed to the design of cloud architectures,I worked on multi-cluster cloud architectures,
Matched 'design cloud architecture': design cloud architectures
Matched 'design multi-tier cloud architecture': design cloud architectures
{'design cloud architectures'}
 
```


I am quite satisfied with the results produced. To quantify them, I implemented a routine that computes the statistics of the matcher in question: Precision, Recall and F1-score. The figures below refer to `test_text_1`.

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_matches(predicted_matches, true_matches):
    """
    Print precision, recall and F1-score of a set of predicted matches.

    Both arguments are turned into sets, so duplicates are ignored and a
    prediction counts as correct only when it is exactly equal to one of the
    expected strings.

    Args:
        predicted_matches: Iterable of matched strings produced by the matcher.
        true_matches: Iterable of the strings that should have been matched.

    Returns:
        None. The three scores are printed with two decimals; a score whose
        denominator would be zero is reported as 0.
    """
    predicted_set = set(predicted_matches)
    expected_set = set(true_matches)
    hits = len(predicted_set & expected_set)

    precision = hits / len(predicted_set) if predicted_set else 0
    recall = hits / len(expected_set) if expected_set else 0
    score_sum = precision + recall
    f1 = 2 * precision * recall / score_sum if score_sum > 0 else 0

    scores = (("Precision", precision), ("Recall", recall), ("F1-Score", f1))
    for name, value in scores:
        print(f"{name}: {value:.2f}")

# Evaluation text. The expected matches below are phrases of this text, so the
# matcher has to be run on it: evaluating the shorter `test_text` instead
# would compare its matches against expectations that belong to another text.
test_text_1 = (
    "I am an experienced IT professional with a strong background in cloud "
    "computing and software development. I specialize in planning and "
    "executing cloud migrations, designing cloud architectures, and "
    "automating cloud tasks. My skills include creating cloud network, "
    "managing cloud resource, and developing cloud applications. I am also "
    "adept at designing cloud environments for large organizations, ensuring "
    "they are scalable and efficient."
)

# Expected matches for `test_text_1`.
true_matches = [
    "planning cloud migrations",
    "designing cloud architectures",
    "automating cloud tasks",
    "creating cloud network",
    "managing cloud resource",
    "developing cloud applications",
    "designing cloud environments"
]

predicted_matches = find_matches(test_text_1)
# Evaluation
evaluate_matches(predicted_matches, true_matches)


```
Precision: 1.00
Recall: 0.57
F1-Score: 0.73
```

A precision of 1.00 indicates that all instances labeled as positive by the model are indeed correct. In other words, the model produced no false positives.
This is an excellent result regarding the precision of the model's predictions.

A recall of 0.57 indicates that the model identified 57% of true matches.
This value suggests that the model may have difficulty capturing all true positive instances, resulting in some false negative samples.

An F1-Score of 0.73 is generally considered good and indicates that the model has a decent performance in balancing precision and recall.
However, the value is less than 1.00, suggesting that although the model is perfect at classifying positive instances (precision), it is missing some real positive instances (recall).

Conclusion
Model Strength: The model has excellent precision (1.00), meaning that when it makes a positive prediction, it is always correct. This is especially useful in scenarios where the consequences of false positives are severe.
Model Weakness: The relatively low recall (0.57) indicates that the model is not identifying all true positive instances. This could be a problem when it is important to capture every positive instance (for example, in rare disease detection, which is not our case).
Balance: The F1-Score of 0.73 reflects a good balance between precision and recall, but there is still room for improvement, especially in improving recall without compromising precision.