# Skill extraction pipeline

## Creating labeled training data (CoNLL/BIO Format)

In [2]:
import warnings
warnings.filterwarnings("ignore")

import subprocess
import sys

import spacy
from spacy.matcher import PhraseMatcher

# Function to check if a spaCy model is installed
def is_model_installed(model_name):
    """Return True if the spaCy pipeline ``model_name`` is available.

    An installed pipeline package is detected from package metadata, so the
    pipeline does not have to be loaded into memory just to answer the
    question. Names that are not installed packages (for example a path to a
    model directory) fall back to an actual load attempt.

    Args:
        model_name: Name (or path) that would be passed to ``spacy.load``.

    Returns:
        True if the model is an installed package or loads successfully,
        False if ``spacy.load`` raises ``OSError`` for it.
    """
    # Cheap check first: avoids building the full pipeline only to discard it.
    if spacy.util.is_package(model_name):
        return True

    # Not a pip-installed package; keep the original load-based probe so
    # anything else spacy.load accepts is still reported as available.
    try:
        spacy.load(model_name)
    except OSError:
        return False
    return True

# Ensure spaCy language models are installed and loaded correctly.
# Only the keys of this mapping are iterated below; keys and values are
# currently identical.
models = {
    "de_core_news_sm": "de_core_news_sm",
    "en_core_web_sm": "en_core_web_sm",
}

for model_name in models:
    if not is_model_installed(model_name):
        # Run the download with the interpreter executing this code rather
        # than whatever "python" resolves to on PATH, so the model is
        # installed into the active environment. check=True surfaces a failed
        # download immediately instead of a later, less obvious load error.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", model_name],
            check=True,
        )

spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

# German pipeline; used below to tokenize the skill phrases and, in
# label_skills, the input text.
nlp = spacy.load("de_core_news_sm")

# Phrases labelled as technical skills ("TECH")
technical_skills = [
    "Python",
    "Java",
    "Machine Learning",
    "Cloud Computing",
    "SQL",
    "Datenanalyse",
]

# Phrases labelled as transversal skills ("TRANS")
transversal_skills = [
    "Teamfähigkeit",
    "Kommunikationsstärke",
    "Selbstorganisation",
    "Problemlösungskompetenz",
    "Kritisches Denken",
    "Zeitmanagement",
    "Belastbarkeit",
]

# Phrase matcher comparing on the LOWER token attribute
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Tokenize every skill phrase into a Doc so it can serve as a match pattern
tech_patterns = list(map(nlp.make_doc, technical_skills))
trans_patterns = list(map(nlp.make_doc, transversal_skills))

# Register the patterns under the keys that later become the BIO label types
matcher.add("TECH", tech_patterns)
matcher.add("TRANS", trans_patterns)

# BIO labeling function
def label_skills(text):
    """Tokenize ``text`` and tag every token with a BIO skill label.

    The text is processed with the module-level ``nlp`` pipeline and searched
    with the module-level ``matcher``. The first token of each matched phrase
    is labelled ``B-<KEY>`` and the remaining tokens ``I-<KEY>``, where
    ``<KEY>`` is the name the pattern was registered under on the matcher;
    every other token is labelled ``O``.

    When matches overlap or are nested, only a non-overlapping subset is
    labelled (longest span first), so one match can no longer overwrite part
    of another and leave an inconsistent B-/I- sequence.

    Whitespace-only tokens produced by the tokenizer are kept in the result
    and labelled ``O``.

    Args:
        text: Raw text to label.

    Returns:
        A list of ``(token_text, label)`` tuples, one per token in the
        processed document, in document order.
    """
    # Imported locally so the function is self-contained.
    from spacy.tokens import Span
    from spacy.util import filter_spans

    doc = nlp(text)
    labels = ["O"] * len(doc)

    # Wrap each raw (match_id, start, end) hit in a labelled Span so that
    # overlaps can be resolved before any label is written.
    candidates = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matcher(doc)
    ]

    for span in filter_spans(candidates):
        label_type = span.label_  # matcher key, e.g. "TECH" or "TRANS"
        labels[span.start] = "B-" + label_type
        for i in range(span.start + 1, span.end):
            labels[i] = "I-" + label_type

    return [(token.text, label) for token, label in zip(doc, labels)]

# Sample input text. The line break and the indentation of the second line
# are part of the string, which is why a whitespace token shows up in the
# printed result.
text = (
    "Sie verfügen über Kenntnisse in Python und Teamfähigkeit sowie \n"
    "          Erfahrung in Cloud Computing und Selbstorganisation."
)

# Run the BIO labeller on the sample
labeled_output = label_skills(text)

# Emit one "token<TAB>label" row per token
for token, label in labeled_output:
    print(token, label, sep="\t")

Sie	O
verfügen	O
über	O
Kenntnisse	O
in	O
Python	B-TECH
und	O
Teamfähigkeit	B-TRANS
sowie	O

          	O
Erfahrung	O
in	O
Cloud	B-TECH
Computing	I-TECH
und	O
Selbstorganisation	B-TRANS
.	O
