In [1]:
import spacy
import faiss
import pandas as pd
import numpy as np
from skillNer.skill_extractor_class import SkillExtractor
from skillNer.general_params import SKILL_DB
from spacy.matcher import PhraseMatcher
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Location of the job-postings CSV (a relative path, resolved against the
# notebook's working directory).
DATA_PATH = "dummy_jobs.csv"

# Load the postings, report the (rows, columns) shape, then preview the first rows.
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

(100, 6)


Unnamed: 0,job_title,career_level,experience_level,education_level,job_description,skills
0,Backend Developer,Mid,1-3 years,High School,Collaborates with stakeholders to define produ...,"Java, Spring Boot, Microservices"
1,Cloud Architect,Lead,3-5 years,Bachelor,Builds machine learning models and deploys the...,"TensorFlow, Deep Learning, NLP"
2,Frontend Developer,Senior,5+ years,PhD,Collaborates with stakeholders to define produ...,"JavaScript, React, CSS"
3,Frontend Developer,Mid,3-5 years,Master,Responsible for analyzing large datasets and g...,"Python, SQL, Machine Learning"
4,Data Analyst,Senior,3-5 years,Master,Responsible for analyzing large datasets and g...,"Git, CI/CD, Linux"


## Skill extraction

In [3]:
# Name of the spaCy pipeline handed to the SkillNer extractor.
SPACY_MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

# Extractor built from the spaCy pipeline, SkillNer's SKILL_DB and spaCy's
# PhraseMatcher class; extract_skills() calls its annotate() method.
skill_extractor = SkillExtractor(
    nlp,
    SKILL_DB,
    PhraseMatcher,
)

def extract_skills(text):
    """Return the skill mentions that the SkillNer extractor finds in ``text``.

    Parameters
    ----------
    text : str
        Free text to annotate (e.g. a job description).

    Returns
    -------
    list
        The ``doc_node_value`` of every entry under ``results['full_matches']``
        followed by every entry under ``results['ngram_scored']``, in the order
        the extractor reported them. An empty list is returned when ``text`` is
        not a non-blank string, or when the annotation is not a dict containing
        a ``'results'`` key.
    """
    # Missing cells in a pandas column arrive as NaN/None rather than str;
    # there is nothing to annotate in those (or in blank text), so skip the
    # extractor instead of handing it a non-string value.
    if not isinstance(text, str) or not text.strip():
        return []

    annotation_result = skill_extractor.annotate(text)
    if not isinstance(annotation_result, dict) or 'results' not in annotation_result:
        return []

    results = annotation_result['results']
    # Either match group may be absent; collect from whichever are present,
    # keeping full matches ahead of n-gram matches.
    return [
        match['doc_node_value']
        for group in ('full_matches', 'ngram_scored')
        if group in results
        for match in results[group]
    ]

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


## Category & text preprocessing

In [4]:
# Combine title and description into the single text that is embedded per role.
# A missing title or description would otherwise turn the whole concatenation
# into NaN (a float), so missing parts are treated as empty strings.
df["role_text"] = df["job_title"].fillna("") + " - " + df["job_description"].fillna("")

## Embedding generation

In [5]:
# Name of the sentence-transformers model used by embed_texts().
EMBED_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Instantiate the encoder once at cell level so every embed_texts() call reuses it.
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

def embed_texts(texts, batch_size=64):
    """Encode ``texts`` with the notebook-level ``embed_model``.

    Parameters
    ----------
    texts
        The texts to encode; passed through unchanged as the first positional
        argument of ``embed_model.encode``.
    batch_size : int, default 64
        Forwarded to ``embed_model.encode`` as ``batch_size``.

    Returns
    -------
    Whatever ``embed_model.encode`` returns when called with
    ``convert_to_numpy=True`` and the progress bar enabled.
    """
    encode_options = {
        "show_progress_bar": True,
        "batch_size": batch_size,
        "convert_to_numpy": True,
    }
    return embed_model.encode(texts, **encode_options)

# Embed the combined "title - description" text of every posting.
role_texts = df["role_text"].tolist()
role_embeddings = embed_texts(role_texts)
print("role_embeddings", role_embeddings.shape)

# Embed each posting's raw comma-separated skills string as one text.
skills_texts = df["skills"].tolist()
skill_embeddings = embed_texts(skills_texts)
print("skill_embeddings", skill_embeddings.shape)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

role_embeddings (100, 384)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

skill_embeddings (100, 384)


## Indexing for retrieval

In [6]:
# Exact (brute-force) L2 index over the role embeddings; the index dimension is
# taken from the embedding matrix's second axis.
d = role_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(role_embeddings)
print("index ntotal:", index.ntotal)

# Persist the index and both embedding matrices for reuse.
# np.save appends ".npy" to any filename that does not already end in it, so
# the previous ".npv" names were written as "*.npv.npy"; use the real extension
# so the files on disk match the names given here.
faiss.write_index(index, "faiss_role_index.idx")
np.save("role_embeddings.npy", role_embeddings)
np.save("skill_embeddings.npy", skill_embeddings)

index ntotal: 100


## Skill importance signal

In [7]:
# Token pattern: any run of word characters, so one-character tokens are kept.
# NOTE(review): this tokenises on word boundaries, not on commas, so a
# multi-word skill such as "Spring Boot" contributes two separate terms and
# "CI/CD" becomes "ci" and "cd" — confirm that is the intended granularity.
SKILL_TOKEN_PATTERN = r"(?u)\b\w+\b"

# TF-IDF weights over the lower-cased tokens of each posting's skills string.
tfidf = TfidfVectorizer(
    lowercase=True,
    token_pattern=SKILL_TOKEN_PATTERN,
)
tfidf_matrix = tfidf.fit_transform(df["skills"])
print("tfidf shape", tfidf_matrix.shape)

tfidf shape (100, 26)


## Encode categorical features

In [8]:
# Categorical columns to encode and the names of their encoded counterparts.
cat_cols = ["career_level", "experience_level", "education_level"]
enc_cols = [c + "_enc" for c in cat_cols]

# Fit ONE encoder on all categorical columns at once. OrdinalEncoder learns the
# categories of each column independently, so the codes are the same as fitting
# column by column, but the fitted `encoder` now remembers every column's
# categories (refitting the same object per column kept only the last one).
# Categories unseen at fit time are encoded as -1 on later transform() calls.
# NOTE(review): codes follow the sorted category labels (e.g. "Lead" < "Mid" <
# "Senior", "Bachelor" < "High School"), not seniority order; pass explicit
# `categories=` if the numeric order is meant to be meaningful.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
enc = encoder.fit_transform(df[cat_cols])
enc_df = pd.DataFrame(enc, columns=enc_cols)

enc_df.head()

Unnamed: 0,career_level_enc,experience_level_enc,education_level_enc
0,2.0,1.0,2.0
1,1.0,2.0,0.0
2,3.0,3.0,4.0
3,2.0,2.0,3.0
4,3.0,2.0,3.0
