In [1]:
import spacy
import faiss
import pandas as pd
import numpy as np
import joblib
import json
from skillNer.skill_extractor_class import SkillExtractor
from skillNer.general_params import SKILL_DB
from spacy.matcher import PhraseMatcher
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from math import isnan

In [2]:
DATA_PATH = "dummy_jobs.csv"

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

(100, 6)


Unnamed: 0,job_title,career_level,experience_level,education_level,job_description,skills
0,Backend Developer,Mid,1-3 years,High School,Collaborates with stakeholders to define produ...,"Java, Spring Boot, Microservices"
1,Cloud Architect,Lead,3-5 years,Bachelor,Builds machine learning models and deploys the...,"TensorFlow, Deep Learning, NLP"
2,Frontend Developer,Senior,5+ years,PhD,Collaborates with stakeholders to define produ...,"JavaScript, React, CSS"
3,Frontend Developer,Mid,3-5 years,Master,Responsible for analyzing large datasets and g...,"Python, SQL, Machine Learning"
4,Data Analyst,Senior,3-5 years,Master,Responsible for analyzing large datasets and g...,"Git, CI/CD, Linux"


In [3]:
df["skills_list"] = df["skills"].apply(lambda x: [s.strip() for s in x.split(",")])

## Skill extraction

In [4]:
nlp = spacy.load("en_core_web_lg")
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

def extract_skills(text):
  annotation_result = skill_extractor.annotate(text)
  if not isinstance(annotation_result, dict) or 'results' not in annotation_result:
    return []

  doc_node_values = []
  results = annotation_result['results']

  if 'full_matches' in results:
    for match in results['full_matches']:
      doc_node_values.append(match['doc_node_value'])

  if 'ngram_scored' in results:
    for match in results['ngram_scored']:
      doc_node_values.append(match['doc_node_value'])

  return doc_node_values

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


## Preprocessing kategori & teks

In [5]:
df["role_text"] = df["job_title"] + " - " + df["job_description"]

## Embedding generation

In [6]:
EMBED_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

def embed_texts(texts, batch_size=64):
    return embed_model.encode(texts, show_progress_bar=True, batch_size=batch_size, convert_to_numpy=True)

role_texts = df["role_text"].tolist()
role_embeddings = embed_texts(role_texts)
print("role_embeddings", role_embeddings.shape)

skills_texts = df["skills"].tolist()
skill_embeddings = embed_texts(skills_texts)
print("skill_embeddings", skill_embeddings.shape)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

role_embeddings (100, 384)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

skill_embeddings (100, 384)


## Indexing untuk retrieval

In [7]:
d = role_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(role_embeddings)
print("index ntotal:", index.ntotal)

faiss.write_index(index, "faiss_role_index.idx")
np.save("role_embeddings.npy", role_embeddings)
np.save("skill_embeddings.npy", skill_embeddings)

index ntotal: 100


## Skill importance signal

In [8]:
tfidf = TfidfVectorizer(lowercase=True, token_pattern=r"(?u)\b\w+\b")
tfidf_matrix = tfidf.fit_transform(df["skills"])
print("tfidf shape", tfidf_matrix.shape)

tfidf shape (100, 26)


## Encode categorical features

In [9]:
cat_cols = ["career_level", "experience_level", "education_level"]

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
enc_df = pd.DataFrame()
enc_cols = []
for c in cat_cols:
    enc = encoder.fit_transform(df[[c]])
    enc_df[c + "_enc"] = enc.ravel()
    enc_cols.append(c + "_enc")

enc_df.head()

Unnamed: 0,career_level_enc,experience_level_enc,education_level_enc
0,2.0,1.0,2.0
1,1.0,2.0,0.0
2,3.0,3.0,4.0
3,2.0,2.0,3.0
4,3.0,2.0,3.0


## Build final feature matrix

In [10]:
X_emb = np.hstack([role_embeddings, skill_embeddings])
print("X_emb shape:", X_emb.shape)

if len(enc_cols) > 0:
    cat_values = enc_df[enc_cols].values.astype(np.float32)
    X = np.hstack([X_emb, cat_values])
else:
    X = X_emb

print("Final feature X shape:", X.shape)

y = df["job_title"].fillna("unknown")
print("n_classes:", y.nunique())
print(y.value_counts().head())

X_emb shape: (100, 768)
Final feature X shape: (100, 771)
n_classes: 10
job_title
Frontend Developer           15
Business Analyst             14
Backend Developer            13
Data Analyst                 12
Machine Learning Engineer    11
Name: count, dtype: int64


## Train RandomForestClassifier

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

pred = rf.predict(X_test)
print(classification_report(y_test, pred, zero_division=0))

feat_imp = rf.feature_importances_
print("feat_imp length", len(feat_imp))

                           precision    recall  f1-score   support

        Backend Developer       1.00      1.00      1.00         2
         Business Analyst       1.00      1.00      1.00         3
          Cloud Architect       1.00      1.00      1.00         2
             Data Analyst       0.67      1.00      0.80         2
           Data Scientist       0.00      0.00      0.00         1
          DevOps Engineer       1.00      1.00      1.00         2
       Frontend Developer       1.00      1.00      1.00         3
Machine Learning Engineer       1.00      1.00      1.00         2
          Product Manager       1.00      1.00      1.00         1
        Software Engineer       1.00      1.00      1.00         2

                 accuracy                           0.95        20
                macro avg       0.87      0.90      0.88        20
             weighted avg       0.92      0.95      0.93        20

feat_imp length 771


## Derive skill-level importance from the model

In [12]:
X_skill_tfidf = tfidf_matrix
print("skill token count", X_skill_tfidf.shape[1])

a = X_skill_tfidf
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(a, y, test_size=0.2,
                                                            stratify=y, random_state=42)

rf_skill = RandomForestClassifier(n_estimators=200, random_state=42)
rf_skill.fit(X_train_s, y_train_s)

pred_s = rf_skill.predict(X_test_s)
print(classification_report(y_test_s, pred_s, zero_division=0))

skill_tokens = np.array(tfidf.get_feature_names_out())
skill_importances = rf_skill.feature_importances_

skill_imp_dict = dict(zip(skill_tokens, skill_importances))

maxv = max(skill_importances) if len(skill_importances) > 0 else 1.0
skill_imp_norm = {k: float(v/maxv) for k,v in skill_imp_dict.items()}

skill token count 26
                           precision    recall  f1-score   support

        Backend Developer       0.14      0.50      0.22         2
         Business Analyst       0.00      0.00      0.00         3
          Cloud Architect       0.00      0.00      0.00         2
             Data Analyst       0.00      0.00      0.00         2
           Data Scientist       0.00      0.00      0.00         1
          DevOps Engineer       0.00      0.00      0.00         2
       Frontend Developer       0.00      0.00      0.00         3
Machine Learning Engineer       0.00      0.00      0.00         2
          Product Manager       0.00      0.00      0.00         1
        Software Engineer       0.00      0.00      0.00         2

                 accuracy                           0.05        20
                macro avg       0.01      0.05      0.02        20
             weighted avg       0.01      0.05      0.02        20



## Inference

In [13]:
def get_top_roles_by_text(user_text, k=3):
    emb = embed_texts([user_text])
    D, I = index.search(emb, k)
    return I[0], D[0]

role_tfidf_dense = tfidf_matrix.todense()

In [14]:
def score_missing_skill(role_idx, skill_token, role_text):
    token = skill_token.lower()
    tfidf_score = 0.0
    if token in tfidf.vocabulary_:
        col = tfidf.vocabulary_[token]
        tfidf_score = float(role_tfidf_dense[role_idx, col])

    emb_skill = embed_texts([skill_token])[0]
    emb_role = role_embeddings[role_idx]
    sim = float(cosine_similarity([emb_skill], [emb_role])[0,0])

    rf_imp = skill_imp_norm.get(token, 0.0)

    w_emb, w_tfidf, w_rf = 0.45, 0.35, 0.20
    combined = w_emb * sim + w_tfidf * tfidf_score + w_rf * rf_imp

    score_pct = float(combined * 100)
    return round(score_pct, 2)

In [15]:
def recommend_for_user(user_profile_text, k=3):
    user_skills = set(extract_skills(user_profile_text))
    user_text = user_profile_text
    role_idx_list, dists = get_top_roles_by_text(user_text, k=k)

    recs = []
    for ridx in role_idx_list:
        role_row = df.iloc[ridx]
        role_name = role_row.get("job_title", "Unknown Role")
        role_skills = set(role_row.get("skills_list", []))
        missing = sorted(list(role_skills - user_skills))

        scored = []
        for s in missing:
            sc = score_missing_skill(ridx, s, role_row.get("role_text", ""))
            scored.append((s, sc))
        scored = sorted(scored, key=lambda x: x[1], reverse=True)
        recs.append({"role": role_name, "skill_gap": scored})
    return recs

In [16]:
user_profile = """I have experience in Python, Excel, and basic data analysis. I
worked on automation tasks."""
recs = recommend_for_user(user_profile, k=3)

for i, r in enumerate(recs, 1):
    print(f"Rekomendasi: Role {i} - {r['role']}")
    top_skills = r['skill_gap'][:6]
    print("Skill Need to Add:", ", ".join([f"{s} ({sc}%)" for s,sc in top_skills]))
    print()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Rekomendasi: Role 1 - Data Analyst
Skill Need to Add: Microservices (42.84%), Java (33.11%), Spring Boot (0.31%)

Rekomendasi: Role 2 - Data Analyst
Skill Need to Add: Kubernetes (38.0%), AWS (36.18%), Docker (35.52%)

Rekomendasi: Role 3 - Data Analyst
Skill Need to Add: Linux (28.32%), Git (26.75%), CI/CD (-2.1%)



In [17]:
np.save("role_embeddings.npy", role_embeddings)
faiss.write_index(index, "faiss_role_index.idx")
with open("tfidf_vocabulary.json", "w") as f:
    json.dump(tfidf.vocabulary_, f)

joblib.dump(rf_skill, "rf_skill_model.pkl")
joblib.dump(rf, "rf_fullmodel.pkl")

['rf_fullmodel.pkl']