In [1]:
import spacy
import faiss
import pandas as pd
import numpy as np
import joblib
import json
from skillNer.skill_extractor_class import SkillExtractor
from skillNer.general_params import SKILL_DB
from spacy.matcher import PhraseMatcher
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from math import isnan

In [2]:
DATA_PATH = "export/final_data.csv"

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

(17182, 18)


Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary,job_description_cleaned,annotations,skills
0,1,Facility Maintenance & Smart Warehouse Manager,Bandung,IDR,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pemeliharaan",,,,,Deskripsi PekerjaanRequirements :D3/SI from re...,,deskripsi pekerjaanrequirements si from reputa...,{'text': 'deskripsi pekerjaanrequirements si f...,"['electrical inspection', 'management system',..."
1,2,Procurement Department Head,Jakarta Raya,IDR,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",,25 days,51 - 200 pekerja,Manajemen/Konsulting HR,Job Role: 1. Responsible for material availabi...,,job role responsible for material availabili...,{'text': 'job role responsible for material av...,"['heavy equipment', 'contract management', 'pr..."
2,3,SALES ADMIN,Jakarta Barat,IDR,Supervisor/Koordinator,4 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",30 days,51 - 200 pekerja,Umum & Grosir,Internal Sales & AdminJob Description :We are ...,,internal sales adminjob description we are loo...,{'text': 'internal sales adminjob description ...,"['microsoft office', 'heat exchanger', 'carbon..."
3,4,City Operation Lead Shopee Express (Cirebon),Cirebon,IDR,Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Pelayanan,Logistik/Rantai Pasokan","Tip;Waktu regular, Senin - Jumat;Kasual (conto...",21 days,2001 - 5000 pekerja,Retail/Merchandise,Job Description:Responsible for HSE implementa...,,job description responsible for hse implementa...,{'text': 'job description responsible for hse ...,"['operation management', 'analytical skill', '..."
4,5,Japanese Interpreter,Bekasi,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Lainnya,Jurnalis/Editor",,23 days,201 - 500 pekerja,Manajemen/Konsulting HR,Overview: Our clients is manufacture for autom...,,overview our clients is manufacture for automo...,{'text': 'overview our clients is manufacture ...,"['japanese', 'translator', 'english', 'non', '..."


In [3]:
# df["skills_list"] = df["skills"].apply(lambda x: [s.strip() for s in x.split(",")])

## Skill extraction

In [4]:
nlp = spacy.load("en_core_web_lg")
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

def extract_skills(text):
  annotation_result = skill_extractor.annotate(text)
  if not isinstance(annotation_result, dict) or 'results' not in annotation_result:
    return []

  doc_node_values = []
  results = annotation_result['results']

  if 'full_matches' in results:
    for match in results['full_matches']:
      doc_node_values.append(match['doc_node_value'])

  if 'ngram_scored' in results:
    for match in results['ngram_scored']:
      doc_node_values.append(match['doc_node_value'])

  return doc_node_values

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


## Preprocessing kategori & teks

In [5]:
df["role_text"] = df["job_description"] + " - " + df['experience_level'] + " - " + df["education_level"]

## Embedding generation

In [6]:
EMBED_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

def embed_texts(texts, batch_size=64):
    return embed_model.encode(texts, show_progress_bar=True, batch_size=batch_size, convert_to_numpy=True)

role_texts = df["role_text"].tolist()
role_embeddings = embed_texts(role_texts)
print("role_embeddings", role_embeddings.shape)

skills_texts = df["skills"].tolist()
skill_embeddings = embed_texts(skills_texts)
print("skill_embeddings", skill_embeddings.shape)

Batches:   0%|          | 0/269 [00:00<?, ?it/s]

role_embeddings (17182, 384)


Batches:   0%|          | 0/269 [00:00<?, ?it/s]

skill_embeddings (17182, 384)


## Indexing untuk retrieval

In [7]:
d = role_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(role_embeddings)
print("index ntotal:", index.ntotal)

faiss.write_index(index, "faiss_role_index.idx")
np.save("role_embeddings.npy", role_embeddings)
np.save("skill_embeddings.npy", skill_embeddings)

index ntotal: 17182


In [8]:
metadata = df[[
    "id", "job_title", "job_function", "skills"
]].to_dict(orient="records")

In [9]:
import pickle

with open("job_metadata.bin", "wb") as f:
    pickle.dump(metadata, f)

## Skill importance signal

In [10]:
tfidf = TfidfVectorizer(lowercase=True, token_pattern=r"(?u)\b\w+\b")
tfidf_matrix = tfidf.fit_transform(df["skills"])
print("tfidf shape", tfidf_matrix.shape)

tfidf shape (17182, 5455)


## Encode categorical features

In [11]:
cat_cols = ["career_level", "experience_level", "education_level"]

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
enc_df = pd.DataFrame()
enc_cols = []
for c in cat_cols:
    enc = encoder.fit_transform(df[[c]])
    enc_df[c + "_enc"] = enc.ravel()
    enc_cols.append(c + "_enc")

enc_df.head()

Unnamed: 0,career_level_enc,experience_level_enc,education_level_enc
0,2.0,13.0,15.0
1,2.0,13.0,10.0
2,4.0,12.0,9.0
3,4.0,13.0,10.0
4,3.0,9.0,15.0


## Build final feature matrix

In [12]:
X_emb = np.hstack([role_embeddings, skill_embeddings])
print("X_emb shape:", X_emb.shape)

if len(enc_cols) > 0:
    cat_values = enc_df[enc_cols].values.astype(np.float32)
    X = np.hstack([X_emb, cat_values])
else:
    X = X_emb

print("Final feature X shape:", X.shape)

y = df["job_title"].fillna("unknown")
print("n_classes:", y.nunique())
print(y.value_counts().head())

X_emb shape: (17182, 768)
Final feature X shape: (17182, 771)
n_classes: 10969
job_title
Management Trainee    153
Sales Executive       151
Digital Marketing      77
Graphic Designer       63
Content Creator        59
Name: count, dtype: int64


## Train RandomForestClassifier

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

pred = rf.predict(X_test)
print(classification_report(y_test, pred, zero_division=0))

feat_imp = rf.feature_importances_
print("feat_imp length", len(feat_imp))

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

## Derive skill-level importance from the model

In [None]:
X_skill_tfidf = tfidf_matrix
print("skill token count", X_skill_tfidf.shape[1])

a = X_skill_tfidf
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(a, y, test_size=0.2,
                                                            stratify=y, random_state=42)

rf_skill = RandomForestClassifier(n_estimators=200, random_state=42)
rf_skill.fit(X_train_s, y_train_s)

pred_s = rf_skill.predict(X_test_s)
print(classification_report(y_test_s, pred_s, zero_division=0))

skill_tokens = np.array(tfidf.get_feature_names_out())
skill_importances = rf_skill.feature_importances_

skill_imp_dict = dict(zip(skill_tokens, skill_importances))

maxv = max(skill_importances) if len(skill_importances) > 0 else 1.0
skill_imp_norm = {k: float(v/maxv) for k,v in skill_imp_dict.items()}

## Inference

In [None]:
def get_top_roles_by_text(user_text, k=3):
    emb = embed_texts([user_text])
    D, I = index.search(emb, k)
    return I[0], D[0]

role_tfidf_dense = tfidf_matrix.todense()

In [None]:
def score_missing_skill(role_idx, skill_token, role_text):
    token = skill_token.lower()
    tfidf_score = 0.0
    if token in tfidf.vocabulary_:
        col = tfidf.vocabulary_[token]
        tfidf_score = float(role_tfidf_dense[role_idx, col])

    emb_skill = embed_texts([skill_token])[0]
    emb_role = role_embeddings[role_idx]
    sim = float(cosine_similarity([emb_skill], [emb_role])[0,0])

    rf_imp = skill_imp_norm.get(token, 0.0)

    w_emb, w_tfidf, w_rf = 0.45, 0.35, 0.20
    combined = w_emb * sim + w_tfidf * tfidf_score + w_rf * rf_imp

    score_pct = float(combined * 100)
    return round(score_pct, 2)

In [None]:
def recommend_for_user(user_profile_text, k=3):
    user_skills = set(extract_skills(user_profile_text))
    user_text = user_profile_text
    role_idx_list, dists = get_top_roles_by_text(user_text, k=k)

    recs = []
    for ridx in role_idx_list:
        role_row = df.iloc[ridx]
        role_name = role_row.get("job_title", "Unknown Role")
        role_skills = set(role_row.get("skills_list", []))
        missing = sorted(list(role_skills - user_skills))

        scored = []
        for s in missing:
            sc = score_missing_skill(ridx, s, role_row.get("role_text", ""))
            scored.append((s, sc))
        scored = sorted(scored, key=lambda x: x[1], reverse=True)
        recs.append({"role": role_name, "skill_gap": scored})
    return recs

In [None]:
user_profile = """I have experience in Python, Excel, and basic data analysis. I
worked on automation tasks."""
recs = recommend_for_user(user_profile, k=3)

for i, r in enumerate(recs, 1):
    print(f"Rekomendasi: Role {i} - {r['role']}")
    top_skills = r['skill_gap'][:6]
    print("Skill Need to Add:", ", ".join([f"{s} ({sc}%)" for s,sc in top_skills]))
    print()

In [None]:
np.save("role_embeddings.npy", role_embeddings)
faiss.write_index(index, "faiss_role_index.idx")
with open("tfidf_vocabulary.json", "w") as f:
    json.dump(tfidf.vocabulary_, f)

joblib.dump(rf_skill, "rf_skill_model.pkl")
joblib.dump(rf, "rf_fullmodel.pkl")