In [1]:
import spacy
import faiss
import pandas as pd
import numpy as np
import joblib
import json
import ast
from skillNer.skill_extractor_class import SkillExtractor
from skillNer.general_params import SKILL_DB
from spacy.matcher import PhraseMatcher
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from math import isnan

In [2]:
all_df = pd.read_csv('../../data/all.csv', delimiter='|')
all_df.head()

Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary
0,1,Facility Maintenance & Smart Warehouse Manager,Bandung,IDR,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pemeliharaan",,,,,Deskripsi PekerjaanRequirements :D3/SI from re...,
1,2,Procurement Department Head,Jakarta Raya,IDR,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",,25 days,51 - 200 pekerja,Manajemen/Konsulting HR,Job Role: 1. Responsible for material availabi...,
2,3,SALES ADMIN,Jakarta Barat,IDR,Supervisor/Koordinator,4 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",30 days,51 - 200 pekerja,Umum & Grosir,Internal Sales & AdminJob Description :We are ...,
3,4,City Operation Lead Shopee Express (Cirebon),Cirebon,IDR,Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Pelayanan,Logistik/Rantai Pasokan","Tip;Waktu regular, Senin - Jumat;Kasual (conto...",21 days,2001 - 5000 pekerja,Retail/Merchandise,Job Description:Responsible for HSE implementa...,
4,5,Japanese Interpreter,Bekasi,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Lainnya,Jurnalis/Editor",,23 days,201 - 500 pekerja,Manajemen/Konsulting HR,Overview: Our clients is manufacture for autom...,


In [3]:
skill_df = pd.read_csv("skill_clean.csv")
skill_df.head()

Unnamed: 0,id,job_title,skills
0,1,Facility Maintenance & Smart Warehouse Manager,"['electrical inspection', 'management system',..."
1,2,Procurement Department Head,"['heavy equipment', 'contract management', 'pr..."
2,3,SALES ADMIN,"['microsoft office', 'heat exchanger', 'carbon..."
3,4,City Operation Lead Shopee Express (Cirebon),"['operation management', 'analytical skill', '..."
4,5,Japanese Interpreter,"['japanese', 'translator', 'english', 'non', '..."


In [4]:
merged_df = pd.merge(all_df, skill_df, on="id", how="inner")
merged_df.head(5)

Unnamed: 0,id,job_title_x,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary,job_title_y,skills
0,1,Facility Maintenance & Smart Warehouse Manager,Bandung,IDR,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pemeliharaan",,,,,Deskripsi PekerjaanRequirements :D3/SI from re...,,Facility Maintenance & Smart Warehouse Manager,"['electrical inspection', 'management system',..."
1,2,Procurement Department Head,Jakarta Raya,IDR,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",,25 days,51 - 200 pekerja,Manajemen/Konsulting HR,Job Role: 1. Responsible for material availabi...,,Procurement Department Head,"['heavy equipment', 'contract management', 'pr..."
2,3,SALES ADMIN,Jakarta Barat,IDR,Supervisor/Koordinator,4 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",30 days,51 - 200 pekerja,Umum & Grosir,Internal Sales & AdminJob Description :We are ...,,SALES ADMIN,"['microsoft office', 'heat exchanger', 'carbon..."
3,4,City Operation Lead Shopee Express (Cirebon),Cirebon,IDR,Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Pelayanan,Logistik/Rantai Pasokan","Tip;Waktu regular, Senin - Jumat;Kasual (conto...",21 days,2001 - 5000 pekerja,Retail/Merchandise,Job Description:Responsible for HSE implementa...,,City Operation Lead Shopee Express (Cirebon),"['operation management', 'analytical skill', '..."
4,5,Japanese Interpreter,Bekasi,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Lainnya,Jurnalis/Editor",,23 days,201 - 500 pekerja,Manajemen/Konsulting HR,Overview: Our clients is manufacture for autom...,,Japanese Interpreter,"['japanese', 'translator', 'english', 'non', '..."


In [5]:
df = merged_df[["job_title_x", "career_level", "experience_level", "education_level", "job_description", "skills"]]
df = df.rename(columns={"job_title_x": "job_title"})
df.head()

Unnamed: 0,job_title,career_level,experience_level,education_level,job_description,skills
0,Facility Maintenance & Smart Warehouse Manager,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Deskripsi PekerjaanRequirements :D3/SI from re...,"['electrical inspection', 'management system',..."
1,Procurement Department Head,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Job Role: 1. Responsible for material availabi...,"['heavy equipment', 'contract management', 'pr..."
2,SALES ADMIN,Supervisor/Koordinator,4 tahun,Sarjana (S1),Internal Sales & AdminJob Description :We are ...,"['microsoft office', 'heat exchanger', 'carbon..."
3,City Operation Lead Shopee Express (Cirebon),Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Job Description:Responsible for HSE implementa...,"['operation management', 'analytical skill', '..."
4,Japanese Interpreter,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Overview: Our clients is manufacture for autom...,"['japanese', 'translator', 'english', 'non', '..."


In [6]:
# df["skills"] = df["skills"].apply(lambda x: ast.literal_eval(x))

In [7]:
nlp = spacy.load("en_core_web_lg")
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

def extract_skills(text):
  annotation_result = skill_extractor.annotate(text)
  if not isinstance(annotation_result, dict) or 'results' not in annotation_result:
    return []

  doc_node_values = []
  results = annotation_result['results']

  if 'full_matches' in results:
    for match in results['full_matches']:
      doc_node_values.append(match['doc_node_value'])

  if 'ngram_scored' in results:
    for match in results['ngram_scored']:
      doc_node_values.append(match['doc_node_value'])

  return doc_node_values

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [8]:
df["role_text"] = df["job_title"] + " - " + df["job_description"]

In [9]:
EMBED_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

def embed_texts(texts, batch_size=64):
    return embed_model.encode(texts, show_progress_bar=True, batch_size=batch_size, convert_to_numpy=True)

role_texts = df["role_text"].tolist()
role_embeddings = embed_texts(role_texts)
print("role_embeddings", role_embeddings.shape)

skills_texts = df["skills"].tolist()
skill_embeddings = embed_texts(skills_texts)
print("skill_embeddings", skill_embeddings.shape)

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

role_embeddings (1131, 384)


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

skill_embeddings (1131, 384)


In [10]:
d = role_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(role_embeddings)
print("index ntotal:", index.ntotal)

faiss.write_index(index, "faiss_role_index.idx")
np.save("role_embeddings.npy", role_embeddings)
np.save("skill_embeddings.npy", skill_embeddings)

index ntotal: 1131


In [11]:
tfidf = TfidfVectorizer(lowercase=True, token_pattern=r"(?u)\b\w+\b")
tfidf_matrix = tfidf.fit_transform(df["skills"])
print("tfidf shape", tfidf_matrix.shape)

tfidf shape (1131, 2424)


In [12]:
cat_cols = ["career_level", "experience_level", "education_level"]

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
enc_df = pd.DataFrame()
enc_cols = []
for c in cat_cols:
    enc = encoder.fit_transform(df[[c]])
    enc_df[c + "_enc"] = enc.ravel()
    enc_cols.append(c + "_enc")

enc_df.head()

Unnamed: 0,career_level_enc,experience_level_enc,education_level_enc
0,2.0,8.0,10.0
1,2.0,8.0,6.0
2,4.0,7.0,5.0
3,4.0,8.0,6.0
4,3.0,5.0,10.0


In [13]:
X_emb = np.hstack([role_embeddings, skill_embeddings])
print("X_emb shape:", X_emb.shape)

if len(enc_cols) > 0:
    cat_values = enc_df[enc_cols].values.astype(np.float32)
    X = np.hstack([X_emb, cat_values])
else:
    X = X_emb

print("Final feature X shape:", X.shape)

y = df["job_title"].fillna("unknown")
print("n_classes:", y.nunique())
print(y.value_counts().head())

X_emb shape: (1131, 768)
Final feature X shape: (1131, 771)
n_classes: 1027
job_title
Management Trainee     9
Sales Executive        6
IT Support             5
Account Manager        5
Marketing Executive    5
Name: count, dtype: int64


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

pred = rf.predict(X_test)
print(classification_report(y_test, pred, zero_division=0))

feat_imp = rf.feature_importances_
print("feat_imp length", len(feat_imp))

                                                                                              precision    recall  f1-score   support

                                                                              .NET Developer       0.00      0.00      0.00         0
                                                                             .NET Programmer       0.00      0.00      0.00         1
                                                                           ACCOUNT EXECUTIVE       0.00      0.00      0.00         2
                                                                 ACCOUNT MANAGER - TELCO- IT       0.00      0.00      0.00         0
                                                                             ACCOUNT PAYABLE       0.00      0.00      0.00         1
                                                                  ACCOUNTING & FINANCE STAFF       0.00      0.00      0.00         0
                                                             

In [15]:
X_skill_tfidf = tfidf_matrix
print("skill token count", X_skill_tfidf.shape[1])

a = X_skill_tfidf
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(a, y, test_size=0.2, random_state=42)

rf_skill = RandomForestClassifier(n_estimators=200, random_state=42)
rf_skill.fit(X_train_s, y_train_s)

pred_s = rf_skill.predict(X_test_s)
print(classification_report(y_test_s, pred_s, zero_division=0))

skill_tokens = np.array(tfidf.get_feature_names_out())
skill_importances = rf_skill.feature_importances_

skill_imp_dict = dict(zip(skill_tokens, skill_importances))

maxv = max(skill_importances) if len(skill_importances) > 0 else 1.0
skill_imp_norm = {k: float(v/maxv) for k,v in skill_imp_dict.items()}

skill token count 2424
                                                                                          precision    recall  f1-score   support

                                                                          .NET Developer       0.00      0.00      0.00         0
                                                                         .NET Programmer       0.00      0.00      0.00         1
                                                                          .Net Developer       0.00      0.00      0.00         0
                                                                       ACCOUNT EXECUTIVE       0.00      0.00      0.00         2
                                                             ACCOUNT MANAGER - TELCO- IT       0.00      0.00      0.00         0
                                                                         ACCOUNT PAYABLE       0.00      0.00      0.00         1
                                                                  

In [16]:
def get_top_roles_by_text(user_text, k=3):
    emb = embed_texts([user_text])
    D, I = index.search(emb, k)
    return I[0], D[0]

role_tfidf_dense = tfidf_matrix.todense()

In [17]:
def score_missing_skill(role_idx, skill_token, role_text):
    token = skill_token.lower()
    tfidf_score = 0.0
    if token in tfidf.vocabulary_:
        col = tfidf.vocabulary_[token]
        tfidf_score = float(role_tfidf_dense[role_idx, col])

    emb_skill = embed_texts([skill_token])[0]
    emb_role = role_embeddings[role_idx]
    sim = float(cosine_similarity([emb_skill], [emb_role])[0,0])

    rf_imp = skill_imp_norm.get(token, 0.0)

    w_emb, w_tfidf, w_rf = 0.45, 0.35, 0.20
    combined = w_emb * sim + w_tfidf * tfidf_score + w_rf * rf_imp

    score_pct = float(combined * 100)
    return round(score_pct, 2)

In [28]:
def recommend_for_user(user_profile_text, k=3):
    user_skills = set(extract_skills(user_profile_text))
    user_text = user_profile_text
    role_idx_list, dists = get_top_roles_by_text(user_text, k=k)

    recs = []
    for ridx in role_idx_list:
        role_row = df.iloc[ridx]
        role_name = role_row.get("job_title", "Unknown Role")
        role_skills = set(ast.literal_eval(role_row.get("skills")))
        missing = sorted(list(role_skills - user_skills))

        scored = []
        for s in missing:
            sc = score_missing_skill(ridx, s, role_row.get("role_text", ""))
            scored.append((s, sc))
        scored = sorted(scored, key=lambda x: x[1], reverse=True)
        recs.append({"role": role_name, "skill_gap": scored})
    return recs

In [30]:
user_profile = """Riko 'Si Gesit' ini punya pekerjaan sebagai Spesialis Multitasking dan Tukang Nge-Fix. Secara teknis, 
dia itu lumayan jagoan di dunia web development. Dia lancar banget pakai JavaScript buat urusan tampilan depan (frontend) 
biar website-nya jadi interaktif, dan dia juga sedikit-sedikit bisa PHP buat urusan di belakang layar. Data-data penting 
biasanya dia simpan rapi pakai MySQL. Kalau soal tampilan website, nggak usah ditanya, dia mahir banget di HTML dan CSS, 
bahkan udah mulai nyentuh React biar tampilannya makin keren. Sebagai tools sehari-hari, dia pasti pakai VS Code dan sering 
kolaborasi pakai Git/GitHub. Nah, selain jago coding, Riko ini orangnya enak banget diajak ngobrol, nggak gampang panik kalau
ada masalah mendadak, dan yang paling penting: dia itu kreatif dan bisa kerja mandiri tanpa harus diawasi terus."""
recs = recommend_for_user(user_profile, k=3)

for i, r in enumerate(recs, 1):
    print(f"Rekomendasi: Role {i} - {r['role']}")
    top_skills = r['skill_gap'][:6]
    print("Skill Need to Add:", ", ".join([f"{s} ({sc}%)" for s,sc in top_skills]))
    print()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Rekomendasi: Role 1 - Full Stack Website Developer
Skill Need to Add: management (31.88%), sql (29.13%), html (22.18%), website management (22.09%), website architecture (21.97%), css (20.67%)

Rekomendasi: Role 2 - GURU TETAP FISIKA SINOTIF
Skill Need to Add: drawing (28.49%), software (27.16%), zoom (21.17%), android (18.52%), online office (14.55%)

Rekomendasi: Role 3 - Senior PHP Developer / Programmer
Skill Need to Add: manager (29.91%), postgresql (27.53%), timeline (26.01%), laravel (19.81%), source code (11.71%)

