## Import library

In [1]:
import pandas as pd
import numpy as np
import faiss
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from skillNer.skill_extractor_class import SkillExtractor
from skillNer.general_params import SKILL_DB
from spacy.matcher import PhraseMatcher

## Load dataset

In [2]:
df = pd.read_csv("dummy_jobs.csv")
df.head()

Unnamed: 0,job_title,career_level,experience_level,education_level,job_description,skills
0,Backend Developer,Mid,1-3 years,High School,Collaborates with stakeholders to define produ...,"Java, Spring Boot, Microservices"
1,Cloud Architect,Lead,3-5 years,Bachelor,Builds machine learning models and deploys the...,"TensorFlow, Deep Learning, NLP"
2,Frontend Developer,Senior,5+ years,PhD,Collaborates with stakeholders to define produ...,"JavaScript, React, CSS"
3,Frontend Developer,Mid,3-5 years,Master,Responsible for analyzing large datasets and g...,"Python, SQL, Machine Learning"
4,Data Analyst,Senior,3-5 years,Master,Responsible for analyzing large datasets and g...,"Git, CI/CD, Linux"


## Join columns

In [3]:
df["role_text"] = df['job_title'] + " - " + df["career_level"] + " - " + df["experience_level"] + " - " + df["education_level"] + " - " + df["job_description"]
print(df["role_text"][0])

Backend Developer - Mid - 1-3 years - High School - Collaborates with stakeholders to define product requirements and roadmap.


## Split skills

In [4]:
df["skills_list"] = df["skills"].apply(lambda x: [s.strip() for s in x.split(",")])

## Create embeddings

In [5]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

role_embeddings = model.encode(df["role_text"].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
dimension = role_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(role_embeddings))

## Vectorizer skills

In [7]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["skills"])
tfidf_vocab = tfidf.vocabulary_

In [8]:
nlp = spacy.load("en_core_web_lg")
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


## Extract skills

In [9]:
def extract_skills(text):
  annotation_result = skill_extractor.annotate(text)
  if not isinstance(annotation_result, dict) or 'results' not in annotation_result:
    return []

  doc_node_values = []
  results = annotation_result['results']

  if 'full_matches' in results:
    for match in results['full_matches']:
      doc_node_values.append(match['doc_node_value'])

  if 'ngram_scored' in results:
    for match in results['ngram_scored']:
      doc_node_values.append(match['doc_node_value'])

  return doc_node_values

In [10]:
skills = extract_skills("Collaborates with stakeholders to define production ready aps with python")
print(skills)

['collaborates', 'python']


In [18]:
user_profile = """Alya Rahmadani is 26 years old, lives in Jakarta, and 
works as a Data Analyst with 2–3 years of experience. She has a 
Bachelor’s degree in Informatics Engineering and is skilled in Python, 
SQL, and data visualization. She also uses Pandas, NumPy, and 
understands basic statistics."""

user_skills = extract_skills(user_profile)
print(user_skills)

['informatic engineering', 'python', 'sql', 'data visualization', 'pandas', 'numpy', 'statistics']


## Get recommendation

In [12]:
def get_top_roles(user_text, k=3):
    user_emb = model.encode([user_text])
    D, I = index.search(user_emb, k)
    return I[0]

In [13]:
# importance = (tfidf_score(role, skill) + embedding_similarity(skill, role_text)) / 2

def skill_importance(role_idx, missing_skill, role_text):
    tfidf_score = 0
    if missing_skill.lower() in tfidf_vocab:
        tfidf_score = tfidf_matrix[role_idx, tfidf_vocab[missing_skill.lower()]]

    emb_skill = model.encode([missing_skill])[0]
    emb_role = model.encode([role_text])[0]
    sim = np.dot(emb_skill, emb_role) / (np.linalg.norm(emb_skill) * np.linalg.norm(emb_role))

    importance = ((tfidf_score + sim) / 2) * 100
    return round(float(importance), 2)

In [16]:
def generate_recommendation(user_text):
    user_skills = set(extract_skills(user_text))
    top_roles = get_top_roles(user_text, k=3)
    results = []

    for rank, idx in enumerate(top_roles):
        role = df.iloc[idx]
        required = set(role["skills_list"])
        missing = required - user_skills
        
        scored_missing = []
        for skill in missing:
            imp = skill_importance(idx, skill, role["job_title"])
            scored_missing.append((skill, imp))

        scored_missing = sorted(scored_missing, key=lambda x: x[1], reverse=True)

        results.append({
            "role": role["job_title"],
            "skill_gap": scored_missing[:10]
        })
    return results

In [19]:
# user_profile = """
# I have experience in Python, Excel, and deep learning. 
# I worked with data analysis and automation.
# """

recommendations = generate_recommendation(user_profile)

for i, rec in enumerate(recommendations, 1):
    print(f"Rekomendasi: {rec['role']}")
    skills_text = ", ".join([f"{s} ({sc}%)" for s, sc in rec['skill_gap']])
    print("Skill Need to Add:", skills_text)
    print()

Rekomendasi: Data Scientist
Skill Need to Add: Linux (38.7%), Git (36.94%), CI/CD (8.15%)

Rekomendasi: Data Analyst
Skill Need to Add: Java (41.33%), Microservices (31.48%), Spring Boot (4.16%)

Rekomendasi: Data Analyst
Skill Need to Add: AWS (41.89%), Docker (34.92%), Kubernetes (32.79%)

