In [1]:
import sys
!{sys.executable} -m pip install sentence-transformers pandas openpyxl



In [2]:
import pandas as pd
import numpy as np
import re
import torch
from sentence_transformers import SentenceTransformer, util
from IPython.display import display, Markdown

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# === 1. Load data ===
# Flattened course workbook, resolved relative to the notebook's working directory.
courses_workbook = "../courses_flattened.xlsx"
df = pd.read_excel(courses_workbook)

# === 2. Normalize skills ===
def preprocess(skill):
    """Normalize one raw skill label so equivalent spellings compare equal.

    The label is lower-cased, every character other than word characters,
    whitespace, ``-``, ``+`` and ``/`` is removed, and the remaining
    whitespace is collapsed to single spaces and trimmed.

    Args:
        skill: Raw skill text (a ``str``).

    Returns:
        The normalized label; an empty string when nothing usable remains.
    """
    without_punctuation = re.sub(r"[^\w\s\-+/]", "", skill.lower())
    # Trim and collapse whitespace *after* removing punctuation: deleting a
    # character can leave stray whitespace behind ("python !" -> "python "),
    # which would otherwise create near-duplicate vocabulary entries.
    # NOTE(review): '#' and '.' are removed as well, so "C#" normalizes to "c"
    # and ".NET" to "net" — confirm this is intended.
    return " ".join(without_punctuation.split())

In [4]:
# Split every comma-separated "Skill" cell into individual, trimmed labels.
# The trailing dropna() discards entries that are not strings (the `.str`
# accessor yields NaN for those); a stray NaN would make the sorted() call
# below fail when it is compared against strings.
all_raw_skills = df["Skill"].dropna().str.split(',').explode().str.strip().dropna()
unique_skills = sorted(set(all_raw_skills))

# Reference vocabulary: normalized labels of at most five words and more than
# two characters. Labels that normalize to "" (nothing but removed punctuation)
# are dropped so that no empty string is embedded as a "skill".
# NOTE(review): the vocabulary is derived from the same "Skill" column that is
# later matched against it, so almost every skill trivially matches itself —
# confirm whether an independent reference list should be used instead.
standard_skills = sorted(
    {
        normalized
        for normalized in (
            preprocess(s) for s in unique_skills if len(s.split()) <= 5 and len(s) > 2
        )
        if normalized
    }
)

In [5]:
# Load the pretrained 'all-MiniLM-L6-v2' Sentence-Transformers model (the messages in
# this cell's output indicate it is downloaded over HTTP when not cached locally).
model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed the whole reference vocabulary once, as a tensor, so the matching functions
# below can compare against it without re-encoding it on every call.
standard_embeddings = model.encode(standard_skills, convert_to_tensor=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
def semantic_match_skills(raw_skills: str, threshold: float = 0.6):
    """Map each comma-separated skill to its closest entry in ``standard_skills``.

    Args:
        raw_skills: Comma-separated skill labels. Any non-string value
            (e.g. NaN from an empty cell) yields an empty result.
        threshold: Minimum cosine similarity required to accept the closest
            standard skill as a match.

    Returns:
        A list with one entry per non-empty skill, in input order: the matched
        standard skill, or ``"[Unmatched] <raw skill>"`` when the best
        similarity is below ``threshold``.
    """
    if not isinstance(raw_skills, str):
        return []

    # Keep the trimmed raw text next to its normalized form: the raw text is
    # what gets reported for skills that stay unmatched.
    candidates = []
    for piece in raw_skills.split(','):
        normalized = preprocess(piece)
        if normalized:
            candidates.append((piece.strip(), normalized))
    if not candidates:
        return []

    # Encode all skills of the row in one batch instead of one model call each.
    query_embeddings = model.encode(
        [normalized for _, normalized in candidates], convert_to_tensor=True
    )
    cosine_scores = util.cos_sim(query_embeddings, standard_embeddings)
    best_scores, best_indices = torch.max(cosine_scores, dim=1)

    matches = []
    for (raw_label, _), score, index in zip(
        candidates, best_scores.tolist(), best_indices.tolist()
    ):
        if score >= threshold:
            matches.append(standard_skills[index])
        else:
            # The label is trimmed so it carries no stray space left over from
            # the comma split.
            matches.append(f"[Unmatched] {raw_label}")
    return matches


In [7]:
def semantic_score_stats(raw_skills: str, threshold: float = 0.6):
    """Summarize how well the skills of one row match ``standard_skills``.

    Args:
        raw_skills: Comma-separated skill labels. Any non-string value
            (e.g. NaN from an empty cell) is treated as "no skills".
        threshold: Minimum cosine similarity for a skill to count as matched.

    Returns:
        A tuple ``(matched, total, percent, avg_score)``:
        ``matched`` is the number of skills whose best similarity reaches
        ``threshold``, ``total`` the number of non-empty skills, ``percent``
        the match rate in percent rounded to 2 decimals, and ``avg_score`` the
        mean best similarity rounded to 4 decimals. When there are no usable
        skills the result is ``(0, 0, 0.0, None)``.
    """
    if not isinstance(raw_skills, str):
        return 0, 0, 0.0, None

    cleaned = [
        normalized
        for normalized in (preprocess(piece) for piece in raw_skills.split(','))
        if normalized
    ]
    total = len(cleaned)
    if total == 0:
        return 0, 0, 0.0, None

    # Encode all skills of the row in one batch instead of one model call each.
    query_embeddings = model.encode(cleaned, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embeddings, standard_embeddings)
    # Only the similarity values matter here; which standard skill produced the
    # maximum is irrelevant for the statistics.
    scores = torch.max(cosine_scores, dim=1).values.tolist()

    matched = sum(score >= threshold for score in scores)
    percent = round(matched / total * 100, 2)
    avg_score = round(np.mean(scores), 4)
    return matched, total, percent, avg_score

In [8]:
# Per-course list of matched standard skills (or "[Unmatched] ..." labels).
df["Semantic_Matched_Skills"] = df["Skill"].apply(lambda s: semantic_match_skills(s, threshold=0.6))

# Per-course matching statistics. The frame is built from plain tuples rather
# than wrapping each row in a pd.Series: a Series that mixes ints and floats is
# upcast to float64, which turned the two count columns into floats (totals
# were rendered as "6128.0").
stats_columns = ["Semantic_Matched_Count", "Total_Skills", "Semantic_Match_Rate(%)", "Semantic_Match_Score"]
stats_rows = [semantic_score_stats(s, threshold=0.6) for s in df["Skill"]]
df[stats_columns] = pd.DataFrame(stats_rows, columns=stats_columns, index=df.index).astype(
    {
        "Semantic_Matched_Count": "int64",
        "Total_Skills": "int64",
        "Semantic_Match_Rate(%)": "float64",
        # Rows without usable skills return None for the score; stored as NaN.
        "Semantic_Match_Score": "float64",
    }
)

In [9]:
# Corpus-level totals. Cast to int because the per-row count columns may be
# stored as floats, which rendered the totals as "6128.0" in the report.
total_matched = int(df["Semantic_Matched_Count"].sum())
total_skills = int(df["Total_Skills"].sum())

# Share of skills whose best cosine similarity reached the threshold.
# NOTE(review): without ground truth this is a match rate, not a true
# precision; recall is simply set equal to it, so F1 equals the same value.
precision = round(total_matched / total_skills, 4) if total_skills else 0.0
recall = precision  # assumes no ground truth is available
f1_score = round(2 * precision * recall / (precision + recall), 4) if precision + recall > 0 else 0.0
# Mean of the per-row average scores (each row weighs the same regardless of
# how many skills it lists).
avg_semantic_score = round(df["Semantic_Match_Score"].mean(), 4)

In [10]:
# Render the corpus-level matching summary as a Markdown report under the cell.
summary_markdown = f"""
## 🧠 Semantic Matching – Sentence-BERT

- **Tổng kỹ năng**: `{total_skills}`
- **Tổng kỹ năng match được**: `{total_matched}`
- **Precision (approx)**: `{precision * 100:.2f}%`
- **Recall (approx)**: `{recall * 100:.2f}%`
- **F1 Score (approx)**: `{f1_score * 100:.2f}%`
- **Điểm cosine trung bình**: `{avg_semantic_score:.4f}`
"""
display(Markdown(summary_markdown))


## 🧠 Semantic Matching – Sentence-BERT

- **Tổng kỹ năng**: `6128.0`
- **Tổng kỹ năng match được**: `6118.0`
- **Precision (approx)**: `99.84%`
- **Recall (approx)**: `99.84%`
- **F1 Score (approx)**: `99.84%`
- **Điểm cosine trung bình**: `0.9986`


In [11]:
# Preview the first ten rows: course name, raw skills and the matching results.
preview_columns = ["Tên MH", "Skill", "Semantic_Matched_Skills", "Semantic_Match_Rate(%)", "Semantic_Match_Score"]
display(df[preview_columns].head(10))

# Persist the enriched table (without the index column) to the working directory.
df.to_excel("courses_with_semantic_matching.xlsx", index=False)


Unnamed: 0,Tên MH,Skill,Semantic_Matched_Skills,Semantic_Match_Rate(%),Semantic_Match_Score
0,Hệ thống thông tin kế toán,"Accounting Cycle Analysis, Business Process Mo...","[accounting cycle analysis, business process m...",100.0,1.0
1,Hoạch định nguồn lực doanh nghiệp,"ERP System Analysis, Business Process Modeling...","[erp system analysis, business process modelin...",100.0,0.9795
2,Giới thiệu ngành Kỹ Thuật Máy tính,"Industry Awareness, Career Planning, ICT Trend...","[industry awareness, career planning, ict tren...",100.0,1.0
3,Vi xử lý-vi điều khiển,Microprocessor and Microcontroller Fundamental...,[microprocessor and microcontroller fundamenta...,100.0,0.9732
4,Xử lý tín hiệu số,"Digital Signal Processing, Discrete-Time Syste...","[digital signal processing, discrete-time syst...",100.0,1.0
5,Thiết kế luận lý số,"Sequential Circuit Design, Memory Component An...","[sequential circuit design, memory component a...",100.0,1.0
6,Thực hành Kiến trúc máy tính,"FPGA System Development, Nios II Soft Processo...","[fpga system development, nios ii soft process...",100.0,1.0
7,Lý thuyết mạch điện,"Electrical Circuit Analysis, Equivalent Circui...","[electrical circuit analysis, equivalent circu...",100.0,1.0
8,Các thiết bị và mạch điện tử,"Electronic Circuit Analysis, Amplifier Design,...","[electronic circuit analysis, amplifier design...",100.0,1.0
9,Đồ án 1,"Basic Circuit Design, Embedded Software Develo...","[basic circuit design, embedded software devel...",100.0,1.0
