In [13]:
import sys
!{sys.executable} -m pip install rapidfuzz





In [14]:
import pandas as pd
from rapidfuzz import fuzz, process
import re

In [15]:
df = pd.read_excel("../courses_flattened.xlsx")  

all_raw_skills = df["Skill"].dropna().str.split(',').explode().str.strip()
unique_skills = sorted(set(all_raw_skills))

In [16]:
filtered_skills = [
    re.sub(r"[^\w\s\-+/]", "", skill.strip().lower())
    for skill in unique_skills
    if len(skill.split()) <= 5
]

In [17]:
standard_skills = sorted(set(s for s in filtered_skills if len(s) > 2))

In [18]:
def match_skills(raw_skills: str, standard_skills: list, threshold: int = 80):
    matched = []
    if not isinstance(raw_skills, str):
        return matched

    for skill in raw_skills.split(','):
        skill = skill.strip()
        if not skill:
            continue
        best_match, score, _ = process.extractOne(skill, standard_skills, scorer=fuzz.token_sort_ratio)
        if score >= threshold:
            matched.append(best_match)
        else:
            matched.append(f"[Unmatched] {skill}")
    return matched

In [19]:
def average_matching_score(raw_skills: str, standard_skills: list):
    scores = []
    if not isinstance(raw_skills, str):
        return None
    for skill in raw_skills.split(','):
        skill = skill.strip()
        if not skill:
            continue
        _, score, _ = process.extractOne(skill, standard_skills, scorer=fuzz.token_sort_ratio)
        scores.append(score)
    return round(sum(scores) / len(scores), 2) if scores else None

In [20]:
# === 5. Thống kê độ chính xác: matched/total
def matching_stats(raw_skills: str, standard_skills: list, threshold: int = 80):
    total, matched = 0, 0
    if not isinstance(raw_skills, str):
        return 0, 0, 0.0
    for skill in raw_skills.split(','):
        skill = skill.strip()
        if not skill:
            continue
        total += 1
        _, score, _ = process.extractOne(skill, standard_skills, scorer=fuzz.token_sort_ratio)
        if score >= threshold:
            matched += 1
    percent = round(matched / total * 100, 2) if total else 0.0
    return matched, total, percent

In [21]:
df["Matched_Skills"] = df["Skill"].apply(lambda s: match_skills(s, standard_skills))
df["Matching_Score"] = df["Skill"].apply(lambda s: average_matching_score(s, standard_skills))
df[["Matched_Count", "Total_Skills", "Match_Rate(%)"]] = df["Skill"].apply(
    lambda s: pd.Series(matching_stats(s, standard_skills))
)

In [22]:
total_matched = df["Matched_Count"].sum()
total_skills = df["Total_Skills"].sum()

precision = round(total_matched / total_skills, 4) if total_skills else 0.0
recall = precision  # giả định không có false negative vì không có ground truth
f1_score = round(2 * precision * recall / (precision + recall), 4) if precision + recall > 0 else 0.0
avg_match_score = round(df["Matching_Score"].mean(), 2)


In [23]:
from IPython.display import display, Markdown

display(Markdown(f"""
## 📊 Đánh giá độ chính xác fuzzy matching

- **Tổng kỹ năng**: `{total_skills}`
- **Tổng kỹ năng match được**: `{total_matched}`
- **Precision (approx)**: `{precision * 100:.2f}%`
- **Recall (approx)**: `{recall * 100:.2f}%`
- **F1 Score (approx)**: `{f1_score * 100:.2f}%`
- **Điểm fuzzy trung bình**: `{avg_match_score}/100`
"""))


## 📊 Đánh giá độ chính xác fuzzy matching

- **Tổng kỹ năng**: `6128.0`
- **Tổng kỹ năng match được**: `4527.0`
- **Precision (approx)**: `73.87%`
- **Recall (approx)**: `73.87%`
- **F1 Score (approx)**: `73.87%`
- **Điểm fuzzy trung bình**: `82.81/100`


In [24]:
display(df[["Tên MH", "Skill", "Matched_Skills", "Match_Rate(%)", "Matching_Score"]].head(10))

# === 10. Ghi file nếu muốn
df.to_excel("courses_with_matched_skills.xlsx", index=False)


Unnamed: 0,Tên MH,Skill,Matched_Skills,Match_Rate(%),Matching_Score
0,Hệ thống thông tin kế toán,"Accounting Cycle Analysis, Business Process Mo...","[accounting cycle analysis, business process m...",93.33,87.51
1,Hoạch định nguồn lực doanh nghiệp,"ERP System Analysis, Business Process Modeling...","[[Unmatched] ERP System Analysis, [Unmatched] ...",71.43,82.46
2,Giới thiệu ngành Kỹ Thuật Máy tính,"Industry Awareness, Career Planning, ICT Trend...","[industry awareness, career planning, [Unmatch...",88.89,87.21
3,Vi xử lý-vi điều khiển,Microprocessor and Microcontroller Fundamental...,[microprocessor and microcontroller fundamenta...,80.0,83.66
4,Xử lý tín hiệu số,"Digital Signal Processing, Discrete-Time Syste...","[digital signal processing, discrete-time syst...",66.67,82.14
5,Thiết kế luận lý số,"Sequential Circuit Design, Memory Component An...","[sequential circuit design, memory component a...",87.5,87.08
6,Thực hành Kiến trúc máy tính,"FPGA System Development, Nios II Soft Processo...","[[Unmatched] FPGA System Development, nios ii ...",80.0,84.17
7,Lý thuyết mạch điện,"Electrical Circuit Analysis, Equivalent Circui...","[electrical circuit analysis, equivalent circu...",100.0,88.18
8,Các thiết bị và mạch điện tử,"Electronic Circuit Analysis, Amplifier Design,...","[electronic circuit analysis, amplifier design...",72.73,76.69
9,Đồ án 1,"Basic Circuit Design, Embedded Software Develo...","[basic circuit design, embedded software devel...",80.0,86.51
