# Technical Test PT Allure - CV Matcher - Muhammad Luthfi Juliandri

In [145]:
!pip install pdfplumber gradio sentence-transformers pandas scikit-learn



In [146]:
import gradio as gr
import pdfplumber
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sentence_transformers import SentenceTransformer, util
from datetime import datetime

# Sentence-embedding model, created once at module level and reused by
# match_cv_realistic to embed both the CV text and the job description.
model_embed = SentenceTransformer('all-MiniLM-L6-v2')
# Placeholder for the classifier; train_realistic_model() rebinds this global
# to a fitted RandomForestClassifier before any prediction is made.
clf = None

In [147]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, joined with newlines.

    Pages whose ``extract_text()`` result is falsy (``None`` or ``""``) are
    skipped, so they contribute neither text nor blank lines.

    Args:
        file_path: Value passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts joined by ``"\\n"``; ``""`` if no page has text.
    """
    with pdfplumber.open(file_path) as pdf:
        # Extract each page exactly once. The previous version called
        # extract_text() twice per page (once to filter, once to join).
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

In [148]:
def estimate_experience_years(text):
    """Estimate years of work experience from year ranges in a CV's text.

    The text is lower-cased and scanned line by line. The experience section
    starts at the first line containing one of the experience headings
    (compared with spaces removed) and ends at the first later line containing
    a stop heading, or 40 lines after the start when no stop heading follows.
    Inside that section every ``20xx - 20xx`` / ``20xx - present`` style range
    is collected, and the estimate is the span from the earliest start year to
    the latest end year.

    Args:
        text: Plain text of the CV.

    Returns:
        float: Estimated whole years of experience; ``0.0`` when no experience
        heading or no valid year range is found.
    """
    current_year = datetime.now().year
    lines = text.lower().splitlines()

    experience_headings = [
        'work experience', 'work experiences', 'professional experience',
        'experiences', 'pengalaman kerja', 'riwayat karier', 'riwayat pekerjaan'
    ]
    stop_headings = ['education', 'pendidikan', 'academic background', 'school', 'university']
    # Fallback section length when no stop heading follows the experience heading.
    max_section_lines = 40

    def normalize(line):
        # Headings are compared with all spaces removed, so spaced-out
        # headings such as "w o r k  e x p e r i e n c e" still match.
        return line.replace(' ', '').strip()

    experience_keys = [normalize(h) for h in experience_headings]
    stop_keys = [normalize(h) for h in stop_headings]

    start_idx = None
    stop_idx = None
    for i, line in enumerate(lines):
        norm_line = normalize(line)
        if start_idx is None:
            if any(key in norm_line for key in experience_keys):
                start_idx = i
        elif any(key in norm_line for key in stop_keys):
            stop_idx = i
            break

    if start_idx is None:
        return 0.0

    section_end = stop_idx if stop_idx is not None else start_idx + max_section_lines
    section = lines[start_idx:section_end]

    # Hyphen, en dash and em dash are all accepted as range separators.
    pattern = re.compile(
        r'(20\d{2})\s*[-–—]\s*(20\d{2}|present|sekarang|saat ini)',
        flags=re.IGNORECASE,
    )

    ranges = []
    for line in section:
        for start_raw, end_raw in pattern.findall(line):
            start_year = int(start_raw)
            # Non-numeric end markers ("present", "sekarang", ...) mean "now".
            end_year = int(end_raw) if end_raw.isdigit() else current_year
            # Experience cannot extend into the future.
            end_year = min(end_year, current_year)
            if end_year < start_year:
                # Reversed or not-yet-started range: ignore instead of
                # letting it produce a negative or inflated estimate.
                continue
            ranges.append((start_year, end_year))

    if not ranges:
        return 0.0

    earliest_start = min(start for start, _ in ranges)
    latest_end = max(end for _, end in ranges)

    # Always a float, matching the 0.0 returned by the early exits.
    return float(latest_end - earliest_start)


In [149]:
def count_skill_matches(text, skills):
    """Count how many of the given skills are mentioned in the text.

    Matching is case-insensitive and anchored on token boundaries, so a skill
    only counts when it appears as a standalone term: ``"java"`` does not
    match inside ``"javascript"`` and ``"c"`` does not match ``"c++"``.
    Whitespace inside a multi-word skill matches any run of whitespace,
    including a line break.

    Args:
        text: Text to search (e.g. the extracted CV text).
        skills: Iterable of skill strings. Blank entries are ignored.

    Returns:
        int: Number of entries in ``skills`` found in ``text``.
    """
    haystack = text.lower()  # lower-case once instead of once per skill
    matched = 0
    for skill in skills:
        words = skill.lower().split()
        if not words:
            continue  # an empty skill would otherwise match everything
        # (?<!\w) / (?![\w+#]) act as boundaries that also work for skills
        # ending in symbols such as "c++" or "c#", where \b would fail.
        pattern = r'(?<!\w)' + r'\s+'.join(re.escape(w) for w in words) + r'(?![\w+#])'
        if re.search(pattern, haystack):
            matched += 1
    return matched

In [150]:
def train_realistic_model():
    """Fit the module-level classifier ``clf`` on a small hand-written dataset.

    Each training row is ``[skill, exp, sim, length, label]``. The feature
    order matches what match_cv_realistic passes at prediction time: number
    of matched skills, estimated years of experience, similarity score and
    CV word count. ``label`` is the target class; the probability of class 1
    is what match_cv_realistic reports as the score.

    Side effects:
        Rebinds the global ``clf`` to a fitted ``RandomForestClassifier``.
    """
    global clf
    data = [
        [3, 5, 70, 1200, 1],
        [2, 4, 60, 1000, 1],
        [1, 10, 30, 1500, 0],
        [0, 0, 20, 900, 0],
        [3, 2, 65, 1100, 1],
        [0, 7, 25, 800, 0],
        [2, 1, 55, 950, 1],
        [1, 0, 40, 700, 0]
    ]
    df = pd.DataFrame(data, columns=["skill", "exp", "sim", "length", "label"])
    X = df[["skill", "exp", "sim", "length"]]
    y = df["label"]
    # Fixed seed: a random forest is stochastic, and without a seed the same
    # CV would receive a different score after every kernel restart.
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X, y)

In [151]:
def match_cv_realistic(job_title, skills_csv, min_exp, cvs):
    """Score uploaded CVs against a job description and return an HTML ranking.

    For every CV the function extracts the PDF text, counts matched skills,
    estimates years of experience, computes the embedding similarity to a
    generated job sentence, and feeds those features to the module-level
    classifier ``clf``. The probability of class 1 (halved when the candidate
    has less than ``min_exp`` years of experience) becomes the score.

    Args:
        job_title: Job title inserted into the generated job sentence.
        skills_csv: Comma-separated required skills; blank entries are dropped.
        min_exp: Minimum years of experience; ``None`` is treated as 0.
        cvs: Uploaded CVs, each either a path string or an object whose
            ``name`` attribute is the file path. May be ``None`` or empty.

    Returns:
        str: HTML table with one row per CV sorted by ``Score`` (descending),
        or a short HTML message when no CV was provided. A CV that fails to
        process still gets a row, with score 0 and the error in ``Alasan``.
    """
    if not cvs:
        # Nothing uploaded: avoid iterating None / sorting an empty frame.
        return "<p>Tidak ada CV yang diunggah.</p>"

    skills = [s.strip().lower() for s in (skills_csv or '').split(',') if s.strip()]
    job_text = f"Kami mencari {job_title} yang menguasai {', '.join(skills)}"
    min_exp = 0 if min_exp is None else min_exp

    if clf is None:
        # Cell run before the training cell: train now instead of failing
        # every CV with an AttributeError on None.
        train_realistic_model()

    # The job embedding is identical for every CV, so compute it once.
    emb_job = model_embed.encode(job_text, convert_to_tensor=True)

    results = []
    for cv in cvs:
        path = cv if isinstance(cv, str) else getattr(cv, 'name', 'cv.pdf')
        # Show only the base name, for both success and failure rows.
        filename = str(path).replace('\\', '/').rsplit('/', 1)[-1]
        try:
            text = extract_text_from_pdf(path)
            skill_score = count_skill_matches(text, skills)
            exp_years = estimate_experience_years(text)
            emb_cv = model_embed.encode(text, convert_to_tensor=True)
            sim_score = util.cos_sim(emb_cv, emb_job).item() * 100
            length = len(text.split())
            # Same column names as the training frame, so scikit-learn does
            # not warn about missing feature names. The similarity is doubled
            # as in the original code — presumably to match the scale of the
            # training data's "sim" column; TODO confirm.
            features = pd.DataFrame(
                [[skill_score, exp_years, sim_score * 2, length]],
                columns=["skill", "exp", "sim", "length"],
            )
            pred = clf.predict_proba(features)[0][1]
            alasan = []
            if exp_years < min_exp:
                pred *= 0.5
                alasan.append(f'Pengalaman kurang dari {min_exp} tahun')
            if skill_score == 0:
                alasan.append('Tidak ada skill yang cocok')
            if sim_score < 40:
                alasan.append('Similarity rendah')
            if not alasan:
                alasan.append('Cocok dengan deskripsi pekerjaan')
            results.append({
                'Filename': filename,
                'Skill Match': skill_score,
                'Experience (yrs)': round(exp_years, 1),
                'Similarity': round(sim_score, 2),
                'Score': round(pred * 100, 2),
                'Alasan': '; '.join(alasan)
            })
        except Exception as e:
            # Best effort: one unreadable CV must not abort the whole batch.
            # Keys must match the success row exactly, otherwise pandas
            # creates an extra, half-empty column.
            results.append({
                'Filename': filename,
                'Skill Match': 0,
                'Experience (yrs)': 0,
                'Similarity': 0,
                'Score': 0,
                'Alasan': f"❌ Gagal diproses: {e}"
            })
    df = pd.DataFrame(results).sort_values(by='Score', ascending=False)
    return df.to_html(index=False, border=0)

In [152]:
# Train the classifier before serving: match_cv_realistic reads the global `clf`.
train_realistic_model()
gr.Interface(
    # Pass the function directly; a lambda that only forwards its arguments
    # adds nothing.
    fn=match_cv_realistic,
    inputs=[
        gr.Textbox(label="Job Title"),
        gr.Textbox(label="Skill yang Dibutuhkan (pisahkan dengan koma)"),
        # Default of 0 so an untouched field does not reach the handler as None.
        gr.Number(label="Minimal Tahun Pengalaman", value=0),
        gr.File(label="Upload Banyak CV (PDF)", file_types=[".pdf"], file_count="multiple"),
    ],
    outputs=gr.HTML(label="Hasil Ranking dengan Model"),
    title="Teknikal Tes AI CV Matcher PT. Allure - Muhammad Luthfi Juliandri"
).launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://240382c7428a7314eb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


