# STEP 0: Persiapan Environment
Instalasi dan import pustaka yang diperlukan untuk preprocessing teks.

In [7]:
# Install library tambahan
!pip uninstall -y nltk Sastrawi
!pip install nltk Sastrawi swifter tqdm gensim sentence-transformers


Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Found existing installation: Sastrawi 1.0.1
Uninstalling Sastrawi-1.0.1:
  Successfully uninstalled Sastrawi-1.0.1
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting Sastrawi
  Using cached Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
Installing collected packages: Sastrawi, nltk
Successfully installed Sastrawi-1.0.1 nltk-3.9.1



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\nurul\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
# Import pustaka
import pandas as pd
import re
import string
import nltk
import swifter
import os
import pickle
import numpy as np
import json
#import gdown
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# --- SETTINGS UNTUK TAMPILAN PANDAS ---
pd.set_option('display.max_columns', None)  # Tampilkan semua kolom
pd.set_option('display.width', 1000)      # Atur lebar tampilan di konsol
pd.set_option('display.max_rows', None)   # Tampilkan semua baris (gunakan hati-hati untuk DataFrame besar)
# ------------------------------------

tqdm.pandas() # Untuk progress bar di swifter

# Download resource NLTK
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

swifter.set_defaults(display_progress=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nurul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nurul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nurul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# STEP 1: Load Dataset

Di hands-on kedua ini, saya akan membuat sistem rekomendasi pekerjaan berdasarkan deskripsi course.

Jadi, sistem rekomendasi ini bisa digunakan untuk merekomendasikan kepada user untuk mendaftar ke sebuah pekerjaan setelah ia berhasil menyelesaikan sebuah course.

Dataset yang digunakan ada 2, yaitu:

## 1. Dealls Jobs Dataset
Dataset ini terdiri dari 4000+ baris data dengan isian berbagai kolom sebagai berikut:
- `Course Name`: Judul nama kursus.
- `Course Description`: Deskripsi detail isi kursus
- `Skills`: Kumpulan skill yang diajarkan dalam kursus

## 2. Coursera Courses Dataset
Dataset ini diambil dari kaggle yang memuat informasi kursus yang tersedia di Coursera, kolom yang digunakan hanya kolom nama kursus, deskripsi, dan skill yang diajarkan.

Kolom dataset:
- `Course Name`: Judul nama kursus.
- `Course Description`: Deskripsi detail isi kursus
- `Skills`: Kumpulan skill yang diajarkan dalam kursus

Sumber Dataset:
- Dealls Jobs Dataset = diperoleh dari hasil scrapping langsung menggunakan selenium.
- [Coursera Courses Dataset 2021](https://www.kaggle.com/datasets/khusheekapoor/coursera-courses-dataset-2021)

In [23]:
# Set Up Folder untuk Simpan Model
MODEL_DIR = 'recommender_models'
os.makedirs(MODEL_DIR, exist_ok=True)

In [11]:
# Import Dataset Job
df_jobs = pd.read_csv('jobs_data.csv')
df_jobs.head()

Unnamed: 0,Kategori,Job Title,Company,Work Type,Work Mode,Experience,Salary,Detail Link,Job Description,Qualification,Benefit
0,Data & Product,Product - Director,Chery Indonesia,Penuh waktu,On-site • Jakarta,Min. 5+ years of experience,Negotiable,https://dealls.com/loker/product-director~cher...,"Membangun roadmap dan life-cycle produk, terma...",10+ tahun pengalaman dalam prinsipal otomotif ...,Competitive Salary\nMedical Insurance\nProfess...
1,Data & Product,Product Planning - Staff,Chery Indonesia,Penuh waktu,On-site • Jakarta,Min. 4 years of experience,Negotiable,https://dealls.com/loker/product-planning-staf...,Menganalisis data pasar dan survei untuk menge...,Pengalaman lebih dari 1 tahun sebagai product ...,Competitive Salary\nMedical Insurance\nProfess...
2,Data & Product,Strategic Sourcing - Officer,PT Merdeka Copper Gold Tbk,Penuh waktu,On-site • Jakarta,Min. 1 years of experience,Negotiable,https://dealls.com/loker/strategic-sourcing-of...,Menganalisis kebutuhan pengguna dalam hal spes...,Pengalaman minimal 1 tahun di bidang Manajemen...,Competitive Salary\nMedical Insurance\nProfess...
3,Data & Product,Process Management System Analyst,"PT. Garudafood Putra Putri Jaya, Tbk",Penuh waktu,On-site • Jakarta,Min. 1 years of experience,Negotiable,https://dealls.com/loker/process-management-sy...,1. Memfasilitasi standardisasi proses\n2. Meny...,1. Mempunyai pemahaman terkait standardisasi p...,Competitive Salary\nMedical Insurance\nProfess...
4,Data & Product,Measurement Product - Specialist,Google Indonesia,Penuh waktu,On-site • Jakarta,Min. 4 years of experience,Negotiable,https://dealls.com/loker/measurement-product-s...,Experience with implementation/configuration o...,Bachelor's degree or equivalent practical expe...,Competitive Salary\nMedical Insurance\nProfess...


In [12]:
# Import Dataset Course
df_courses = pd.read_csv('courses_data.csv')
df_courses.head()

Unnamed: 0,Title,Institution,Type,Level,Duration,Description,Rating,Category,Subcategory,Skills,Modules Name,Modules Description
0,.NET FullStack Developer,Board Infinity,Specialization,Intermediate,1 - 3 Months,Build Fullstack webapp with .NET technologies....,4.1,Computer Science,Mobile and Web Development,"HTML and CSS, Web Services, Javascript, Micros...","['.Net Full Stack Foundation', 'Frontend Devel...",['Build dynamic web applications using ASP.NET...
1,3D Interaction Design in Virtual Reality,University of London,Course,Intermediate,1 - 4 Weeks,This course is part of Virtual Reality Special...,4.6,Computer Science,Design and Product,"Prototyping, Human Computer Interaction, Usabi...","['Interaction in VR', 'Moving around in VR', '...","[""Welcome to Week 1! In this week, we will cov..."
2,3D Modeling for 3D Printing and Laser Cutting ...,Packt,Course,Intermediate,1 - 3 Months,,5.0,Physical Science and Engineering,Mechanical Engineering,"Autodesk, Engineering Tolerance, Visualization...","['Getting started', '3D Modeling Fundamentals'...","['In this module, we will explore the foundati..."
3,3D Printing and Additive Manufacturing,University of Illinois Urbana-Champaign,Specialization,Beginner,3 - 6 Months,Turn Your Ideas into Objects with 3D Printing....,4.6,Computer Science,Design and Product,"3D Modeling, Hardware Troubleshooting, Design ...","['The 3D Printing Revolution', '3D Printing Ap...","['Obtain a rich understanding of 3D printing, ..."
4,3D Printing Hardware,University of Illinois Urbana-Champaign,Course,Beginner,1 - 3 Months,This course is part of 3D Printing and Additiv...,4.2,Business,Entrepreneurship,"Electronic Components, Hardware Design, Manufa...",['Course Orientation & Module 1 Origins of Des...,"['In this welcome module, you will become fami..."


# STEP 2: Preprocessing
Di tahap ini, akan dilakukan pembersihan pada data. Untuk dataset jobs, akan dilakukan penggabungan pada kolom `Job Title`, `Job Description`, dan `Qualification` menjadi satu kolom, yaitu `Job Text`. Demikian pula untuk dataset course, akan diambil beberapa kolom yang mewakili informasi kursus, yaitu kolom `Title`, `Description`, dan `Skills` yang akan digabungkan menjadi satu kolom dengan nama `Course Text`.



In [13]:
df_jobs['Job Text'] = df_jobs['Job Title'] +  ' ' + df_jobs['Job Description'] + ' ' + df_jobs['Qualification']
df_jobs.head()

Unnamed: 0,Kategori,Job Title,Company,Work Type,Work Mode,Experience,Salary,Detail Link,Job Description,Qualification,Benefit,Job Text
0,Data & Product,Product - Director,Chery Indonesia,Penuh waktu,On-site • Jakarta,Min. 5+ years of experience,Negotiable,https://dealls.com/loker/product-director~cher...,"Membangun roadmap dan life-cycle produk, terma...",10+ tahun pengalaman dalam prinsipal otomotif ...,Competitive Salary\nMedical Insurance\nProfess...,Product - Director Membangun roadmap dan life-...
1,Data & Product,Product Planning - Staff,Chery Indonesia,Penuh waktu,On-site • Jakarta,Min. 4 years of experience,Negotiable,https://dealls.com/loker/product-planning-staf...,Menganalisis data pasar dan survei untuk menge...,Pengalaman lebih dari 1 tahun sebagai product ...,Competitive Salary\nMedical Insurance\nProfess...,Product Planning - Staff Menganalisis data pas...
2,Data & Product,Strategic Sourcing - Officer,PT Merdeka Copper Gold Tbk,Penuh waktu,On-site • Jakarta,Min. 1 years of experience,Negotiable,https://dealls.com/loker/strategic-sourcing-of...,Menganalisis kebutuhan pengguna dalam hal spes...,Pengalaman minimal 1 tahun di bidang Manajemen...,Competitive Salary\nMedical Insurance\nProfess...,Strategic Sourcing - Officer Menganalisis kebu...
3,Data & Product,Process Management System Analyst,"PT. Garudafood Putra Putri Jaya, Tbk",Penuh waktu,On-site • Jakarta,Min. 1 years of experience,Negotiable,https://dealls.com/loker/process-management-sy...,1. Memfasilitasi standardisasi proses\n2. Meny...,1. Mempunyai pemahaman terkait standardisasi p...,Competitive Salary\nMedical Insurance\nProfess...,Process Management System Analyst 1. Memfasili...
4,Data & Product,Measurement Product - Specialist,Google Indonesia,Penuh waktu,On-site • Jakarta,Min. 4 years of experience,Negotiable,https://dealls.com/loker/measurement-product-s...,Experience with implementation/configuration o...,Bachelor's degree or equivalent practical expe...,Competitive Salary\nMedical Insurance\nProfess...,Measurement Product - Specialist Experience wi...


In [14]:
df_courses['Course Text'] = df_courses['Title'] +  ' ' + df_courses['Description'] + ' ' + df_courses['Skills']
df_courses.head()

Unnamed: 0,Title,Institution,Type,Level,Duration,Description,Rating,Category,Subcategory,Skills,Modules Name,Modules Description,Course Text
0,.NET FullStack Developer,Board Infinity,Specialization,Intermediate,1 - 3 Months,Build Fullstack webapp with .NET technologies....,4.1,Computer Science,Mobile and Web Development,"HTML and CSS, Web Services, Javascript, Micros...","['.Net Full Stack Foundation', 'Frontend Devel...",['Build dynamic web applications using ASP.NET...,.NET FullStack Developer Build Fullstack webap...
1,3D Interaction Design in Virtual Reality,University of London,Course,Intermediate,1 - 4 Weeks,This course is part of Virtual Reality Special...,4.6,Computer Science,Design and Product,"Prototyping, Human Computer Interaction, Usabi...","['Interaction in VR', 'Moving around in VR', '...","[""Welcome to Week 1! In this week, we will cov...",3D Interaction Design in Virtual Reality This ...
2,3D Modeling for 3D Printing and Laser Cutting ...,Packt,Course,Intermediate,1 - 3 Months,,5.0,Physical Science and Engineering,Mechanical Engineering,"Autodesk, Engineering Tolerance, Visualization...","['Getting started', '3D Modeling Fundamentals'...","['In this module, we will explore the foundati...",
3,3D Printing and Additive Manufacturing,University of Illinois Urbana-Champaign,Specialization,Beginner,3 - 6 Months,Turn Your Ideas into Objects with 3D Printing....,4.6,Computer Science,Design and Product,"3D Modeling, Hardware Troubleshooting, Design ...","['The 3D Printing Revolution', '3D Printing Ap...","['Obtain a rich understanding of 3D printing, ...",3D Printing and Additive Manufacturing Turn Yo...
4,3D Printing Hardware,University of Illinois Urbana-Champaign,Course,Beginner,1 - 3 Months,This course is part of 3D Printing and Additiv...,4.2,Business,Entrepreneurship,"Electronic Components, Hardware Design, Manufa...",['Course Orientation & Module 1 Origins of Des...,"['In this welcome module, you will become fami...",3D Printing Hardware This course is part of 3D...



Setelah itu akan dilakukan pembersihan data untuk masing - masing dataset dengan menghilangkan URL, simbol, angka, emoji, dan spasi berlebih. Dilakukan juga lowerisasi, tokenisasi, stopword removal, dan juga stemming.

In [15]:
# Inisialisasi Stemmer
factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()

factory_stopword = StopWordRemoverFactory()
stopword_remover = factory_stopword.create_stop_word_remover()

stop_words_en = set(stopwords.words('english'))
stop_words_id = set([
    'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'dengan', 'atau', 'pada', 'ini', 'itu',
    'sebagai', 'jika', 'ada', 'karena', 'saat', 'oleh', 'juga', 'agar', 'dalam', 'tidak',
    'adalah', 'bahwa', 'maupun', 'bagi', 'dapat', 'telah', 'sudah', 'lebih', 'harus',
    'setiap', 'kita', 'kami', 'saya', 'anda', 'mereka', 'semua', 'bisa', 'masih',
    'akan', 'dan', 'pun', 'hingga', 'dengan', 'berikut', 'namun', 'sehingga'
])
stop_words_all = stop_words_en.union(stop_words_id)

In [17]:
# Definisi Fungsi Preprocessing
def clean_noise(text):
    if not isinstance(text, str): # Pastikan input adalah string
        return ""
    text = re.sub(r'http\S+|www\S+', '', text)                       # hapus URL
    text = re.sub(r'\d+', '', text)                                  # hapus angka
    text = text.translate(str.maketrans('', '', string.punctuation)) # hapus tanda baca & simbol
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)                       # hapus emoji dan karakter aneh
    text = re.sub(r'\s+', ' ', text).strip()                         # trim spasi berlebih
    return text

def tokenisasi(text):
    return word_tokenize(text)

def stopword(tokens):
    return [word for word in tokens if word not in stop_words_all and len(word) > 2] # Filter kata < 3 huruf

def stemming(token_list):
    return [stemmer.stem(word) for word in token_list]

# Fungsi gabungan untuk preprocessing teks baru (pipeline)
def cleaning_pipeline(text):
    # Step 1: Bersihkan URL, simbol, angka, emoji
    text = clean_noise(text)

    # Step 2: Lowercase
    text = text.lower()

    # Step 3: Tokenisasi
    tokens = tokenisasi(text)

    # Step 4: Stopword Removal
    tokens = stopword(tokens)

    # Step 5: Stemming
    stemmed = stemming(tokens)

    return ' '.join(stemmed)


In [18]:
df_jobs['Job Text'] = df_jobs['Job Text'].swifter.apply(cleaning_pipeline)
df_jobs.head()

Pandas Apply: 100%|██████████| 900/900 [07:00<00:00,  2.14it/s]


Unnamed: 0,Kategori,Job Title,Company,Work Type,Work Mode,Experience,Salary,Detail Link,Job Description,Qualification,Benefit,Job Text
0,Data & Product,Product - Director,Chery Indonesia,Penuh waktu,On-site • Jakarta,Min. 5+ years of experience,Negotiable,https://dealls.com/loker/product-director~cher...,"Membangun roadmap dan life-cycle produk, terma...",10+ tahun pengalaman dalam prinsipal otomotif ...,Competitive Salary\nMedical Insurance\nProfess...,product director bangun roadmap lifecycle prod...
1,Data & Product,Product Planning - Staff,Chery Indonesia,Penuh waktu,On-site • Jakarta,Min. 4 years of experience,Negotiable,https://dealls.com/loker/product-planning-staf...,Menganalisis data pasar dan survei untuk menge...,Pengalaman lebih dari 1 tahun sebagai product ...,Competitive Salary\nMedical Insurance\nProfess...,product planning staff analis data pasar surve...
2,Data & Product,Strategic Sourcing - Officer,PT Merdeka Copper Gold Tbk,Penuh waktu,On-site • Jakarta,Min. 1 years of experience,Negotiable,https://dealls.com/loker/strategic-sourcing-of...,Menganalisis kebutuhan pengguna dalam hal spes...,Pengalaman minimal 1 tahun di bidang Manajemen...,Competitive Salary\nMedical Insurance\nProfess...,strategic sourcing officer analis butuh guna h...
3,Data & Product,Process Management System Analyst,"PT. Garudafood Putra Putri Jaya, Tbk",Penuh waktu,On-site • Jakarta,Min. 1 years of experience,Negotiable,https://dealls.com/loker/process-management-sy...,1. Memfasilitasi standardisasi proses\n2. Meny...,1. Mempunyai pemahaman terkait standardisasi p...,Competitive Salary\nMedical Insurance\nProfess...,process management system analyst fasilitas st...
4,Data & Product,Measurement Product - Specialist,Google Indonesia,Penuh waktu,On-site • Jakarta,Min. 4 years of experience,Negotiable,https://dealls.com/loker/measurement-product-s...,Experience with implementation/configuration o...,Bachelor's degree or equivalent practical expe...,Competitive Salary\nMedical Insurance\nProfess...,measurement product specialist experience impl...


In [19]:
# Pecah kolom 'Work Mode' pada dataset Job menjadi dua kolom baru
df_jobs[['Work Arrangement', 'Location']] = df_jobs['Work Mode'].str.split(' • ', expand=True)

# Bersihkan spasi berlebih
df_jobs['Work Arrangement'] = df_jobs['Work Arrangement'].str.strip()
df_jobs['Location'] = df_jobs['Location'].str.strip()

print("✅ Work Mode dipecah menjadi Work Arrangement dan Location:")
df_jobs[['Work Mode', 'Work Arrangement', 'Location']].head()


✅ Work Mode dipecah menjadi Work Arrangement dan Location:


Unnamed: 0,Work Mode,Work Arrangement,Location
0,On-site • Jakarta,On-site,Jakarta
1,On-site • Jakarta,On-site,Jakarta
2,On-site • Jakarta,On-site,Jakarta
3,On-site • Jakarta,On-site,Jakarta
4,On-site • Jakarta,On-site,Jakarta


In [20]:
# Simpan ke CSV supaya tidak perlu preprocess ulang selama data masih sama
df_jobs.to_csv('df_jobs_cleaned.csv', index=False)
print("✅ Preprocessing data job selesai, data disimpan ke df_jobs_cleaned.csv")


✅ Preprocessing data job selesai, data disimpan ke df_jobs_cleaned.csv


In [21]:
df_courses['Course Text'] = df_courses['Course Text'].swifter.apply(cleaning_pipeline)
df_courses.head()

Pandas Apply: 100%|██████████| 3338/3338 [01:29<00:00, 37.16it/s] 


Unnamed: 0,Title,Institution,Type,Level,Duration,Description,Rating,Category,Subcategory,Skills,Modules Name,Modules Description,Course Text
0,.NET FullStack Developer,Board Infinity,Specialization,Intermediate,1 - 3 Months,Build Fullstack webapp with .NET technologies....,4.1,Computer Science,Mobile and Web Development,"HTML and CSS, Web Services, Javascript, Micros...","['.Net Full Stack Foundation', 'Frontend Devel...",['Build dynamic web applications using ASP.NET...,net fullstack developer build fullstack webapp...
1,3D Interaction Design in Virtual Reality,University of London,Course,Intermediate,1 - 4 Weeks,This course is part of Virtual Reality Special...,4.6,Computer Science,Design and Product,"Prototyping, Human Computer Interaction, Usabi...","['Interaction in VR', 'Moving around in VR', '...","[""Welcome to Week 1! In this week, we will cov...",interaction design virtual reality course part...
2,3D Modeling for 3D Printing and Laser Cutting ...,Packt,Course,Intermediate,1 - 3 Months,,5.0,Physical Science and Engineering,Mechanical Engineering,"Autodesk, Engineering Tolerance, Visualization...","['Getting started', '3D Modeling Fundamentals'...","['In this module, we will explore the foundati...",
3,3D Printing and Additive Manufacturing,University of Illinois Urbana-Champaign,Specialization,Beginner,3 - 6 Months,Turn Your Ideas into Objects with 3D Printing....,4.6,Computer Science,Design and Product,"3D Modeling, Hardware Troubleshooting, Design ...","['The 3D Printing Revolution', '3D Printing Ap...","['Obtain a rich understanding of 3D printing, ...",printing additive manufacturing turn ideas obj...
4,3D Printing Hardware,University of Illinois Urbana-Champaign,Course,Beginner,1 - 3 Months,This course is part of 3D Printing and Additiv...,4.2,Business,Entrepreneurship,"Electronic Components, Hardware Design, Manufa...",['Course Orientation & Module 1 Origins of Des...,"['In this welcome module, you will become fami...",printing hardware course part printing additiv...


In [24]:
# Simpan ke CSV supaya tidak perlu preprocess ulang selama data masih sama
df_courses.to_csv('df_courses_cleaned.csv', index=False)
print("✅ Preprocessing data course selesai, data disimpan ke df_courses_cleaned.csv")


✅ Preprocessing data course selesai, data disimpan ke df_courses_cleaned.csv


In [26]:
# Contoh input
new_text = "Dicari kandidat yang memiliki pengalaman data engineering minimal 2 tahun dan siap kerja remote."

# Preprocess
cleaned_tokens = cleaning_pipeline(new_text)

print("Teks awal :", new_text)
print("\nTeks akhir:", cleaned_tokens)


Teks awal : Dicari kandidat yang memiliki pengalaman data engineering minimal 2 tahun dan siap kerja remote.

Teks akhir: cari kandidat milik alam data engineering minimal tahun siap kerja remote


# STEP 3: Data Vectorization

## TF-IDF

In [27]:
# Inisialisasi TF-IDF Vectorizer

# max_features: Batas jumlah kata unik yang digunakan sebagai fitur.
#               Membantu mengurangi dimensi dan fokus pada kata-kata paling penting.
tfidfv = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

In [28]:
# Fill NaN values in 'Job Text' with empty strings before fitting
df_jobs['Job Text'] = df_jobs['Job Text'].fillna('')

# Fit dan transform teks job menjadi vektor
job_vectors = tfidfv.fit_transform(df_jobs['Job Text'])

# Menampilkan fitur (kata unik)
print("Fitur (Kata unik):")
print(tfidfv.get_feature_names_out())

# Menampilkan matriks fitur dalam bentuk array
print("\nMatriks TF-IDF:")
print(job_vectors.toarray())


Fitur (Kata unik):
['aadk' 'abilities' 'ability' ... 'youtube' 'youtube etc' 'youtube shorts']

Matriks TF-IDF:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [29]:
# Fill NaN values in 'Course Text' with empty strings before fitting
df_courses['Course Text'] = df_courses['Course Text'].fillna('')

# Fit dan transform teks course menjadi vektor
course_vectors = tfidfv.transform(df_courses['Course Text'])

# Menampilkan fitur (kata unik)
print("Fitur (Kata unik):")
print(tfidfv.get_feature_names_out())

# Menampilkan matriks fitur dalam bentuk array
print("\nMatriks TF-IDF:")
print(course_vectors.toarray())


Fitur (Kata unik):
['aadk' 'abilities' 'ability' ... 'youtube' 'youtube etc' 'youtube shorts']

Matriks TF-IDF:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## S-BERT

In [30]:
# Inisialisasi Model S-BERT
model_sbert = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [31]:
# Encode text job menjadi vektor dengan S-BERT
job_sbert = model_sbert.encode(df_jobs['Job Text'].tolist(), show_progress_bar=True)

# Menampilkan matriks fitur dalam bentuk array
print("\nMatriks Job BERT:")
print(job_sbert)

Batches: 100%|██████████| 29/29 [00:31<00:00,  1.07s/it]


Matriks Job BERT:
[[ 0.01020522  0.03859282 -0.02703663 ... -0.018368   -0.00933469
   0.13681997]
 [-0.01811393  0.09558722 -0.05240084 ...  0.00405787 -0.1051678
   0.05774349]
 [-0.06891727  0.07826266 -0.01722825 ...  0.05014372 -0.02776447
   0.06943014]
 ...
 [-0.1269154   0.02821202  0.00195615 ...  0.0251647  -0.01020336
   0.23226303]
 [ 0.01710818  0.00580347 -0.2612509  ... -0.6322176  -0.24471328
   0.29764432]
 [-0.1993295  -0.13659391 -0.13237616 ... -0.05296428 -0.15967247
   0.09054585]]





In [32]:
# Encode text course menjadi vektor dengan S-BERT
course_sbert = model_sbert.encode(df_courses['Course Text'].tolist(), show_progress_bar=True)

# Menampilkan matriks fitur dalam bentuk array
print("\nMatriks Course BERT:")
print(course_sbert)

Batches: 100%|██████████| 105/105 [01:08<00:00,  1.54it/s]


Matriks Course BERT:
[[ 0.06210362 -0.06608525 -0.01965539 ...  0.17173962  0.13702078
  -0.05078768]
 [ 0.16371545  0.08803593  0.19690618 ...  0.21472326 -0.38258287
   0.08740444]
 [ 0.22691739  0.08178422  0.02354267 ... -0.09982928 -0.03107583
   0.074184  ]
 ...
 [ 0.22691739  0.08178422  0.02354267 ... -0.09982928 -0.03107583
   0.074184  ]
 [ 0.22691739  0.08178422  0.02354267 ... -0.09982928 -0.03107583
   0.074184  ]
 [ 0.22691739  0.08178422  0.02354267 ... -0.09982928 -0.03107583
   0.074184  ]]





## STEP 4: Similarity Mapping



In [33]:
# Perhitungan Cosine Similarity dengan TF-IDF
similarity_tfidf = cosine_similarity(job_vectors, course_vectors)

# Matrix hasil similaritas vector
print("Matrix Cosine Similarity dengan TF-IDF:")
print(similarity_tfidf)

Matrix Cosine Similarity dengan TF-IDF:
[[0.02066029 0.         0.         ... 0.         0.         0.        ]
 [0.02199384 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.0043136  0.00300357 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [34]:
# Perhitungan Cosine Similarity dengan S-BERT
similarity_sbert = cosine_similarity(job_sbert, course_sbert)

print("Matrix Cosine Similarity dengan S-BERT:")
print(similarity_sbert)

Matrix Cosine Similarity dengan S-BERT:
[[0.28353763 0.2859233  0.12197686 ... 0.12197686 0.12197686 0.12197686]
 [0.261913   0.35098007 0.10088413 ... 0.10088413 0.10088413 0.10088413]
 [0.19131748 0.1933697  0.02685588 ... 0.02685588 0.02685588 0.02685588]
 ...
 [0.28225386 0.3713048  0.06370115 ... 0.06370115 0.06370115 0.06370115]
 [0.04816552 0.19075666 0.34666908 ... 0.34666908 0.34666908 0.34666908]
 [0.10744783 0.17865388 0.3114782  ... 0.3114782  0.3114782  0.3114782 ]]


## STEP 5: Membangun Recommender System

In [35]:
# Similarity Mapping tahap persiapan data ---
df_jobs_for_recommendation = df_jobs[['Job Title', 'Company', 'Job Description', 'Qualification', 'Benefit', 'Job Text']]
df_jobs_for_recommendation.head()

Unnamed: 0,Job Title,Company,Job Description,Qualification,Benefit,Job Text
0,Product - Director,Chery Indonesia,"Membangun roadmap dan life-cycle produk, terma...",10+ tahun pengalaman dalam prinsipal otomotif ...,Competitive Salary\nMedical Insurance\nProfess...,product director bangun roadmap lifecycle prod...
1,Product Planning - Staff,Chery Indonesia,Menganalisis data pasar dan survei untuk menge...,Pengalaman lebih dari 1 tahun sebagai product ...,Competitive Salary\nMedical Insurance\nProfess...,product planning staff analis data pasar surve...
2,Strategic Sourcing - Officer,PT Merdeka Copper Gold Tbk,Menganalisis kebutuhan pengguna dalam hal spes...,Pengalaman minimal 1 tahun di bidang Manajemen...,Competitive Salary\nMedical Insurance\nProfess...,strategic sourcing officer analis butuh guna h...
3,Process Management System Analyst,"PT. Garudafood Putra Putri Jaya, Tbk",1. Memfasilitasi standardisasi proses\n2. Meny...,1. Mempunyai pemahaman terkait standardisasi p...,Competitive Salary\nMedical Insurance\nProfess...,process management system analyst fasilitas st...
4,Measurement Product - Specialist,Google Indonesia,Experience with implementation/configuration o...,Bachelor's degree or equivalent practical expe...,Competitive Salary\nMedical Insurance\nProfess...,measurement product specialist experience impl...


In [36]:
# Recommendation Modelling: Fungsi Umum untuk Kedua Model ---
def get_job_recommendations(course_description, model_type, num_recommendations=5):
    if model_type == 'tfidf':
        course_vector = tfidfv.transform([course_description])
        similarity_matrix = cosine_similarity(course_vector, job_vectors)
    elif model_type == 'sbert':
        course_vector = model_sbert.encode([course_description])
        similarity_matrix = cosine_similarity(course_vector, job_sbert)
    else:
        raise ValueError("Invalid model_type. Choose 'tfidf' or 'sbert'.")

    # Flatten the similarity_matrix before getting top indices
    similarity_scores = similarity_matrix[0]

    # Get indices of top recommendations
    top_indices = similarity_scores.argsort()[-num_recommendations:][::-1]

    # Filter out recommendations with 0 or negative similarity score
    top_indices = [idx for idx in top_indices if similarity_scores[idx] > 0]


    if not top_indices:
        print("Peringatan: Tidak ada rekomendasi pekerjaan yang kuat (skor similaritas semuanya nol atau tidak ada).")
        return pd.DataFrame()

    recommended_jobs = df_jobs.iloc[top_indices].copy()
    recommended_jobs['Similarity Score'] = similarity_scores[top_indices]

    # Urutkan berdasarkan Similarity_Score secara menurun
    recommended_jobs = recommended_jobs.sort_values(by='Similarity Score', ascending=False)

    return recommended_jobs[['Job Title', 'Company', 'Job Description', 'Qualification', 'Benefit', 'Similarity Score']]

In [37]:
# Recommendation Modelling: Contoh Penggunaan dan Perbandingan
test_course_desc = "Saya baru saja menyelesaikan kursus mendalam tentang UI/UX Designer, saya memahami dengan baik metode design thinking"

print(f"\nQuery: {test_course_desc}")

# Rekomendasi menggunakan TF-IDF
print("\n--- Rekomendasi dengan TF-IDF ---")
recommended_jobs_tfidf = get_job_recommendations(
    test_course_desc,
    model_type='tfidf',
    num_recommendations=5
)
if not recommended_jobs_tfidf.empty:
    print(recommended_jobs_tfidf)
else:
    print("Tidak ada rekomendasi TF-IDF ditemukan.")

# Rekomendasi menggunakan S-BERT
print("\n\n\n--- Rekomendasi dengan S-BERT ---")
recommended_jobs_sbert = get_job_recommendations(
    test_course_desc,
    model_type='sbert',
    num_recommendations=5
)
if not recommended_jobs_sbert.empty:
    print(recommended_jobs_sbert)
else:
    print("Tidak ada rekomendasi S-BERT ditemukan.")


Query: Saya baru saja menyelesaikan kursus mendalam tentang UI/UX Designer, saya memahami dengan baik metode design thinking

--- Rekomendasi dengan TF-IDF ---
                                  Job Title                            Company                                    Job Description                                      Qualification                                            Benefit  Similarity Score
178       Product Designer (Based in Medan)                           Topremit  Collaborate with cross-functional teams to bra...  Minimum of 1 year experience in product design...  Competitive Salary\nMedical Insurance\nDental ...          0.148151
208  Spatial Designer (Visual Merchandiser)                             MODENA    Managing, analyzing, evaluating, and executing:  Minimum bachelor's degree\nMinimum 4 years of ...  Competitive Salary\nMedical Insurance\nTHR / B...          0.136355
199               Graphic Design Internship  PT Bank Digital BCA (BCA Digital)           

In [38]:
# Uji sistem rekomendasi dengan 5 data pertama

example_recommendations = {}

for i, course_name in enumerate(df_courses['Title'].tolist()[:5]):
    recs = get_job_recommendations(
        course_name,
        model_type='sbert',
        num_recommendations=10  # Total jumlah rekomendasi per query
    )
    print(f"\n{'='*100}")
    print(f"Query [{i+1}]: {course_name}")
    print(f"{'-'*100}")

    if not recs.empty:
        example_recommendations[f"{i+1}"] = {
            "query_text": course_name,
            "recommendations": recs.to_dict(orient='records')
        }

        for j, (_, row) in enumerate(recs.iterrows(), start=1):
            print(f"{j}. {row['Job Title']} at {row['Company']}")
            print(f"   Score: {row['Similarity Score']:.4f}")
    else:
        example_recommendations[f"{i+1}"] = {
            "query_text": course_name,
            "recommendations": []
        }
        print("No recommendations found.")


Query [1]: .NET FullStack Developer
----------------------------------------------------------------------------------------------------
1. Software Engineer (Fullstack) at Sprout Digital Labs
   Score: 0.6730
2. Software - Engineer at The Flex
   Score: 0.6452
3. Fullstack Developer at AssistX Enterprise
   Score: 0.6329
4. Full Stack Developer at United Creative
   Score: 0.5868
5. Fullstack Go Javascript Developer at ITOMMEY Group
   Score: 0.5858
6. Fullstack Developer at ITOMMEY Group
   Score: 0.5841
7. Fullstack Web Developer at PT Gree Electric Appliances Indonesia
   Score: 0.5802
8. Full Stack Developer at PT. Mandiri International Technology
   Score: 0.5790
9. .NET Developer at First Resources Ltd.
   Score: 0.5750
10. Full Stack Developer at Kharisma Potensia Indonesia
   Score: 0.5733

Query [2]: 3D Interaction Design in Virtual Reality
----------------------------------------------------------------------------------------------------
1. Visual Merchandise Project Speci

In [None]:
# Blok kode ini tidak jadi dijalankan karena model S-BERT memiliki hasil rekomendasi yang lebih baik daripada TF-IDF

# # Simpan hasil TF-IDF ke file JSON
# recommendations_tfidf_filename = 'recommendations_tfidf.json'
# with open(recommendations_tfidf_filename, 'w', encoding='utf-8') as f:
#     json.dump(example_recommendations, f, ensure_ascii=False, indent=4)
# print(f"\n✅ Hasil rekomendasi TF-IDF disimpan ke '{recommendations_tfidf_filename}'")

In [40]:
# Simpan hasil S-BERT ke file JSON
recommendations_sbert_filename = 'recommendations_sbert.json'
with open(recommendations_sbert_filename, 'w', encoding='utf-8') as f:
    json.dump(example_recommendations, f, ensure_ascii=False, indent=4)
print(f"\n✅ Hasil rekomendasi S-BERT disimpan ke '{recommendations_sbert_filename}'")


✅ Hasil rekomendasi S-BERT disimpan ke 'recommendations_sbert.json'


In [41]:
# ✅ Simpan Model, Embeddings, dan Data
model_sbert.save(os.path.join(MODEL_DIR, 'sbert_model_dir'))

with open(os.path.join(MODEL_DIR, 'sbert_embeddings.pkl'), 'wb') as f:
    pickle.dump(job_sbert, f)

with open(os.path.join(MODEL_DIR, 'df_for_recommendation.pkl'), 'wb') as f:
    pickle.dump(df_jobs_for_recommendation, f)

print("✅ Semua model, embeddings, dan data berhasil disimpan di folder 'recommender_models'.")

✅ Semua model, embeddings, dan data berhasil disimpan di folder 'recommender_models'.
