## Skill Extraction Notebook

### Import Library

In [3]:
import pandas as pd

### Data Wrangling

#### Gathering Data

In [2]:
df = pd.read_csv('../../data/all.csv', delimiter='|')
df.head()

Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary
0,1,Facility Maintenance & Smart Warehouse Manager,Bandung,IDR,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pemeliharaan",,,,,Deskripsi PekerjaanRequirements :D3/SI from re...,
1,2,Procurement Department Head,Jakarta Raya,IDR,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",,25 days,51 - 200 pekerja,Manajemen/Konsulting HR,Job Role: 1. Responsible for material availabi...,
2,3,SALES ADMIN,Jakarta Barat,IDR,Supervisor/Koordinator,4 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",30 days,51 - 200 pekerja,Umum & Grosir,Internal Sales & AdminJob Description :We are ...,
3,4,City Operation Lead Shopee Express (Cirebon),Cirebon,IDR,Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Pelayanan,Logistik/Rantai Pasokan","Tip;Waktu regular, Senin - Jumat;Kasual (conto...",21 days,2001 - 5000 pekerja,Retail/Merchandise,Job Description:Responsible for HSE implementa...,
4,5,Japanese Interpreter,Bekasi,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Lainnya,Jurnalis/Editor",,23 days,201 - 500 pekerja,Manajemen/Konsulting HR,Overview: Our clients is manufacture for autom...,


#### Assessing Data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34746 entries, 0 to 34745
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34746 non-null  int64  
 1   job_title             34746 non-null  object 
 2   location              34746 non-null  object 
 3   salary_currency       34742 non-null  object 
 4   career_level          34746 non-null  object 
 5   experience_level      30205 non-null  object 
 6   education_level       34746 non-null  object 
 7   employment_type       33402 non-null  object 
 8   job_function          34746 non-null  object 
 9   job_benefits          27330 non-null  object 
 10  company_process_time  24555 non-null  object 
 11  company_size          29103 non-null  object 
 12  company_industry      33132 non-null  object 
 13  job_description       34745 non-null  object 
 14  salary                9352 non-null   float64
dtypes: float64(1), int6

In [4]:
df.isnull().sum()

id                          0
job_title                   0
location                    0
salary_currency             4
career_level                0
experience_level         4541
education_level             0
employment_type          1344
job_function                0
job_benefits             7416
company_process_time    10191
company_size             5643
company_industry         1614
job_description             1
salary                  25394
dtype: int64

In [5]:
duplicate = df.duplicated().sum()
print(f"duplikat data: {duplicate}")

duplikat data: 0


#### Cleaning Data

In [6]:
df_clean = df[['id','job_title', 'job_description']].copy()

df_clean.isnull().sum()

id                 0
job_title          0
job_description    1
dtype: int64

In [7]:
df_clean = df_clean.dropna(subset=['job_description'])

df_clean.isnull().sum()

id                 0
job_title          0
job_description    0
dtype: int64

### EDA - Skill extraction

#### Set Dataframe

In [10]:
# Work on one 1,000-row batch: positional rows 1000-1999 of df_clean
# (iloc is position-based, not label-based).
# .copy() gives df_eda its own data so columns added to it later do not
# write back into df_clean.
df_eda = df_clean.iloc[1000:2000].copy()
df_eda.head()

Unnamed: 0,id,job_title,job_description
1000,1001,PROCUREMENT & EXIM SUPERVISOR,Procurement & Exim SupervisorREQUIREMENT1. ...
1001,1002,Staff Admin,Kualifikasi :Terbuka untuk segala usiaPendidik...
1002,1003,Motion Graphic Designer,RequirementsBachelor's Degree or equivalent in...
1003,1004,Asisten Associate Manager,Apakah Anda mendambakan kebebasan bekerja. Kel...
1004,1005,Staff Design Development,Job Responsibilities : Responsible for the pro...


##### Cleaning Job description text

In [11]:
import re
import string

def clean_text(text):
    """Normalize a raw job description before skill extraction.

    Steps, in order:
      1. lowercase the text;
      2. replace every token that contains a digit with a space;
      3. replace punctuation (ASCII and Unicode, plus ``_``) with a space;
      4. collapse any run of whitespace (spaces, tabs, newlines) into one
         space and trim both ends.

    Stop-word removal and lemmatization are not applied.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. ``None`` or the float
        NaN pandas uses for a missing cell) is treated as empty.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is not a string.
    """
    # Missing cells reach .apply() as float NaN; .lower() would raise on them.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Drop whole tokens that contain at least one digit ("2010", "excel2010").
    text = re.sub(r'\w*\d\w*', ' ', text)
    # Anything that is neither a word character nor whitespace is punctuation.
    # Unlike string.punctuation this also covers typographic quotes, bullets
    # and dashes; "_" is listed explicitly because \w matches it.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Normalize every kind of whitespace, not only literal spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_eda['job_description_cleaned'] = df_eda['job_description'].apply(clean_text)


In [19]:
df_eda.head()

Unnamed: 0,id,job_title,job_description,job_description_cleaned
0,1,Facility Maintenance & Smart Warehouse Manager,Deskripsi PekerjaanRequirements :D3/SI from re...,deskripsi pekerjaanrequirements si from reputa...
1,2,Procurement Department Head,Job Role: 1. Responsible for material availabi...,job role responsible for material availabili...
2,3,SALES ADMIN,Internal Sales & AdminJob Description :We are ...,internal sales adminjob description we are loo...
3,4,City Operation Lead Shopee Express (Cirebon),Job Description:Responsible for HSE implementa...,job description responsible for hse implementa...
4,5,Japanese Interpreter,Overview: Our clients is manufacture for autom...,overview our clients is manufacture for automo...


#### Extraction

Extraction Setup

In [12]:
import spacy
from skillNer.skill_extractor_class import SkillExtractor
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB

# Load the spaCy pipeline "en_core_web_lg"; it is handed to SkillExtractor below.
# NOTE(review): this is an English model, while some job descriptions in the
# dataset are written in Indonesian — presumably why tokens such as "dari"
# come back as skills. Confirm whether those rows should be filtered first.
nlp = spacy.load("en_core_web_lg")

# Build the SkillNer extractor from the pipeline, SkillNer's SKILL_DB and
# spaCy's PhraseMatcher class.
skill_extractor = SkillExtractor(
    nlp,
    SKILL_DB,
    PhraseMatcher
)

def extract_doc_node_values(annotation_result):
    """Collect the ``doc_node_value`` of every match in an annotation result.

    Parameters
    ----------
    annotation_result : dict
        Result of ``skill_extractor.annotate``; matches are read from
        ``annotation_result['results']`` under the keys ``'full_matches'``
        and ``'ngram_scored'``. Either key may be absent.

    Returns
    -------
    list
        Full-match values first, then n-gram-scored values, in their original
        order and with duplicates kept. An empty list when the argument is
        not a dict or has no ``'results'`` key.
    """
    usable = isinstance(annotation_result, dict) and 'results' in annotation_result
    if not usable:
        return []

    matches_by_kind = annotation_result['results']
    collected = []
    # Key order fixes the output order: full matches before n-gram matches.
    for kind in ('full_matches', 'ngram_scored'):
        if kind in matches_by_kind:
            collected.extend(hit['doc_node_value'] for hit in matches_by_kind[kind])
    return collected


loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [21]:
test = "We need someone with strong procurement, logistics, and leadership skills."
ann = skill_extractor.annotate(test)
extract_doc_node_values(ann)


['procurement', 'logistics', 'leadership']

Extracting Process

In [None]:
from tqdm import tqdm

# NOTE(review): the extraction loop below is left commented out. The progress
# output recorded for this cell shows about 30 minutes for 1,000 rows, so
# uncomment it only when the annotations have to be regenerated.
#
# How it works: every cleaned description is annotated; a row whose annotation
# raises is reported and stored as None so `annotations` stays aligned with
# the rows of df_eda. `idx` in the message is the enumerate() position, not
# the DataFrame index label.
# annotations = []
# for idx, text in tqdm(enumerate(df_eda['job_description_cleaned'].astype(str)), 
#                        total=len(df_eda),
#                        desc="Extracting skills"):
#     try:
#         ann = skill_extractor.annotate(text)
#     except Exception as e:
#         print(f"[SKIPPED] Index {idx} | Error: {e} | Text: {text[:80]}...")
#         ann = None   # to be safe: keep a placeholder for the failed row
#     annotations.append(ann)

# df_eda['annotations'] = annotations

# Rows without a dict annotation (failed rows) get an empty skill list.
# df_eda['skills'] = df_eda['annotations'].apply(
#     lambda x: extract_doc_node_values(x) if isinstance(x, dict) else []
# )


  vec_similarity = token1.similarity(token2)
Extracting skills:  28%|██▊       | 278/1000 [07:52<47:52,  3.98s/it]  

[SKIPPED] Index 277 | Error: 'grow' is not in list | Text: xendit provides payment infrastructure across southeast asia with a focus on ind...


Extracting skills:  67%|██████▋   | 671/1000 [21:51<1:27:59, 16.05s/it]

[SKIPPED] Index 670 | Error: 'mark' is not in list | Text: ericsson country unit  indonesialocation  jakartatravel required  up to open to ...


Extracting skills: 100%|██████████| 1000/1000 [30:18<00:00,  1.82s/it] 


In [None]:
from tqdm import tqdm

# NOTE(review): this cell repeats the disabled extraction loop of the preceding
# cell. Its recorded progress output differs (about 25 minutes for 1,000 rows,
# different skipped rows), so it was presumably run against another df_eda
# slice — confirm, and consider keeping a single parameterized cell.
#
# A row whose annotation raises is reported and stored as None so
# `annotations` stays aligned with the rows of df_eda. `idx` in the message is
# the enumerate() position, not the DataFrame index label.
# annotations = []
# for idx, text in tqdm(enumerate(df_eda['job_description_cleaned'].astype(str)), 
#                        total=len(df_eda),
#                        desc="Extracting skills"):
#     try:
#         ann = skill_extractor.annotate(text)
#     except Exception as e:
#         print(f"[SKIPPED] Index {idx} | Error: {e} | Text: {text[:80]}...")
#         ann = None   # to be safe: keep a placeholder for the failed row
#     annotations.append(ann)

# df_eda['annotations'] = annotations

# Rows without a dict annotation (failed rows) get an empty skill list.
# df_eda['skills'] = df_eda['annotations'].apply(
#     lambda x: extract_doc_node_values(x) if isinstance(x, dict) else []
# )


  vec_similarity = token1.similarity(token2)
Extracting skills:   6%|▌         | 58/1000 [02:33<1:20:47,  5.15s/it]

[SKIPPED] Index 56 | Error: 'grow' is not in list | Text: xendit provides payment infrastructure across southeast asia with a focus on ind...


Extracting skills:  32%|███▏      | 315/1000 [08:45<2:37:28, 13.79s/it]

[SKIPPED] Index 314 | Error: 'grow' is not in list | Text: xendit provides payment infrastructure across southeast asia with a focus on ind...


Extracting skills:  94%|█████████▍| 941/1000 [24:06<00:37,  1.56it/s]  

[SKIPPED] Index 939 | Error: list index out of range | Text: about the rolethey say no man is an island a notion that holds particularly true...


Extracting skills: 100%|██████████| 1000/1000 [24:55<00:00,  1.50s/it]


In [14]:
df_eda.head()

Unnamed: 0,id,job_title,job_description,job_description_cleaned,annotations,skills
1000,1001,PROCUREMENT & EXIM SUPERVISOR,Procurement & Exim SupervisorREQUIREMENT1. ...,procurement exim at least or bachelor’s d...,{'text': 'procurement exim at least or bachelo...,"[microsoft excel, sustainable procurement, pro..."
1001,1002,Staff Admin,Kualifikasi :Terbuka untuk segala usiaPendidik...,kualifikasi terbuka untuk segala usiapendidika...,{'text': 'kualifikasi terbuka untuk segala usi...,[checklist]
1002,1003,Motion Graphic Designer,RequirementsBachelor's Degree or equivalent in...,requirementsbachelor s degree or equivalent in...,{'text': 'requirementsbachelor s degree or equ...,"[visual communication, motion graphic, problem..."
1003,1004,Asisten Associate Manager,Apakah Anda mendambakan kebebasan bekerja. Kel...,apakah anda mendambakan kebebasan bekerja kelu...,{'text': 'apakah anda mendambakan kebebasan be...,"[dari, manager, associate manager, manager]"
1004,1005,Staff Design Development,Job Responsibilities : Responsible for the pro...,job responsibilities responsible for the proc...,{'text': 'job responsibilities responsible for...,"[renovating, hospital industry, autocad, sketc..."


#### Export Dataframe

In [None]:
# df_eda.to_csv("skill_extraction_2.csv", index=False)


### Feature Engineering

#### Import Result Data

In [27]:
# Load the two saved extraction batches from the working directory.
# NOTE(review): the only export cell in this notebook writes
# "skill_extraction_2.csv" (and is commented out); "skill_extraction.csv"
# presumably comes from an earlier run on a different slice — confirm.
df1 = pd.read_csv("skill_extraction.csv")
df2 = pd.read_csv("skill_extraction_2.csv")

# Stack both batches; ignore_index=True renumbers the rows from 0.
df_skill = pd.concat([df1, df2], ignore_index=True)

In [28]:
df_skill[['job_title', 'skills']].head()

Unnamed: 0,job_title,skills
0,Facility Maintenance & Smart Warehouse Manager,"['electrical inspection', 'management system',..."
1,Procurement Department Head,"['heavy equipment', 'contract management', 'he..."
2,SALES ADMIN,"['microsoft office', 'heat exchanger', 'heat e..."
3,City Operation Lead Shopee Express (Cirebon),"['job description', 'operation management', 'a..."
4,Japanese Interpreter,"['job description', 'japanese', 'japanese', 'j..."


In [30]:
import pandas as pd

# Regency reference table, fetched from GitHub at run time (this cell needs
# network access).
url = "https://raw.githubusercontent.com/Caknoooo/provinces-cities-indonesia/main/csv/regencies.csv"

df_cities = pd.read_csv(url)
# Lower-cased, de-duplicated regency names as a plain Python list.
cities = df_cities['regency'].str.lower().unique().tolist()

df_cities.head()


Unnamed: 0,id,province_id,regency,type
0,1101,11,Simeulue,Kabupaten
1,1102,11,Aceh Singkil,Kabupaten
2,1103,11,Aceh Selatan,Kabupaten
3,1104,11,Aceh Tenggara,Kabupaten
4,1105,11,Aceh Timur,Kabupaten


#### Cleaning Noise

In [30]:
import ast
import re

# 1. Make sure `skills` holds real lists. After the CSV round-trip each cell is
#    the string form of a list (e.g. "['a', 'b']"), so parse it back with
#    ast.literal_eval; non-string cells are passed through unchanged.
df_skill['skills'] = df_skill['skills'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# 2. Noise terms: extractor hits that should not count as skills
#    (presumably section headings and Indonesian filler words — extend as needed).
noise_words = {'job description', 'dari', 'akan', 'ada', 'hal', 'www'}

# 3. Skill clean-up
def clean_skills(skills):
    """Normalize, de-noise and de-duplicate the skills of one row.

    Each skill is lower-cased and stripped, immediately repeated words are
    collapsed ("data data data" -> "data"), and entries that end up empty or
    are listed in ``noise_words`` are dropped. Duplicates are then removed
    while keeping first-seen order.

    Parameters
    ----------
    skills : iterable of str, or None / NaN
        Extracted skills for one job posting. ``None`` or a float (the NaN
        pandas uses for a missing cell) is treated as "no skills".
        Non-string items inside the iterable are ignored.

    Returns
    -------
    list of str
        Cleaned, unique skills in their original order; possibly empty.
    """
    # A missing cell has no skills; return [] instead of failing on iteration.
    if skills is None or isinstance(skills, float):
        return []

    cleaned = []
    for s in skills:
        if not isinstance(s, str):
            continue
        s = s.lower().strip()

        # Collapse a word repeated back-to-back any number of times.
        # The trailing "+" matters: without it only pairs are merged and
        # "data data data" would still come out as "data data".
        s = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', s)

        # Skip blanks and known noise terms.
        if not s or s in noise_words:
            continue

        cleaned.append(s)

    # dict keys keep insertion order, so this removes duplicates without
    # reordering the skills.
    return list(dict.fromkeys(cleaned))


df_skill['skills'] = df_skill['skills'].apply(clean_skills)


In [31]:
df_skill[['job_title', 'skills']].head()

Unnamed: 0,job_title,skills
0,Facility Maintenance & Smart Warehouse Manager,"[electrical inspection, management system, cor..."
1,Procurement Department Head,"[heavy equipment, contract management, product..."
2,SALES ADMIN,"[microsoft office, heat exchanger, carbon stee..."
3,City Operation Lead Shopee Express (Cirebon),"[operation management, analytical skill, leade..."
4,Japanese Interpreter,"[japanese, translator, english, non, translate]"


In [32]:
df_skill = df_skill[df_skill['skills'].apply(len) > 0]

In [33]:
df_skill[['job_title', 'skills']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1803 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job_title  1803 non-null   object
 1   skills     1803 non-null   object
dtypes: object(2)
memory usage: 42.3+ KB
