## Skill Extraction Notebook

### Import Library

In [1]:
import pandas as pd

### Data Wrangling

#### Gathering Data

In [2]:
# Read the pipe-delimited job-postings file and preview the first rows.
df = pd.read_csv(
    '../../data/all.csv',
    sep='|',
)
df.head()

Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary
0,1,Facility Maintenance & Smart Warehouse Manager,Bandung,IDR,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pemeliharaan",,,,,Deskripsi PekerjaanRequirements :D3/SI from re...,
1,2,Procurement Department Head,Jakarta Raya,IDR,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",,25 days,51 - 200 pekerja,Manajemen/Konsulting HR,Job Role: 1. Responsible for material availabi...,
2,3,SALES ADMIN,Jakarta Barat,IDR,Supervisor/Koordinator,4 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",30 days,51 - 200 pekerja,Umum & Grosir,Internal Sales & AdminJob Description :We are ...,
3,4,City Operation Lead Shopee Express (Cirebon),Cirebon,IDR,Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Pelayanan,Logistik/Rantai Pasokan","Tip;Waktu regular, Senin - Jumat;Kasual (conto...",21 days,2001 - 5000 pekerja,Retail/Merchandise,Job Description:Responsible for HSE implementa...,
4,5,Japanese Interpreter,Bekasi,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Lainnya,Jurnalis/Editor",,23 days,201 - 500 pekerja,Manajemen/Konsulting HR,Overview: Our clients is manufacture for autom...,


#### Assessing Data

In [3]:
# Summarise column dtypes and non-null counts to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34746 entries, 0 to 34745
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34746 non-null  int64  
 1   job_title             34746 non-null  object 
 2   location              34746 non-null  object 
 3   salary_currency       34742 non-null  object 
 4   career_level          34746 non-null  object 
 5   experience_level      30205 non-null  object 
 6   education_level       34746 non-null  object 
 7   employment_type       33402 non-null  object 
 8   job_function          34746 non-null  object 
 9   job_benefits          27330 non-null  object 
 10  company_process_time  24555 non-null  object 
 11  company_size          29103 non-null  object 
 12  company_industry      33132 non-null  object 
 13  job_description       34745 non-null  object 
 14  salary                9352 non-null   float64
dtypes: float64(1), int6

In [4]:
# Count missing values per column.
df.isnull().sum()

id                          0
job_title                   0
location                    0
salary_currency             4
career_level                0
experience_level         4541
education_level             0
employment_type          1344
job_function                0
job_benefits             7416
company_process_time    10191
company_size             5643
company_industry         1614
job_description             1
salary                  25394
dtype: int64

In [5]:
# Full-row duplicates. Every column is compared, including `id`, so if `id`
# is unique per row (as the preview rows suggest) this is 0 by construction.
duplicate = df.duplicated().sum()
print(f"duplikat data: {duplicate}")

# Duplicates by content: the same posting stored under different ids.
content_columns = [column for column in df.columns if column != 'id']
duplicate_content = df.duplicated(subset=content_columns).sum()
print(f"duplikat data (tanpa id): {duplicate_content}")

duplikat data: 0


#### Cleaning Data

In [6]:
# Work on an independent copy holding only the id and the two text columns.
df_clean = df.loc[:, ['id', 'job_title', 'job_description']].copy()

# Missing values remaining in the selected columns.
df_clean.isnull().sum()

id                 0
job_title          0
job_description    1
dtype: int64

In [7]:
# Drop rows whose description is missing; the other columns are not checked.
df_clean = df_clean.dropna(
    axis='index',
    how='any',
    subset=['job_description'],
)

# Confirm no missing values remain.
df_clean.isnull().sum()

id                 0
job_title          0
job_description    0
dtype: int64

### EDA - Skill extraction

#### Set Dataframe

In [18]:
# Number of leading rows used for the exploratory run; raise it to process
# more of the dataset.
EDA_SAMPLE_SIZE = 100

# Independent copy, so columns added during EDA do not touch df_clean.
df_eda = df_clean.head(EDA_SAMPLE_SIZE).copy()
df_eda.head()

Unnamed: 0,id,job_title,job_description
0,1,Facility Maintenance & Smart Warehouse Manager,Deskripsi PekerjaanRequirements :D3/SI from re...
1,2,Procurement Department Head,Job Role: 1. Responsible for material availabi...
2,3,SALES ADMIN,Internal Sales & AdminJob Description :We are ...
3,4,City Operation Lead Shopee Express (Cirebon),Job Description:Responsible for HSE implementa...
4,5,Japanese Interpreter,Overview: Our clients is manufacture for autom...


#### Cleaning text

##### Cleaning Job title text

In [10]:
# NOTE: this whole cell is disabled (commented out); nothing here runs.
# It sketches splitting job titles into a list on punctuation.
# import string

# # 1. Replace punctuation (e.g. & / - / ()) with commas
# #    NOTE(review): string.punctuation is interpolated into the character
# #    class without re.escape, unlike the job-description cleaner.
# df_eda['job_title_clean'] = (
#     df_eda['job_title']
#     .str.replace(f"[{string.punctuation}]", ",", regex=True)
#     .str.replace(r"\s+", " ", regex=True)       # tidy up whitespace
#     .str.strip()
# )

# # 2. Convert to a list (array) and lowercase each element
# #    NOTE(review): the line below only splits; no lowercasing is applied.
# df_eda['job_title_array'] = df_eda['job_title_clean'].str.split(',')

# # 3. Drop the temporary column
# df_eda.drop('job_title_clean', axis=1, inplace=True)

# df_eda.head(10)


In [11]:
# NOTE: disabled cell. It previews titles containing a non-word, non-space
# character and reads `job_title_array`, which only the disabled
# job-title cleaning code would create.
# df_with_punct = df_eda[df_eda['job_title'].str.contains(r'[^\w\s]')]
# df_with_punct[['job_title', 'job_title_array']].head(20)


##### Cleaning Job description text

In [19]:
import re
import string

def clean_text(text):
    """Normalise a job-description string before skill extraction.

    Steps, in order: lowercase, drop every token that contains a digit,
    replace ASCII punctuation with spaces, collapse whitespace and trim.

    Args:
        text: Raw description. Non-string values (e.g. ``None`` or a NaN
            left by pandas) are treated as empty.

    Returns:
        str: The cleaned text with single spaces between tokens and no
        leading or trailing whitespace; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Remove whole tokens that contain a digit ("5years", "s1", "2024").
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Collapse every whitespace run (tabs, newlines and non-breaking spaces
    # too, not only plain spaces) and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    # TODO: stop-word removal and lemmatisation are not applied yet.
    return text

# Clean each description element-wise and store the result in a new column.
df_eda['job_description_cleaned'] = df_eda['job_description'].map(clean_text)


In [22]:
# Compare the raw and cleaned descriptions side by side.
df_eda.head()

Unnamed: 0,id,job_title,job_description,job_description_cleaned
0,1,Facility Maintenance & Smart Warehouse Manager,Deskripsi PekerjaanRequirements :D3/SI from re...,deskripsi pekerjaanrequirements si from reputa...
1,2,Procurement Department Head,Job Role: 1. Responsible for material availabi...,job role responsible for material availabili...
2,3,SALES ADMIN,Internal Sales & AdminJob Description :We are ...,internal sales adminjob description we are loo...
3,4,City Operation Lead Shopee Express (Cirebon),Job Description:Responsible for HSE implementa...,job description responsible for hse implementa...
4,5,Japanese Interpreter,Overview: Our clients is manufacture for autom...,overview our clients is manufacture for automo...


#### Extraction

In [16]:
import spacy
from skillNer.skill_extractor_class import SkillExtractor
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB

# spaCy pipeline handed to the extractor below.
nlp = spacy.load("en_core_web_lg")

# Build the extractor from the pipeline, the skill database and the
# matcher class.
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

def extract_doc_node_values(annotation_result, *, unique=False):
    """Collect the matched strings from a skill-annotation result.

    Reads ``annotation_result['results']`` and gathers the
    ``'doc_node_value'`` of every entry under ``'full_matches'``, followed
    by every entry under ``'ngram_scored'``. Either key may be absent.

    Args:
        annotation_result: Value returned by ``skill_extractor.annotate``.
            Anything that is not a dict with a ``'results'`` key yields an
            empty list.
        unique: When true, drop repeated values and keep first-seen order.
            Defaults to ``False``, which returns one entry per match.

    Returns:
        list: The collected ``doc_node_value`` entries.

    Raises:
        KeyError: If a match entry has no ``'doc_node_value'`` key.
    """
    if not isinstance(annotation_result, dict) or 'results' not in annotation_result:
        return []

    results = annotation_result['results']
    # Full matches first, then n-gram matches, in their stored order.
    doc_node_values = [
        match['doc_node_value']
        for key in ('full_matches', 'ngram_scored')
        if key in results
        for match in results[key]
    ]

    if unique:
        # dict preserves insertion order, so this dedupes without sorting.
        doc_node_values = list(dict.fromkeys(doc_node_values))

    return doc_node_values


loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [23]:
# Smoke-test the extractor and the helper on one short sentence.
test = "We need someone with strong procurement, logistics, and leadership skills."
ann = skill_extractor.annotate(test)
extract_doc_node_values(ann)


['procurement', 'logistics', 'leadership']

In [24]:
# Annotate every cleaned description; the full annotation dict is kept
# per row.
df_eda['annotations'] = df_eda['job_description_cleaned'].apply(skill_extractor.annotate)

# Reduce each annotation to its list of matched skill strings.
df_eda['skills'] = df_eda['annotations'].apply(extract_doc_node_values)
print("Exctration success")

  vec_similarity = token1.similarity(token2)


Exctration success


In [25]:
# Inspect the new `annotations` and `skills` columns.
df_eda.head()

Unnamed: 0,id,job_title,job_description,job_description_cleaned,annotations,skills
0,1,Facility Maintenance & Smart Warehouse Manager,Deskripsi PekerjaanRequirements :D3/SI from re...,deskripsi pekerjaanrequirements si from reputa...,{'text': 'deskripsi pekerjaanrequirements si f...,"[electrical inspection, management system, cor..."
1,2,Procurement Department Head,Job Role: 1. Responsible for material availabi...,job role responsible for material availabili...,{'text': 'job role responsible for material av...,"[heavy equipment, contract management, heavy e..."
2,3,SALES ADMIN,Internal Sales & AdminJob Description :We are ...,internal sales adminjob description we are loo...,{'text': 'internal sales adminjob description ...,"[microsoft office, heat exchanger, heat exchan..."
3,4,City Operation Lead Shopee Express (Cirebon),Job Description:Responsible for HSE implementa...,job description responsible for hse implementa...,{'text': 'job description responsible for hse ...,"[job description, operation management, analyt..."
4,5,Japanese Interpreter,Overview: Our clients is manufacture for autom...,overview our clients is manufacture for automo...,{'text': 'overview our clients is manufacture ...,"[job description, japanese, japanese, japanese..."


In [27]:
# Preview the extracted skills next to their job titles (first 20 rows).
df_eda[['job_title', 'skills']].head(20)

Unnamed: 0,job_title,skills
0,Facility Maintenance & Smart Warehouse Manager,"[electrical inspection, management system, cor..."
1,Procurement Department Head,"[heavy equipment, contract management, heavy e..."
2,SALES ADMIN,"[microsoft office, heat exchanger, heat exchan..."
3,City Operation Lead Shopee Express (Cirebon),"[job description, operation management, analyt..."
4,Japanese Interpreter,"[job description, japanese, japanese, japanese..."
5,KEPALA PABRIK,"[microsoft word, leadership]"
6,Admin Online Shop,"[e commerce, social medium, hal, broadcast, ad..."
7,Accounting Staff,"[fix asset, analytical thinking, reconciliatio..."
8,MEP Design - Cikarang,"[design review, structural engineering, engine..."
9,Tax Supervisor,"[income tax, tax compliance, digital electroni..."
