In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report

from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing import text, sequence

In [2]:
df = pd.read_csv("export/final_data.csv")
df.head()

Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary,job_description_cleaned,annotations,skills
0,1,Facility Maintenance & Smart Warehouse Manager,Bandung,IDR,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pemeliharaan",,,,,Deskripsi PekerjaanRequirements :D3/SI from re...,,deskripsi pekerjaanrequirements si from reputa...,{'text': 'deskripsi pekerjaanrequirements si f...,"['electrical inspection', 'management system',..."
1,2,Procurement Department Head,Jakarta Raya,IDR,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",,25 days,51 - 200 pekerja,Manajemen/Konsulting HR,Job Role: 1. Responsible for material availabi...,,job role responsible for material availabili...,{'text': 'job role responsible for material av...,"['heavy equipment', 'contract management', 'pr..."
2,3,SALES ADMIN,Jakarta Barat,IDR,Supervisor/Koordinator,4 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",30 days,51 - 200 pekerja,Umum & Grosir,Internal Sales & AdminJob Description :We are ...,,internal sales adminjob description we are loo...,{'text': 'internal sales adminjob description ...,"['microsoft office', 'heat exchanger', 'carbon..."
3,4,City Operation Lead Shopee Express (Cirebon),Cirebon,IDR,Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Pelayanan,Logistik/Rantai Pasokan","Tip;Waktu regular, Senin - Jumat;Kasual (conto...",21 days,2001 - 5000 pekerja,Retail/Merchandise,Job Description:Responsible for HSE implementa...,,job description responsible for hse implementa...,{'text': 'job description responsible for hse ...,"['operation management', 'analytical skill', '..."
4,5,Japanese Interpreter,Bekasi,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Lainnya,Jurnalis/Editor",,23 days,201 - 500 pekerja,Manajemen/Konsulting HR,Overview: Our clients is manufacture for autom...,,overview our clients is manufacture for automo...,{'text': 'overview our clients is manufacture ...,"['japanese', 'translator', 'english', 'non', '..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17182 entries, 0 to 17181
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       17182 non-null  int64  
 1   job_title                17182 non-null  object 
 2   location                 17182 non-null  object 
 3   salary_currency          17178 non-null  object 
 4   career_level             17182 non-null  object 
 5   experience_level         17182 non-null  object 
 6   education_level          17182 non-null  object 
 7   employment_type          17182 non-null  object 
 8   job_function             17182 non-null  object 
 9   job_benefits             14440 non-null  object 
 10  company_process_time     13377 non-null  object 
 11  company_size             15488 non-null  object 
 12  company_industry         16412 non-null  object 
 13  job_description          17182 non-null  object 
 14  salary                

In [4]:
# Get a NumPy array of all unique values
unique_values_list = df['job_function'].unique()
print(len(unique_values_list))

68


In [5]:
job_function_mapping = {

    # Finance & Accounting
    "Akuntansi / Keuangan,Audit & Pajak": "Finance & Accounting",
    "Akuntansi / Keuangan,Akuntansi Umum / Pembiayaan": "Finance & Accounting",
    "Akuntansi / Keuangan,Perbankan / Jasa Finansial ": "Finance & Accounting",
    "Akuntansi / Keuangan,Keuangan / Investasi Perusahaan ": "Finance & Accounting",
    "Sains,Aktuaria/Statistik": "Finance & Accounting",

    # IT & Software
    "Komputer/Teknologi Informasi,IT-Perangkat Lunak": "IT & Software",
    "Komputer/Teknologi Informasi,IT-Admin Jaringan/Sistem/Database": "IT & Software",
    "Komputer/Teknologi Informasi,IT-Perangkat Keras": "IT & Software",

    # Engineering (Non-IT)
    "Teknik,Teknik Lingkungan": "Engineering",
    "Teknik,Teknik Lainnya": "Engineering",
    "Teknik,Mekanikal": "Engineering",
    "Teknik,Teknik Elektronika": "Engineering",
    "Teknik,Teknik Elektro": "Engineering",
    "Teknik,Teknik Industri": "Engineering",
    "Teknik,Teknik Kimia": "Engineering",
    "Bangunan/Konstruksi,Teknik Sipil/Konstruksi Bangunan": "Engineering",
    "Bangunan/Konstruksi,Survei Kuantitas": "Engineering",
    "Sains,Pertanian": "Engineering",
    "Sains,Teknologi Makanan/Ahli Gizi": "Engineering",
    "Sains,Sains & Teknologi": "Engineering",

    # Manufacturing & Operations
    "Manufaktur,Manufaktur": "Manufacturing & Operations",
    "Manufaktur,Pemeliharaan": "Manufacturing & Operations",
    "Manufaktur,Pembelian/Manajemen Material": "Manufacturing & Operations",
    "Manufaktur,Penjaminan Kualitas / QA": "Manufacturing & Operations",
    "Pelayanan,Logistik/Rantai Pasokan": "Manufacturing & Operations",

    # Sales & Marketing
    "Penjualan / Pemasaran,Pemasaran/Pengembangan Bisnis": "Sales & Marketing",
    "Penjualan / Pemasaran,Digital Marketing": "Sales & Marketing",
    "Penjualan / Pemasaran,Penjualan - Jasa Keuangan": "Sales & Marketing",
    "Penjualan / Pemasaran,Penjualan - Teknik/Teknikal/IT": "Sales & Marketing",
    "Penjualan / Pemasaran,Penjualan Ritel": "Sales & Marketing",
    "Penjualan / Pemasaran,Merchandising": "Sales & Marketing",
    "Penjualan / Pemasaran,Penjualan - Korporasi": "Sales & Marketing",
    "Penjualan / Pemasaran,Telesales/Telemarketing": "Sales & Marketing",
    "Penjualan / Pemasaran,E-commerce": "Sales & Marketing",
    "Seni/Media/Komunikasi,Periklanan": "Sales & Marketing",

    # HR & Administration
    "Sumber Daya Manusia/Personalia,Sumber Daya Manusia / HR": "HR & Administration",
    "Sumber Daya Manusia/Personalia,Staf / Administrasi umum": "HR & Administration",
    "Sumber Daya Manusia/Personalia,Sekretaris": "HR & Administration",
    "Sumber Daya Manusia/Personalia,Top Management / Manajemen Tingkat Atas": "HR & Administration",
    "Pelayanan,Pengacara / Asisten Legal": "HR & Administration",
    "Pelayanan,Angkatan Bersenjata": "HR & Administration",

    # Creative & Media
    "Seni/Media/Komunikasi,Seni / Desain Kreatif": "Creative & Media",
    "Seni/Media/Komunikasi,Hubungan Masyarakat": "Creative & Media",
    "Seni/Media/Komunikasi,Hiburan": "Creative & Media",
    "Lainnya,Jurnalis/Editor": "Creative & Media",

    # Education & Training
    "Pendidikan/Pelatihan,Pendidikan": "Education",
    "Pendidikan/Pelatihan,Pelatihan & Pengembangan": "Education",

    # Hospitality & Service
    "Hotel/Restoran,Makanan/Minuman/Pelayanan Restoran": "Hospitality & Service",
    "Hotel/Restoran,Hotel/Pariwisata": "Hospitality & Service",
    "Pelayanan,Layanan Pelanggan": "Hospitality & Service",
    "Pelayanan,Teknikal & Bantuan Pelanggan": "Hospitality & Service",
    "Pelayanan,Perawatan Pribadi": "Hospitality & Service",
    "Layanan Kesehatan,Diagnosa/Lainnya": "Hospitality & Service",
    "Layanan Kesehatan,Farmasi": "Hospitality & Service",
    "Layanan Kesehatan,Praktisi/Asisten Medis": "Hospitality & Service",
    "Sains,Bioteknologi": "Hospitality & Service",
    "Sains,Biomedis": "Hospitality & Service",
    "Sains,Kimia": "Hospitality & Service",
    "Sains,Geologi/Geofisika": "Hospitality & Service",

    # Other
    "Bangunan/Konstruksi,Arsitek/Desain Interior": "Other",
    "Bangunan/Konstruksi,Properti/Real Estate": "Other",
    "Lainnya,Pekerjaan Umum": "Other",
    "Lainnya,Lainnya/Kategori tidak tersedia": "Other",
}

df["job_function_group"] = (
    df["job_function"]
    .map(job_function_mapping)
    .fillna("Other")
)

In [6]:
encoders_dict = {}

columns_to_encode = ['job_function_group']
for col in columns_to_encode:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders_dict[col] = le

df.head()

Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary,job_description_cleaned,annotations,skills,job_function_group
0,1,Facility Maintenance & Smart Warehouse Manager,Bandung,IDR,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pemeliharaan",,,,,Deskripsi PekerjaanRequirements :D3/SI from re...,,deskripsi pekerjaanrequirements si from reputa...,{'text': 'deskripsi pekerjaanrequirements si f...,"['electrical inspection', 'management system',...",7
1,2,Procurement Department Head,Jakarta Raya,IDR,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",,25 days,51 - 200 pekerja,Manajemen/Konsulting HR,Job Role: 1. Responsible for material availabi...,,job role responsible for material availabili...,{'text': 'job role responsible for material av...,"['heavy equipment', 'contract management', 'pr...",7
2,3,SALES ADMIN,Jakarta Barat,IDR,Supervisor/Koordinator,4 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",30 days,51 - 200 pekerja,Umum & Grosir,Internal Sales & AdminJob Description :We are ...,,internal sales adminjob description we are loo...,{'text': 'internal sales adminjob description ...,"['microsoft office', 'heat exchanger', 'carbon...",9
3,4,City Operation Lead Shopee Express (Cirebon),Cirebon,IDR,Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Pelayanan,Logistik/Rantai Pasokan","Tip;Waktu regular, Senin - Jumat;Kasual (conto...",21 days,2001 - 5000 pekerja,Retail/Merchandise,Job Description:Responsible for HSE implementa...,,job description responsible for hse implementa...,{'text': 'job description responsible for hse ...,"['operation management', 'analytical skill', '...",7
4,5,Japanese Interpreter,Bekasi,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Lainnya,Jurnalis/Editor",,23 days,201 - 500 pekerja,Manajemen/Konsulting HR,Overview: Our clients is manufacture for autom...,,overview our clients is manufacture for automo...,{'text': 'overview our clients is manufacture ...,"['japanese', 'translator', 'english', 'non', '...",0


In [7]:
y = df["job_function_group"].fillna("unknown")
print("n_classes:", y.nunique())
print(y.value_counts())

n_classes: 10
job_function_group
9    5424
6    2895
3    1929
7    1638
4    1476
2    1302
5     767
0     714
8     599
1     438
Name: count, dtype: int64


In [8]:
RANDOM_STATE = 42
MAX_PER_CLASS = 600

dfs = []

for label, group in df.groupby("job_function_group"):
    if len(group) > MAX_PER_CLASS:
        group = group.sample(
            n=MAX_PER_CLASS,
            random_state=RANDOM_STATE
        )
    dfs.append(group)

df = pd.concat(dfs).reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5837 non-null   int64  
 1   job_title                5837 non-null   object 
 2   location                 5837 non-null   object 
 3   salary_currency          5835 non-null   object 
 4   career_level             5837 non-null   object 
 5   experience_level         5837 non-null   object 
 6   education_level          5837 non-null   object 
 7   employment_type          5837 non-null   object 
 8   job_function             5837 non-null   object 
 9   job_benefits             4876 non-null   object 
 10  company_process_time     4531 non-null   object 
 11  company_size             5244 non-null   object 
 12  company_industry         5564 non-null   object 
 13  job_description          5837 non-null   object 
 14  salary                  

In [9]:
# tfidf = TfidfVectorizer(
#     max_features=1000,        # penting agar RF tidak overload
#     ngram_range=(1, 2),
#     # stop_words="english",
#     min_df=2,
#     max_df=0.9,
#     sublinear_tf=True
# )

In [10]:
df["role_text"] = df["job_description"] + " - " + df['experience_level'] + " - " + df["education_level"]

In [11]:
df.head()

Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary,job_description_cleaned,annotations,skills,job_function_group,role_text
0,5894,Photographer,Jakarta Raya,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,Tidak terspesifikasi,Kontrak,"Seni/Media/Komunikasi,Seni / Desain Kreatif","Waktu regular, Senin - Jumat;Kasual (contoh: K...",24 days,501 - 1000 pekerja,Transportasi/Logistik,What is ShipperShipper is a growing technology...,8000000.0,what is shippershipper is a growing technology...,{'text': 'what is shippershipper is a growing ...,"['warehouse management', 'post production', 's...",0,What is ShipperShipper is a growing technology...
1,16927,HEAD CREATIVE,Jakarta Raya,IDR,Supervisor/Koordinator,2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Seni/Media/Komunikasi,Seni / Desain Kreatif",Asuransi kesehatan;Bisnis (contoh: Kemeja),25 days,201 - 500 pekerja,Kesehatan/Medis,HEAD CREATIVEKualifikasiMemiliki 2 tahun penga...,,head creativekualifikasimemiliki tahun pengala...,{'text': 'head creativekualifikasimemiliki tah...,"['social medium', 'e commerce', 'target market...",0,HEAD CREATIVEKualifikasiMemiliki 2 tahun penga...
2,2243,Japanese Interpreter,Jakarta Barat,IDR,Pegawai (non-manajemen & non-supervisor),3 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Lainnya,Jurnalis/Editor","Tip;Asuransi kesehatan;Waktu regular, Senin - ...",5 days,51 - 200 pekerja,Manajemen/Konsulting HR,JAPANESE INTERPRETER (TANGERANG) [50020]COMPAN...,,japanese interpreter tangerang company categor...,{'text': 'japanese interpreter tangerang compa...,"['japanese', 'electrical', 'communication', 'e...",0,JAPANESE INTERPRETER (TANGERANG) [50020]COMPAN...
3,15212,Content Creator,Jakarta Barat,IDR,Pegawai (non-manajemen & non-supervisor),3 tahun,Sarjana (S1),Penuh Waktu,"Seni/Media/Komunikasi,Seni / Desain Kreatif","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",24 days,1- 50 pekerja,Makanan & Minuman/Katering/Restoran,CONTENT CREATORLocation : ...,5000000.0,content creatorlocation j...,{'text': 'content creatorlocation jakarta bara...,"['digital marketing', 'social medium', 'adobe ...",0,CONTENT CREATORLocation : ...
4,32356,Graphic Designer,Bali,IDR,Pegawai (non-manajemen & non-supervisor),1 tahun,Sarjana (S1),Penuh Waktu,"Seni/Media/Komunikasi,Seni / Desain Kreatif",,,,Olahraga,Responsibilities :Taking design briefs to unde...,3600000.0,responsibilities taking design briefs to under...,{'text': 'responsibilities taking design brief...,"['design brief', 'graphic design', 'adobe phot...",0,Responsibilities :Taking design briefs to unde...


In [12]:
X_text = df["role_text"].astype(str)
y = df["job_function_group"].astype(int)

num_classes = y.nunique()

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

data = (X_train, y_train), (X_test, y_test)

### Sequence Vectors

In [14]:
TOP_K = 20000
MAX_SEQUENCE_LENGTH = 500

def sequence_vectorize(train_texts, val_texts):
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    X_train = tokenizer.texts_to_sequences(train_texts)
    X_val = tokenizer.texts_to_sequences(val_texts)

    max_length = len(max(X_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    X_train = sequence.pad_sequences(X_train, maxlen=max_length)
    X_val = sequence.pad_sequences(X_val, maxlen=max_length)
    return X_train, X_val, tokenizer.word_index

In [15]:
# X_train, X_test, index = sequence_vectorize(X_train, X_test)

### Last Layer

In [16]:
def _get_last_layer_units_and_activation(num_classes):
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

### Build sepcnn model

In [17]:
def sepcnn_model(
    blocks,
    filters,
    kernel_size,
    embedding_dim,
    dropout_rate,
    pool_size,
    input_shape,
    num_classes,
    num_features,
    use_pretrained_embedding=False,
    is_embedding_trainable=False,
    embedding_matrix=None
):
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()
    # model.add(layers.Input(shape=input_shape))

    if use_pretrained_embedding:
        model.add(layers.Embedding(
            input_dim=num_features,
            output_dim=embedding_dim,
            input_length=input_shape[0],
            weights=[embedding_matrix],
            trainable=is_embedding_trainable
        ))
    else:
        model.add(layers.Embedding(
            input_dim=num_features,
            output_dim=embedding_dim,
            input_length=input_shape[0]
        ))

    for _ in range(blocks-1):
        model.add(layers.Dropout(rate=dropout_rate))
        model.add(layers.SeparableConv1D(
            filters=filters,
            kernel_size=kernel_size,
            activation='relu',
            bias_initializer='random_uniform',
            depthwise_initializer='random_uniform',
            padding='same'
        ))
        model.add(layers.SeparableConv1D(
            filters=filters,
            kernel_size=kernel_size,
            activation='relu',
            bias_initializer='random_uniform',
            depthwise_initializer='random_uniform',
            padding='same'
        ))
        model.add(layers.MaxPooling1D(pool_size=pool_size))

    model.add(layers.SeparableConv1D(
            filters=filters * 2,
            kernel_size=kernel_size,
            activation='relu',
            bias_initializer='random_uniform',
            depthwise_initializer='random_uniform',
            padding='same'
        ))
    model.add(layers.SeparableConv1D(
            filters=filters * 2,
            kernel_size=kernel_size,
            activation='relu',
            bias_initializer='random_uniform',
            depthwise_initializer='random_uniform',
            padding='same'
        ))
    model.add(layers.GlobalAveragePooling1D())
    model.add(layers.Dropout(rate=dropout_rate))
    model.add(layers.Dense(op_units, activation=op_activation))
    return model

### Train Your Model

In [18]:
FLAGS = None
TOP_K = 20000

def train_sequence_model(
    data,
    learning_rate=1e-3,
    epochs=1000,
    batch_size=128,
    blocks=2,
    filters=64,
    dropout_rate=0.2,
    embedding_dim=200,
    kernel_size=3,
    pool_size=3
):
    (train_texts, train_labels), (val_texts, val_labels) = data

    X_train, X_val, word_index = sequence_vectorize(
        train_texts, val_texts
    )
    num_features = min(len(word_index) + 1, TOP_K)

    model = sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=X_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features)

    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    history = model.fit(
            X_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(X_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    model.save('rotten_tomatoes_sepcnn_model.keras')
    return history['val_acc'][-1], history['val_loss'][-1]

In [19]:
train_sequence_model(data)

Epoch 1/1000




37/37 - 5s - 147ms/step - acc: 0.0953 - loss: 2.3026 - val_acc: 0.1027 - val_loss: 2.3005
Epoch 2/1000
37/37 - 4s - 105ms/step - acc: 0.1034 - loss: 2.3000 - val_acc: 0.1027 - val_loss: 2.2993
Epoch 3/1000
37/37 - 4s - 104ms/step - acc: 0.0975 - loss: 2.2994 - val_acc: 0.1027 - val_loss: 2.2990
Epoch 4/1000
37/37 - 4s - 105ms/step - acc: 0.1024 - loss: 2.2991 - val_acc: 0.1027 - val_loss: 2.2990
Epoch 5/1000
37/37 - 4s - 105ms/step - acc: 0.0902 - loss: 2.2995 - val_acc: 0.1027 - val_loss: 2.2990
Epoch 6/1000
37/37 - 4s - 114ms/step - acc: 0.0996 - loss: 2.2994 - val_acc: 0.1027 - val_loss: 2.2990
Validation accuracy: 0.10273972898721695, loss: 2.2989625930786133


(0.10273972898721695, 2.2989625930786133)