# Load Preprocessed Data

In [10]:
import numpy as np
import pandas as pd

# Location of the preprocessed dataset (relative path, resolved against the
# notebook's working directory).
DATA_PATH = 'cleaned-data-kbk.csv'

# Load the whole CSV into a DataFrame.
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which
# look like index columns written by earlier to_csv calls -- confirm upstream.
df = pd.read_csv(DATA_PATH)

# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,judul,abstrak,kbk,judul_tokens,abstrak_tokens
0,0,1,pengembangan sistem pendukung keputusan untuk ...,sistem pendukung keputusan spk merupakan suatu...,Pengembangan Aplikasi dan Media Pembelajaran T...,kembang sistem dukung putus untuk tentu dosen ...,sistem dukung putus spk rupa suatu sistem yang...
1,1,3,hubungan efikasi diri dengan kesiapan kerja lu...,pandemi covid 19 yang melanda dunia terutama i...,Ketenegakerjaan Teknologi dan Kejuruan,hubung efikasi diri dengan kesiap kerja lulus ...,pandemi covid 19 yang landa dunia utama indone...
2,2,4,alat bantu penyandang tuetra berbasis deteksi ...,tujuan dilakukannya penelitian ini untuk memba...,Biomedic and Intelligent Assistive Technology ...,alat bantu sandang tuetra bas deteksi objek ca...,tuju laku teliti ini untuk bantu sandang tuetr...
3,3,6,analisis thermovisi penghantar akibat transmis...,gardu induk waru merupakan sub transmisi listr...,Intelligent Power Electronics and Smart Grid (...,analisis thermovisi hantar akibat transmission...,gardu induk waru rupa sub transmisi listrik ya...
4,4,7,pengembangan modulberbasis production based ed...,mata pelajaran dasar desain grafis merupakan m...,Pengembangan Aplikasi dan Media Pembelajaran T...,kembang modulberbasis production based educati...,mata ajar dasar desain grafis rupa mata ajar y...


# Split Data

In [11]:
from sklearn.model_selection import train_test_split

# Model input is the `judul_tokens` column; the label to predict is `kbk`.
X, y = df['judul_tokens'], df['kbk']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
# NOTE(review): the split is not stratified, so rare classes may end up
# unevenly represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Feature Extraction: TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings; it is fitted on the training split only.
vectorizer = TfidfVectorizer()

# fit_transform learns the vocabulary/IDF weights from X_train and returns its document-term matrix.
X_train_tfidf = vectorizer.fit_transform(X_train)
# The test split is only transformed, so it is encoded with the vocabulary learned above.
X_test_tfidf = vectorizer.transform(X_test)

# Displayed as the cell output: (number of training documents, number of vocabulary terms).
X_train_tfidf.shape

(886, 2452)


# Oversampling: SMOTE

In [14]:
# Count training examples per `kbk` label to inspect the class balance.
y_train.value_counts()

Pengembangan Aplikasi dan Media Pembelajaran Teknologi dan Kejuruan              198
Strategi Pembelajaran Teknologi dan Kejuruan                                     148
Kurikulum Pendidikan Teknologi dan Kejuruan                                       70
Rekayasa pengetahuan dan ilmu data (Knowledge Engineering and Data Science)       68
Intelligent Power Electronics and Smart Grid (IPESG)                              64
Intelligent Power and Advanced energy System (IPAES)                              59
Game Technology and Machine Learning Applications                                 56
Ketenegakerjaan Teknologi dan Kejuruan                                            55
Evaluasi dan Pengelolaan Pendidikan Kejuruan                                      44
Telematics loT System and Devices                                                 42
Teknologi Digital Cerdas (Ubiquitous Computing Technique)                         41
Biomedic and Intelligent Assistive Technology (TAT)              

In [15]:
from imblearn.over_sampling import SMOTE

# Configure SMOTE with a fixed seed so the resampling is repeatable.
oversampler = SMOTE(random_state=42)

# Resample the TF-IDF training matrix and its labels; the test split is left untouched.
X_train_resampled, y_train_resampled = oversampler.fit_resample(
    X_train_tfidf,
    y_train,
)

# Per-class counts after resampling, shown as the cell output.
y_train_resampled.value_counts()

Sistem Dinamis, Kendali, dan Robotika (Dynamic Systems, Control and Robotics)    198
Evaluasi dan Pengelolaan Pendidikan Kejuruan                                     198
Telematics loT System and Devices                                                198
Game Technology and Machine Learning Applications                                198
Biomedic and Intelligent Assistive Technology (TAT)                              198
Kurikulum Pendidikan Teknologi dan Kejuruan                                      198
Intelligent Power Electronics and Smart Grid (IPESG)                             198
Pengembangan Aplikasi dan Media Pembelajaran Teknologi dan Kejuruan              198
Intelligent Power and Advanced energy System (IPAES)                             198
Strategi Pembelajaran Teknologi dan Kejuruan                                     198
Rekayasa pengetahuan dan ilmu data (Knowledge Engineering and Data Science)      198
Ketenegakerjaan Teknologi dan Kejuruan                           

# Training Model

In [16]:
from sklearn.svm import LinearSVC
# Linear SVM classifier trained on the resampled TF-IDF features.
# random_state is pinned to the same seed used for the train/test split and
# for SMOTE: LinearSVC's solver can draw on a random number generator, so an
# unseeded estimator may produce a slightly different fitted model on each run.
clf = LinearSVC(random_state=42)

# fit() returns the estimator itself, so `model` and `clf` refer to the same
# fitted object.
model = clf.fit(X_train_resampled, y_train_resampled)

# Evaluation

In [18]:
# Predict a label for every held-out document from its TF-IDF vector.
predictions = model.predict(X_test_tfidf)

In [19]:
# Report the confusion matrix
from sklearn import metrics

# Rows are the true labels and columns the predicted labels (scikit-learn's
# convention); build the matrix first, then print it.
confusion = metrics.confusion_matrix(y_test, predictions)
print(confusion)

[[ 3  0  0  0  0  0  0  0  0  0  1  0  3]
 [ 0  5  0  0  0  2  4  2  0  0  5  0  0]
 [ 0  0 10  0  0  0  0  1 10  0  2  1  0]
 [ 0  0  0 16  5  1  1  0  0  0  0  1  0]
 [ 0  0  1  7 21  0  0  0  0  1  0  0  1]
 [ 0  3  0  0  0 14  9  2  0  0  1  0  1]
 [ 0  1  0  0  0  3 15  2  0  0  5  0  0]
 [ 0  1  3  0  0  0  0 72  1  0  7  1  0]
 [ 0  0  4  0  0  0  0  1 25  0  0  1  0]
 [ 2  0  1  1  0  0  0  0  0  2  0  0  0]
 [ 0  7  0  0  0  0  4 20  0  0 30  0  0]
 [ 1  1  0  0  0  0  0  1  2  0  0 16  0]
 [ 4  0  0  2  0  0  0  0  0  0  0  1  9]]


In [20]:
# Print a classification report
# Build the text report of per-class precision, recall, F1 and support, then print it.
report = metrics.classification_report(y_test, predictions)
print(report)

                                                                               precision    recall  f1-score   support

                          Biomedic and Intelligent Assistive Technology (TAT)       0.30      0.43      0.35         7
                                 Evaluasi dan Pengelolaan Pendidikan Kejuruan       0.28      0.28      0.28        18
                            Game Technology and Machine Learning Applications       0.53      0.42      0.47        24
                         Intelligent Power Electronics and Smart Grid (IPESG)       0.62      0.67      0.64        24
                         Intelligent Power and Advanced energy System (IPAES)       0.81      0.68      0.74        31
                                       Ketenegakerjaan Teknologi dan Kejuruan       0.70      0.47      0.56        30
                                  Kurikulum Pendidikan Teknologi dan Kejuruan       0.45      0.58      0.51        26
          Pengembangan Aplikasi dan Media Pembe

In [27]:
# Print the overall metrics
# The scorers are called through the `metrics` module (imported with the
# confusion-matrix cell). The bare names accuracy_score / precision_score /
# recall_score / f1_score are never imported in this notebook, so calling them
# unqualified raises NameError on a fresh kernel.
# average='weighted' averages the per-class scores weighted by class support.
print('Accuracy score : ', metrics.accuracy_score(y_test, predictions))
print('Precision score : ', metrics.precision_score(y_test, predictions, average='weighted'))
print('Recall score : ', metrics.recall_score(y_test, predictions, average='weighted'))
print('F1 score : ', metrics.f1_score(y_test, predictions, average='weighted'))

Accuracy score :  0.6263157894736842
Precision score :  0.6303008330215241
Recall score :  0.6263157894736842
F1 score :  0.6212402131664754


# Export Model

In [28]:
from joblib import dump

# The classifier only accepts TF-IDF vectors produced by the fitted
# `vectorizer`, so the vectorizer is persisted as well; without it the exported
# model cannot be applied to new raw text.
dump(vectorizer, filename="tfidf-vectorizer.joblib")

# Persist the trained classifier. Kept as the last expression so the cell
# output still lists the model file.
dump(model, filename="svm-model.joblib")

['svm-model.joblib']