# Load Preprocessed Data

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('cleaned-data-kbk.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,judul,abstrak,kbk,judul_tokens,abstrak_tokens
0,0,1,pengembangan sistem pendukung keputusan untuk ...,sistem pendukung keputusan spk merupakan suatu...,Pengembangan Aplikasi dan Media Pembelajaran T...,kembang sistem dukung putus untuk tentu dosen ...,sistem dukung putus spk rupa suatu sistem yang...
1,1,3,hubungan efikasi diri dengan kesiapan kerja lu...,pandemi covid 19 yang melanda dunia terutama i...,Ketenegakerjaan Teknologi dan Kejuruan,hubung efikasi diri dengan kesiap kerja lulus ...,pandemi covid 19 yang landa dunia utama indone...
2,2,4,alat bantu penyandang tuetra berbasis deteksi ...,tujuan dilakukannya penelitian ini untuk memba...,Biomedic and Intelligent Assistive Technology ...,alat bantu sandang tuetra bas deteksi objek ca...,tuju laku teliti ini untuk bantu sandang tuetr...
3,3,6,analisis thermovisi penghantar akibat transmis...,gardu induk waru merupakan sub transmisi listr...,Intelligent Power Electronics and Smart Grid (...,analisis thermovisi hantar akibat transmission...,gardu induk waru rupa sub transmisi listrik ya...
4,4,7,pengembangan modulberbasis production based ed...,mata pelajaran dasar desain grafis merupakan m...,Pengembangan Aplikasi dan Media Pembelajaran T...,kembang modulberbasis production based educati...,mata ajar dasar desain grafis rupa mata ajar y...


In [2]:
df['judtrak'] = [' '.join(i) for i in zip(df['judul'], df['abstrak'])]

# Split Data

In [3]:
from sklearn.model_selection import train_test_split

X = df['judtrak']
y = df['kbk']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature Extraction: TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_train_tfidf.shape

(884, 12047)

In [5]:
pd.set_option('display.max_rows', None)


# Oversampling: SMOTE

In [6]:
pd.reset_option("^display")

In [7]:
terms = vectorizer.get_feature_names()

# sum tfidf frequency of each term through documents
sums = X_train_tfidf.sum(axis=0)

# connecting term to its sums frequency
data = []
for col, term in enumerate(terms):
    data.append( (term, sums[0,col] ))

ranking = pd.DataFrame(data, columns=['term','rank'])
print(ranking.sort_values('rank', ascending=False))

               term       rank
12002          yang  78.149528
2172            dan  66.565850
8106   pembelajaran  55.467583
10354         siswa  49.920031
2306         dengan  40.473841
...             ...        ...
4549           isbn   0.010362
399        872e4de9   0.010362
9892       sciences   0.010362
5218           kesy   0.010362
4592        jakarta   0.010362

[12047 rows x 2 columns]




In [8]:
y_train.value_counts()

Pengembangan Aplikasi dan Media Pembelajaran Teknologi dan Kejuruan              194
Strategi Pembelajaran Teknologi dan Kejuruan                                     147
Kurikulum Pendidikan Teknologi dan Kejuruan                                       72
Intelligent Power and Advanced energy System (IPAES)                              68
Rekayasa pengetahuan dan ilmu data (Knowledge Engineering and Data Science)       66
Intelligent Power Electronics and Smart Grid (IPESG)                              64
Ketenegakerjaan Teknologi dan Kejuruan                                            54
Game Technology and Machine Learning Applications                                 53
Evaluasi dan Pengelolaan Pendidikan Kejuruan                                      47
Telematics loT System and Devices                                                 44
Teknologi Digital Cerdas (Ubiquitous Computing Technique)                         41
Biomedic and Intelligent Assistive Technology (TAT)              

In [9]:
from imblearn.over_sampling import SMOTE

X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(X_train_tfidf, y_train)

y_train_resampled.value_counts()

Game Technology and Machine Learning Applications                                194
Teknologi Digital Cerdas (Ubiquitous Computing Technique)                        194
Strategi Pembelajaran Teknologi dan Kejuruan                                     194
Rekayasa pengetahuan dan ilmu data (Knowledge Engineering and Data Science)      194
Biomedic and Intelligent Assistive Technology (TAT)                              194
Pengembangan Aplikasi dan Media Pembelajaran Teknologi dan Kejuruan              194
Intelligent Power Electronics and Smart Grid (IPESG)                             194
Evaluasi dan Pengelolaan Pendidikan Kejuruan                                     194
Sistem Dinamis, Kendali, dan Robotika (Dynamic Systems, Control and Robotics)    194
Kurikulum Pendidikan Teknologi dan Kejuruan                                      194
Intelligent Power and Advanced energy System (IPAES)                             194
Ketenegakerjaan Teknologi dan Kejuruan                           

# Training Model

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

parameters = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
dtc = DecisionTreeClassifier()
clf = GridSearchCV(dtc, parameters, cv=10)

model = clf.fit(X_train_resampled,y_train_resampled)
sorted(model.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_criterion',
 'param_max_depth',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'split5_test_score',
 'split6_test_score',
 'split7_test_score',
 'split8_test_score',
 'split9_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

# Evaluation

In [11]:
predictions = model.predict(X_test_tfidf)

In [12]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[ 1  0  5  1  0  0  0  3  1  0  0  0  2]
 [ 0  7  0  0  0  2  1  0  0  0  5  0  0]
 [ 1  1 12  0  0  0  0  6  5  0  0  2  0]
 [ 0  0  2  9  7  0  0  0  2  0  0  1  3]
 [ 0  1  2  6 13  0  0  0  0  0  0  0  0]
 [ 0  5  1  1  1 14  2  3  3  0  1  0  0]
 [ 0  4  1  0  0  5  3  2  0  0  7  1  0]
 [ 1  4  4  0  2  2  3 45  0  1 22  2  2]
 [ 0  0  8  0  2  0  0  2 18  2  0  1  0]
 [ 0  0  1  1  1  0  0  1  0  2  0  0  1]
 [ 1  3  0  0  0  3  5 15  2  0 31  2  0]
 [ 1  1  1  1  0  0  0  2  2  4  3  6  0]
 [ 1  0  1  4  1  0  0  3  0  0  0  0  4]]


In [13]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

                                                                               precision    recall  f1-score   support

                          Biomedic and Intelligent Assistive Technology (TAT)       0.17      0.08      0.11        13
                                 Evaluasi dan Pengelolaan Pendidikan Kejuruan       0.27      0.47      0.34        15
                            Game Technology and Machine Learning Applications       0.32      0.44      0.37        27
                         Intelligent Power Electronics and Smart Grid (IPESG)       0.39      0.38      0.38        24
                         Intelligent Power and Advanced energy System (IPAES)       0.48      0.59      0.53        22
                                       Ketenegakerjaan Teknologi dan Kejuruan       0.54      0.45      0.49        31
                                  Kurikulum Pendidikan Teknologi dan Kejuruan       0.21      0.13      0.16        23
          Pengembangan Aplikasi dan Media Pembe

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Print the overall metrics
print('Accuracy score : ', accuracy_score(y_test, predictions))
print('Precision score : ', precision_score(y_test, predictions, average='weighted'))
print('Recall score : ', recall_score(y_test, predictions, average='weighted'))
print('F1 score : ', f1_score(y_test, predictions, average='weighted'))

Accuracy score :  0.4342105263157895
Precision score :  0.43448999386505077
Recall score :  0.4342105263157895
F1 score :  0.429662633547873


# Export Model

In [15]:
# from joblib import dump

# dump(model, filename="svm-model.joblib")