In [1]:
import numpy as np
import pickle
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Model Classifier

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load model files that come from a trusted source.
def _load_pickle(path):
    """Deserialize one pickled object from `path`.

    The file is opened in binary mode inside a `with` block so the handle is
    closed as soon as the object has been read, instead of being left open.
    """
    with open(path, "rb") as berkas:
        return pickle.load(berkas)


# SVM
svm = _load_pickle("data/model/svm_linear_model.pkl")
svm_cv = _load_pickle("data/model/svm_cv_model.pkl")
svm_ada = _load_pickle("data/model/svm_ada_model.pkl")
svm_smt = _load_pickle("data/model/svm_smt_model.pkl")

# RandomForest
rf = _load_pickle("data/model/rf_model.pkl")
rf_cv = _load_pickle("data/model/rf_cv_model.pkl")
rf_ada = _load_pickle("data/model/rf_ada_model.pkl")
rf_smt = _load_pickle("data/model/rf_smt_model.pkl")

# NaiveBayes
nb = _load_pickle("data/model/nb_model.pkl")
nb_cv = _load_pickle("data/model/nb_cv_model.pkl")
nb_ada = _load_pickle("data/model/nb_ada_model.pkl")
nb_smt = _load_pickle("data/model/nb_smt_model.pkl")

# TFIDF vectorizer, used later to turn a sentence into TF-IDF features
tfidf = _load_pickle("data/model/vectorizer.pkl")

# Praproses Data Baru

In [3]:
import json
from tqdm import tqdm_notebook as tqdm
from polyglot.downloader import downloader
from polyglot.text import Text
import re

# Download the polyglot data packages used for POS tagging
for paket in ("embeddings2.id", "pos2.id"):
    downloader.download(paket)

# Clean the raw text
def cleanText (text):
    """Return `text` lowercased, with every character that is not a space
    or an ASCII letter replaced by a single space.

    Replacements are one-for-one, so the result has the same length as the
    lowercased input and runs of punctuation/digits become runs of spaces.
    """
    lowered = text.lower()
    return re.sub('[^ a-zA-Z]', ' ', lowered)

def getPosTag(tagText):
    """Build the feature list for one (already cleaned) sentence.

    Parameters
    ----------
    tagText : str
        Sentence to featurize. Words are counted by whitespace splitting.

    Returns
    -------
    list
        ``[jmlKata, noun, propn, subyek, takso]`` followed by the TF-IDF
        values produced by the notebook-level ``tfidf`` vectorizer, where:

        * ``jmlKata`` - number of whitespace-separated words,
        * ``noun``    - count of tokens tagged NOUN or PROPN / ``jmlKata``,
        * ``propn``   - 1 if any token is tagged PROPN, else 0, / ``jmlKata``,
        * ``subyek``  - 1 if any token appears in ``data/subjek.txt``, else 0,
        * ``takso``   - 1 if any token appears in ``data/taksonomi.txt``,
          else 0, / ``jmlKata``.

    Text without any word yields zeros for the five hand-crafted features
    instead of raising ``ZeroDivisionError``.
    """
    jmlKata = len(tagText.split())
    propn = 0
    takso = 0
    noun = 0
    subyek = 0

    # Rule word lists, one entry per line. `with` closes the files again and
    # sets make the per-token membership checks constant time.
    with open("data/taksonomi.txt") as berkas:
        bloom = set(berkas.read().splitlines())
    with open("data/subjek.txt") as berkas:
        subjek = set(berkas.read().splitlines())

    listTag = ('NOUN', 'PROPN')

    # Nothing to tag when the text holds no words; all counters stay at 0.
    if jmlKata:
        posTag = Text(tagText, hint_language_code='id')
        for entri in posTag.pos_tags:
            kata, tag = entri[0], entri[1]
            if kata in subjek:
                subyek = 1
            if kata in bloom:
                takso = 1
            if tag in listTag:
                if tag == 'PROPN':
                    propn = 1
                noun += 1

    # Avoid dividing by zero for empty / whitespace-only text.
    pembagi = jmlKata or 1
    # NOTE(review): propn and takso are 0/1 flags divided by the word count,
    # not counts. Kept unchanged so the features match what the loaded models
    # presumably were trained on - confirm against the training notebook.
    propn = propn / pembagi
    takso = takso / pembagi
    noun = noun / pembagi

    # TF-IDF representation of the single sentence, as a dense row.
    features = tfidf.transform([tagText]).toarray()
    return [jmlKata, noun, propn, subyek, takso] + features.tolist()[0]

def strToPredict(algo,string):
    """Classify one raw sentence with `algo` and return its class name.

    The sentence is cleaned with `cleanText`, featurized with `getPosTag`,
    reshaped into a single-row 2-D array and passed to `algo.predict`.

    Returns the name for predicted label 0, 1, 2 or 3, or None when the
    prediction equals none of them.
    """
    fitur = np.array(getPosTag(cleanText(string))).reshape(1, -1)
    hasil = algo.predict(fitur)

    # Position in this tuple is the numeric label the class name belongs to.
    nama_kelas = (
        "Nama Mata Kuliah",
        "Deskripsi Mata Kuliah",
        "Capaian Pembelajaran",
        "Pokok Bahasan",
    )
    for kode, nama in enumerate(nama_kelas):
        if hasil == kode:
            return nama
    return None

[polyglot_data] Downloading package embeddings2.id to C:\Users\Purina
[polyglot_data]     QA\AppData\Roaming\polyglot_data...
[polyglot_data]   Package embeddings2.id is already up-to-date!
[polyglot_data] Downloading package pos2.id to C:\Users\Purina
[polyglot_data]     QA\AppData\Roaming\polyglot_data...
[polyglot_data]   Package pos2.id is already up-to-date!


# Uji Coba dengan Data Baru

In [4]:
text = 'This tutorial has been prepared for the beginners to help them understand the basic to advanced concepts related to Java Programming language.'

# (printed label, classifier) pairs, in the order the results are shown.
daftar_model = [
    ("hasil_svm: ", svm),
    ("hasil_svm_cv: ", svm_cv),
    ("hasil_svm_ada: ", svm_ada),
    ("hasil_svm_smt: ", svm_smt),
    ("hasil_rf: ", rf),
    ("hasil_rf_cv: ", rf_cv),
    ("hasil_rf_ada: ", rf_ada),
    ("hasil_rf_smt: ", rf_smt),
    ("hasil_nb: ", nb),
    ("hasil_nb_cv: ", nb_cv),
    ("hasil_nb_ada: ", nb_ada),
    ("hasil_nb_smt: ", nb_smt),
]

# Run the same sentence through every classifier and print its prediction.
for judul, pengklasifikasi in daftar_model:
    print(judul, strToPredict(pengklasifikasi, text))

hasil_svm:  Capaian Pembelajaran
hasil_svm_cv:  Capaian Pembelajaran
hasil_svm_ada:  Pokok Bahasan
hasil_svm_smt:  Pokok Bahasan
hasil_rf:  Capaian Pembelajaran
hasil_rf_cv:  Capaian Pembelajaran
hasil_rf_ada:  Capaian Pembelajaran
hasil_rf_smt:  Capaian Pembelajaran
hasil_nb:  Capaian Pembelajaran
hasil_nb_cv:  Capaian Pembelajaran
hasil_nb_ada:  Pokok Bahasan
hasil_nb_smt:  Pokok Bahasan
