# Load Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv("data/dataset_mooc_label.csv")
train_df.head()

Unnamed: 0,kalimat,type
0,Algorithm and Object Oriented Programming Methods,nm
1,Mata kuliah algoritma dan metode object orient...,dm
2,CP 1: menggunakan sintaks dan fungsi di dalam ...,cp
3,1. Pengertian Algoritma dan Pemrograman; Perbe...,pb
4,Basis Data,nm


In [3]:
label = train_df['type'].unique()
label_to_id = {}
assign_id = 0
for lbl in label:
    label_to_id[lbl] = assign_id
    assign_id += 1  ## Get a new id for label
    
##  Print the dictionary created
for key, values in label_to_id.items():
    print (key, values)

nm 0
dm 1
cp 2
pb 3


In [4]:
## convert the labels to id --> So when we predict the result humans can understand

id_to_label = {v: k for k, v in label_to_id.items()}
for key, values in id_to_label.items():
    print (key, values)

0 nm
1 dm
2 cp
3 pb


In [5]:
## Add a new column to pandas dataframe, with the author name mapping
def get_label_id(label):
    return label_to_id[label]

train_df['label_id'] = train_df['type'].map(get_label_id)

In [6]:
kalimat = train_df["kalimat"].values
kalimat

array(['Algorithm and Object Oriented Programming Methods',
       'Mata kuliah algoritma dan metode object oriented programming memuat konsep-konsep dasar dalam membuat algoritma dan pemrograman yang berorientasi object dengan menggunakan bahasa C++. Fitur utamanya adalah element dasar dalam c++, input output stream, struktur data, control structures, fungsi, algoritma pengurutan data, algoritma pencarian, file stream. inheritance, polymorphism dan generic programming.',
       'CP 1: menggunakan sintaks dan fungsi di dalam bahasa C++', ...,
       'algoritma terdistribusi yang andal bagian 1',
       'kursus ini adalah kursus pertama dalam rangkaian dua. kedua kursus memberikan landasan yang kuat di bidang komputasi terdistribusi yang andal termasuk model hasil konsep utama dan algoritma di lapangan. ntoday global it infrastructure adalah sistem terdistribusi dari internet ke pusat data komputasi awan yang menjadi bahan bakar revolusi layanan global saat ini. di inti layanan ini Anda

In [7]:
train_df

Unnamed: 0,kalimat,type,label_id
0,Algorithm and Object Oriented Programming Methods,nm,0
1,Mata kuliah algoritma dan metode object orient...,dm,1
2,CP 1: menggunakan sintaks dan fungsi di dalam ...,cp,2
3,1. Pengertian Algoritma dan Pemrograman; Perbe...,pb,3
4,Basis Data,nm,0
5,Mata kuliah ini menjelaskan tentang konsep dan...,dm,1
6,Konsep Basis Data Objek Basis Data Relasiona...,pb,3
7,data center management menggunakan docker cont...,nm,0
8,process isolation diperlukan dalam rangka memb...,dm,1
9,1. Siswa mampu memahami cara kerja docker cont...,cp,2


# Preproses Dataset

In [8]:
import json
from tqdm import tqdm_notebook as tqdm
from polyglot.downloader import downloader
from polyglot.text import Text
import re

#download data pos tagging
downloader.download("embeddings2.id")
downloader.download("pos2.id")

#membersihkan data text
def cleanText (text):
    textClear = re.sub ('[^ a-zA-Z]', ' ',text.lower())
    return textClear

def getPosTag(tagText):
    jmlKata = len(tagText.split())
    propn = 0
    takso = 0
    naon = 0
    subyek = 0
    posTag=Text(tagText, hint_language_code='id')

    #menambahkan rule
    listTag=['NOUN', 'PROPN']
    bloom = open("data/taksonomi.txt").read().splitlines()
    subjek = open("data/subjek.txt").read().splitlines()
    
    for kata in posTag.pos_tags:
        if kata[0] in subjek:
            subyek = 1
        if kata[0] in bloom:
            takso = 1
        if kata[1] in listTag:
            if kata[1] == 'PROPN':
                propn = 1
            naon += 1
#     naon = (naon/jmlKata)
#     propn = (propn/jmlKata)
#     takso = (takso/jmlKata)
    naon = (naon/jmlKata)
    return propn, takso, naon, subyek, jmlKata

#open file
deskr=[]

for index, desk in tqdm(enumerate(kalimat)):
#     print(desk)
    ct={}   
    ct['kalimat'] = cleanText(desk)
    ct['propn'],ct['takso'], ct['naon'], ct['subyek'], ct['jk']=getPosTag(ct['kalimat'])
    ct['label']=train_df["label_id"][index]
    deskr.append(ct)

[polyglot_data] Downloading package embeddings2.id to C:\Users\Purina
[polyglot_data]     QA\AppData\Roaming\polyglot_data...
[polyglot_data]   Package embeddings2.id is already up-to-date!
[polyglot_data] Downloading package pos2.id to C:\Users\Purina
[polyglot_data]     QA\AppData\Roaming\polyglot_data...
[polyglot_data]   Package pos2.id is already up-to-date!


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [9]:
deskr_p = pd.DataFrame(deskr)
deskr_p

Unnamed: 0,jk,kalimat,label,naon,propn,subyek,takso
0,6,algorithm and object oriented programming methods,0,0.833333,1,0,0
1,51,mata kuliah algoritma dan metode object orient...,1,0.686275,1,1,1
2,9,cp menggunakan sintaks dan fungsi di dalam ...,2,0.555556,1,0,1
3,52,pengertian algoritma dan pemrograman perbe...,3,0.769231,1,0,0
4,2,basis data,0,1.000000,0,0,0
5,86,mata kuliah ini menjelaskan tentang konsep dan...,1,0.639535,1,1,1
6,24,konsep basis data objek basis data relasiona...,3,0.916667,1,0,0
7,6,data center management menggunakan docker cont...,0,0.833333,1,0,1
8,44,process isolation diperlukan dalam rangka memb...,1,0.522727,1,1,0
9,7,siswa mampu memahami cara kerja docker cont...,2,0.714286,1,1,0


In [10]:
deskr_p.to_csv('data/fiturntm.csv')

# NLP Without Machine Learning

In [11]:
from sklearn.metrics import confusion_matrix
countAcc = 0
pred = []
deskr_ptr=deskr_p.transpose()

for index, data in enumerate(deskr_ptr):
    kata = len(deskr_ptr[index]['kalimat'].split())
    if (kata<=12) and (deskr_ptr[index]["naon"] >= 0.5) and (deskr_ptr[index]["subyek"] == 0) and (deskr_ptr[index]["label"]==0):
        countAcc += 1
        pred.append(0)
    elif (deskr_ptr[index]["subyek"] == 1) and (deskr_ptr[index]["propn"] == 1) and(deskr_ptr[index]["label"]==1):
        countAcc += 1
        pred.append(1)
    elif (deskr_ptr[index]["subyek"] == 0) and (deskr_ptr[index]["takso"] == 1) and (deskr_ptr[index]["label"]==2):
        countAcc += 1
        pred.append(2)
    elif (kata>12) and (deskr_ptr[index]["naon"] >= 0.5) and (deskr_ptr[index]["subyek"] == 0) and (deskr_ptr[index]["label"]==3):
        countAcc += 1
        pred.append(3)
    else:
        pred.append(4)

summ = np.sum(confusion_matrix(deskr_p['label'], pred))
a = (countAcc / summ) * 100
print ('akurasi: ', a)

akurasi:  71.74295774647888


# UJI COBA

In [12]:
import json
from tqdm import tqdm_notebook as tqdm
from polyglot.downloader import downloader
from polyglot.text import Text
import re

#download data pos tagging
downloader.download("embeddings2.id")
downloader.download("pos2.id")

#membersihkan data text
def cleanText (text):
    textClear = re.sub ('[^ a-zA-Z]', ' ',text.lower())
    return textClear

def getPosTag(tagText):
    jmlKata = len(tagText.split())
    propn = 0
    takso = 0
    naon = 0
    subyek = 0
    posTag=Text(tagText, hint_language_code='id')

    #menambahkan rule
    listTag=['NOUN', 'PROPN']
    bloom = open("data/taksonomi.txt").read().splitlines()
    subjek = open("data/subjek.txt").read().splitlines()
    
    for kata in posTag.pos_tags:
        if kata[0] in subjek:
            subyek = 1
        if kata[0] in bloom:
            takso = 1
        if kata[1] in listTag:
            if kata[1] == 'PROPN':
                propn = 1
            naon += 1
    naon = (naon/jmlKata)
    return propn, takso, naon, subyek, jmlKata

desk = "Materi kuliah ini adalah: Pengantar Data Besar, Statistics Descriptive, Association Rules, Clustering, Classification."
ct={}   
ct['kalimat'] = cleanText(desk)
ct['propn'],ct['takso'], ct['naon'], ct['subyek'], ct['jk']=getPosTag(ct['kalimat'])

# print(ct['kalimat'])
# print(ct['propn'])
# print(ct['takso'])
# print( ct['naon'])
# print(ct['subyek'])
# print(ct['jk'])

print('\nHasil:')
if (ct['jk']<=12) and (ct["naon"] >= 0.5) and (ct["subyek"] == 0):
    print ('Nama Mata Kuliah')
elif (ct["subyek"] == 1) and (ct["propn"] == 1):
     print ('Deskripsi Mata Kuliah')
elif (ct["subyek"] == 0) and (ct["takso"] == 1):
     print ('Capaian Pembelajaran')
elif (ct['jk']>12) and (ct[index]["naon"] >= 0.5) and (ct["subyek"] == 0):
     print ('Pokok Bahasan')

[polyglot_data] Downloading package embeddings2.id to C:\Users\Purina
[polyglot_data]     QA\AppData\Roaming\polyglot_data...
[polyglot_data]   Package embeddings2.id is already up-to-date!
[polyglot_data] Downloading package pos2.id to C:\Users\Purina
[polyglot_data]     QA\AppData\Roaming\polyglot_data...
[polyglot_data]   Package pos2.id is already up-to-date!

Hasil:
Deskripsi Mata Kuliah
