# Load Dataset

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the labelled sentences; the preview shows a text column (`kalimat`)
# and a class column (`type`).
train_df = pd.read_csv("data/dataset_mooc_label.csv")
train_df.head()

Unnamed: 0,kalimat,type
0,Algorithm and Object Oriented Programming Methods,nm
1,Mata kuliah algoritma dan metode object orient...,dm
2,CP 1: menggunakan sintaks dan fungsi di dalam ...,cp
3,1. Pengertian Algoritma dan Pemrograman; Perbe...,pb
4,Basis Data,nm


In [3]:
# Give every distinct value of the `type` column a consecutive integer id,
# numbered in the order the values first appear in the data.
label = train_df['type'].unique()
label_to_id = {lbl: assign_id for assign_id, lbl in enumerate(label)}

# Show the resulting label -> id mapping
for key in label_to_id:
    print (key, label_to_id[key])

nm 0
dm 1
cp 2
pb 3


In [4]:
# Invert the mapping (id -> label) so predicted ids can be reported
# as human-readable labels.
id_to_label = dict(zip(label_to_id.values(), label_to_id.keys()))

for label_id, label_name in id_to_label.items():
    print (label_id, label_name)

0 nm
1 dm
2 cp
3 pb


In [5]:
# Add a `label_id` column holding the integer id of each row's `type`.
def get_label_id(label):
    """Look up the integer id assigned to ``label`` in ``label_to_id``.

    Raises KeyError when ``label`` has no entry in the mapping.
    """
    label_id = label_to_id[label]
    return label_id


train_df['label_id'] = train_df['type'].apply(get_label_id)

In [6]:
# Raw sentences as a NumPy array; the bare name on the last line makes the
# notebook display it as the cell output.
kalimat = train_df["kalimat"].values
kalimat

array(['Algorithm and Object Oriented Programming Methods',
       'Mata kuliah algoritma dan metode object oriented programming memuat konsep-konsep dasar dalam membuat algoritma dan pemrograman yang berorientasi object dengan menggunakan bahasa C++. Fitur utamanya adalah element dasar dalam c++, input output stream, struktur data, control structures, fungsi, algoritma pengurutan data, algoritma pencarian, file stream. inheritance, polymorphism dan generic programming.',
       'CP 1: menggunakan sintaks dan fungsi di dalam bahasa C++', ...,
       'algoritma terdistribusi yang andal bagian 1',
       'kursus ini adalah kursus pertama dalam rangkaian dua. kedua kursus memberikan landasan yang kuat di bidang komputasi terdistribusi yang andal termasuk model hasil konsep utama dan algoritma di lapangan. ntoday global it infrastructure adalah sistem terdistribusi dari internet ke pusat data komputasi awan yang menjadi bahan bakar revolusi layanan global saat ini. di inti layanan ini Anda

In [7]:
# Display the frame to check the newly added `label_id` column.
train_df

Unnamed: 0,kalimat,type,label_id
0,Algorithm and Object Oriented Programming Methods,nm,0
1,Mata kuliah algoritma dan metode object orient...,dm,1
2,CP 1: menggunakan sintaks dan fungsi di dalam ...,cp,2
3,1. Pengertian Algoritma dan Pemrograman; Perbe...,pb,3
4,Basis Data,nm,0
5,Mata kuliah ini menjelaskan tentang konsep dan...,dm,1
6,Konsep Basis Data Objek Basis Data Relasiona...,pb,3
7,data center management menggunakan docker cont...,nm,0
8,process isolation diperlukan dalam rangka memb...,dm,1
9,1. Siswa mampu memahami cara kerja docker cont...,cp,2


# Preprocess Dataset

In [8]:
import json
from tqdm import tqdm_notebook as tqdm
from polyglot.downloader import downloader
from polyglot.text import Text
import re

# Fetch the Indonesian polyglot resources (embeddings and POS tagger model)
for polyglot_package in ("embeddings2.id", "pos2.id"):
    downloader.download(polyglot_package)

# Clean the raw text
def cleanText (text):
    """Lower-case ``text`` and replace every character that is not an ASCII
    letter or a space with a single space.

    Replacement is one-for-one, so the length of the string is preserved and
    runs of removed characters become runs of spaces.
    """
    lowered = text.lower()
    return re.sub('[^ a-zA-Z]', ' ', lowered)

# Word lists already read from disk, keyed by file path, so each file is read
# once instead of once per sentence. Re-running this cell resets the cache.
_WORD_LIST_CACHE = {}


def _loadWordList(path):
    """Return the lines of ``path`` as a set, reading the file at most once."""
    if path not in _WORD_LIST_CACHE:
        # The context manager closes the handle; the original left it open.
        with open(path) as handle:
            _WORD_LIST_CACHE[path] = set(handle.read().splitlines())
    return _WORD_LIST_CACHE[path]


def getPosTag(tagText):
    """Compute the hand-crafted features of one (already cleaned) sentence.

    Parameters
    ----------
    tagText : str
        Text to analyse; it is POS-tagged with polyglot using the
        Indonesian language hint.

    Returns
    -------
    tuple
        ``(propn, takso, noun, subyek, jmlKata)`` where

        * ``propn``   - number of tokens tagged ``PROPN`` divided by ``jmlKata``
        * ``takso``   - number of tokens listed in ``data/taksonomi.txt``
          divided by ``jmlKata``
        * ``noun``    - number of tokens tagged ``NOUN`` divided by ``jmlKata``
        * ``subyek``  - 1 if any token is listed in ``data/subjek.txt``, else 0
        * ``jmlKata`` - number of whitespace-separated words in ``tagText``

        When ``tagText`` contains no words, all ratios are ``0.0`` and the
        counts are ``0`` (the ratios would otherwise divide by zero).
    """
    jmlKata = len(tagText.split())
    if jmlKata == 0:
        # Nothing to tag; avoid dividing by zero below.
        return 0.0, 0.0, 0.0, 0, 0

    # Rule word lists (sets give constant-time membership checks)
    bloom = _loadWordList("data/taksonomi.txt")
    subjek = _loadWordList("data/subjek.txt")

    propn = 0
    takso = 0
    noun = 0
    subyek = 0
    posTag = Text(tagText, hint_language_code='id')

    for kata, tag in posTag.pos_tags:
        if kata in subjek:
            subyek = 1
        if kata in bloom:
            takso += 1
        if tag == 'PROPN':
            propn += 1
        elif tag == 'NOUN':
            noun += 1

    # Normalise the counts by the sentence length
    propn = propn / jmlKata
    takso = takso / jmlKata
    noun = noun / jmlKata
    return propn, takso, noun, subyek, jmlKata

# Build one feature record per sentence
deskr = []

# Wrap the array itself (not the enumerate object) so tqdm can read its
# length and show a real total instead of an indeterminate bar.
for index, desk in enumerate(tqdm(kalimat)):
    ct = {}
    ct['kalimat'] = cleanText(desk)
    ct['propn'], ct['takso'], ct['noun'], ct['subyek'], ct['jk'] = getPosTag(ct['kalimat'])
    # `index` is a row position, so look the label up positionally rather
    # than by index label.
    ct['label'] = train_df["label_id"].iloc[index]
    deskr.append(ct)

[polyglot_data] Downloading package embeddings2.id to C:\Users\Purina
[polyglot_data]     QA\AppData\Roaming\polyglot_data...
[polyglot_data]   Package embeddings2.id is already up-to-date!
[polyglot_data] Downloading package pos2.id to C:\Users\Purina
[polyglot_data]     QA\AppData\Roaming\polyglot_data...
[polyglot_data]   Package pos2.id is already up-to-date!


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [9]:
# Hand-crafted features only (no TF-IDF columns yet); the sentence text is
# left out of the printout.
deskr_p = pd.DataFrame(deskr)
handcrafted_only = deskr_p.drop('kalimat', axis=1)
print(handcrafted_only)

       jk  label      noun     propn  subyek     takso
0       6      0  0.666667  0.166667       0  0.000000
1      51      1  0.666667  0.019608       1  0.019608
2       9      2  0.333333  0.222222       0  0.222222
3      52      3  0.653846  0.115385       0  0.000000
4       2      0  1.000000  0.000000       0  0.000000
5      86      1  0.581395  0.058140       1  0.023256
6      24      3  0.833333  0.083333       0  0.000000
7       6      0  0.166667  0.666667       0  0.166667
8      44      1  0.431818  0.090909       1  0.000000
9       7      2  0.428571  0.285714       1  0.000000
10      6      3  0.500000  0.500000       0  0.000000
11      3      0  1.000000  0.000000       0  0.000000
12     71      1  0.464789  0.000000       1  0.000000
13     21      2  0.476190  0.000000       1  0.095238
14     52      3  0.673077  0.019231       1  0.000000
15      2      0  1.000000  0.000000       0  0.000000
16     95      1  0.642105  0.084211       1  0.010526
17    131 

# TF-IDF

In [13]:
# Pull the cleaned sentences and their label ids out as NumPy arrays.
# `.values` replaces `.as_matrix()`, which pandas deprecated and removed in
# 1.0; the frame is also built once instead of once per column.
deskr_frame = pd.DataFrame(deskr)
kalimat = deskr_frame['kalimat'].values
label = deskr_frame['label'].values

  """Entry point for launching an IPython kernel.
  


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): the vectorizer is fitted on every sentence before the
# train/test split performed later in this notebook, so held-out rows
# influence the vocabulary and IDF weights. Confirm this is intended.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    max_features=150,
)
tv = tfidf.fit_transform(kalimat)

# Persist the fitted vectorizer so the same vocabulary can be reused later
with open('data/model/vectorizer.pkl', 'wb') as tvc:
    pickle.dump(tfidf, tvc)

features = tv.toarray()
labels = label

# `get_feature_names()` was removed from newer scikit-learn releases in
# favour of `get_feature_names_out()`; support both.
if hasattr(tfidf, 'get_feature_names_out'):
    featureName = list(tfidf.get_feature_names_out())
else:
    featureName = tfidf.get_feature_names()

In [15]:
# Display the vectorizer; its repr lists the parameters in effect.
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='latin-1', input='content',
        lowercase=True, max_df=1.0, max_features=150, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
# Number of rows in the TF-IDF feature matrix
features.shape[0]

1136

In [17]:
# Append the TF-IDF features as extra columns of deskr_p.
# A TF-IDF term spelled like one of the existing columns would silently
# overwrite that column (including the label), so refuse to continue.
reserved_columns = {'kalimat', 'propn', 'takso', 'noun', 'subyek', 'jk', 'label'}
clashing_terms = reserved_columns.intersection(featureName)
if clashing_terms:
    raise ValueError(
        "TF-IDF terms collide with existing columns: %s" % sorted(clashing_terms)
    )

for index, ftr in enumerate(featureName):
    deskr_p[ftr] = features[:, index]

In [18]:
# Write the combined feature table (text, label, hand-crafted and TF-IDF columns) to CSV.
deskr_p.to_csv('data/fitur.csv')

In [19]:
# Display the feature table to check the appended TF-IDF columns.
deskr_p

Unnamed: 0,jk,kalimat,label,noun,propn,subyek,takso,adalah,adalah bagian,akan,...,tim,topik,tugas,untuk,virtual,waktu,web,windows,yang,yang lebih
0,6,algorithm and object oriented programming methods,0,0.666667,0.166667,0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
1,51,mata kuliah algoritma dan metode object orient...,1,0.666667,0.019608,1,0.019608,0.160514,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.107665,0.000000
2,9,cp menggunakan sintaks dan fungsi di dalam ...,2,0.333333,0.222222,0,0.222222,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
3,52,pengertian algoritma dan pemrograman perbe...,3,0.653846,0.115385,0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
4,2,basis data,0,1.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
5,86,mata kuliah ini menjelaskan tentang konsep dan...,1,0.581395,0.058140,1,0.023256,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.087035,0.000000
6,24,konsep basis data objek basis data relasiona...,3,0.833333,0.083333,0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
7,6,data center management menggunakan docker cont...,0,0.166667,0.666667,0,0.166667,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
8,44,process isolation diperlukan dalam rangka memb...,1,0.431818,0.090909,1,0.000000,0.000000,0.000000,0.159989,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.293881,0.0,0.0,0.267831,0.475063
9,7,siswa mampu memahami cara kerja docker cont...,2,0.428571,0.285714,1,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000


# Prepare Dataset

In [20]:
#Bisa dipakai untuk klasifikasi dengan crossval
from sklearn.model_selection import cross_validate

# Feature matrix: every column except the target and the raw text
x = deskr_p.drop(['label', 'kalimat'], axis=1)
# Target: the integer label id
y = deskr_p.loc[:, 'label']

In [21]:
#Split Dataset untuk klasifikasi tanpa crossval
from sklearn.model_selection import train_test_split

# Hold-out split for the experiments that do not use cross-validation.
# Stratifying on y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=0,
    stratify=y,
)

In [22]:
# Persist the arrays: first the full set (for cross-validation), then the
# hold-out split (for the non-cross-validated runs).
for npy_path, array in (
    ("data/data_split/X.npy", x),
    ("data/data_split/y.npy", y),
    ("data/data_split/X_train.npy", X_train),
    ("data/data_split/y_train.npy", y_train),
    ("data/data_split/X_test.npy", X_test),
    ("data/data_split/y_test.npy", y_test),
):
    np.save(npy_path, array)

# Oversampling dengan SMOTE

In [23]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only, so the test split keeps its original
# class distribution.
sm = SMOTE(random_state=0)
# `fit_resample` is the supported method name; `fit_sample` was a deprecated
# alias that newer imbalanced-learn releases no longer provide.
ftr_smt, cls_smt = sm.fit_resample(X_train, y_train)

In [24]:
# Display the SMOTE instance; its repr lists the parameters in effect.
sm

SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=0, ratio=None,
   sampling_strategy='auto', svm_estimator='deprecated')

In [25]:
from collections import Counter
# Compare the per-class sample counts before and after SMOTE
original_counts = Counter(y_train)
smote_counts = Counter(cls_smt)
print('Original dataset shape %s' % original_counts)
print('SMOTE dataset shape %s' % smote_counts)

Original dataset shape Counter({0: 252, 1: 251, 2: 245, 3: 13})
SMOTE dataset shape Counter({0: 252, 1: 252, 2: 252, 3: 252})


In [26]:
# Persist the SMOTE-resampled features and labels
for npy_path, array in (
    ('data/oversampling/ftr_smote.npy', ftr_smt),
    ('data/oversampling/cls_smote.npy', cls_smt),
):
    np.save(npy_path, array)

# Oversampling dengan ADASYN

In [27]:
from imblearn.over_sampling import ADASYN

# Oversample the training split with ADASYN, using the 'minority'
# sampling strategy.
ada = ADASYN(
    random_state=0,
    sampling_strategy='minority',
)
adasyn_resampled = ada.fit_resample(X_train, y_train)
ftr_ada, cls_ada = adasyn_resampled

In [28]:
# Display the ADASYN instance; its repr lists the parameters in effect.
ada

ADASYN(n_jobs=1, n_neighbors=5, random_state=0, ratio=None,
    sampling_strategy='minority')

In [29]:
from collections import Counter
# Compare the per-class sample counts before and after ADASYN
original_counts = Counter(y_train)
adasyn_counts = Counter(cls_ada)
print('Original dataset shape %s' % original_counts)
print('Adasyn dataset shape %s' % adasyn_counts)

Original dataset shape Counter({0: 252, 1: 251, 2: 245, 3: 13})
Adasyn dataset shape Counter({0: 252, 1: 251, 3: 247, 2: 245})


In [30]:
# Persist the ADASYN-resampled features and labels
for npy_path, array in (
    ('data/oversampling/ftr_adasyn.npy', ftr_ada),
    ('data/oversampling/cls_adasyn.npy', cls_ada),
):
    np.save(npy_path, array)