In [177]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit


In [178]:
df = pd.read_csv('dataset_with_split.csv', encoding='utf8')

In [179]:
df

Unnamed: 0,id,text,annotator,annotation_id,created_at,updated_at,lead_time,Pasien,Usia pasien,Penyakit,...,Pertanyaan,Pembuka,Penyebab,Prakondisi,Objek,Penutup,Referensi,Artikel,Pengukuran,Predikat
0,821-1-0,P,1,821,2022-11-16T05:55:53.144042Z,2022-11-16T05:55:53.144067Z,1.258,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,816-1-0,Salam,1,816,2022-11-15T13:58:27.795105Z,2022-11-15T13:58:27.795139Z,17.363,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,815-1-0,Semoga membantu ya,1,815,2022-11-15T13:58:09.067035Z,2022-11-15T13:58:09.067086Z,6.970,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,814-1-0,Hindari konsumsi Alkohol dan rokok,1,814,2022-11-15T13:57:53.571838Z,2022-11-15T13:58:00.947210Z,6.827,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,813-1-0,Olahraga rutin,1,813,2022-11-15T13:57:50.069477Z,2022-11-15T13:57:50.069509Z,1.720,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,15-2-735,"Jika ingin memastikan, tidak ada salahnya Anda...",2,15,2022-11-11T08:37:16.135665Z,2022-11-11T08:37:16.135688Z,3.011,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1475,9-2-0,"Di usia 3-5 tahun, sewajarnya kosa kata yang d...",2,9,2022-11-11T08:29:29.064414Z,2022-11-11T08:29:29.064437Z,131.017,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1476,9-2-196,Anak seusia ini pun seringnya sudah mampu mera...,2,9,2022-11-11T08:29:29.064414Z,2022-11-11T08:29:29.064437Z,131.017,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1477,9-2-331,"Saat mengalami hal yang menarik, ia pun bisa d...",2,9,2022-11-11T08:29:29.064414Z,2022-11-11T08:29:29.064437Z,131.017,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [180]:
df.columns

Index(['id', 'text', 'annotator', 'annotation_id', 'created_at', 'updated_at',
       'lead_time', 'Pasien', 'Usia pasien', 'Penyakit', 'Gejala', 'Kapan',
       'Periode', 'Tindakan', 'Outcome', 'Pertanyaan', 'Pembuka', 'Penyebab',
       'Prakondisi', 'Objek', 'Penutup', 'Referensi', 'Artikel', 'Pengukuran',
       'Predikat'],
      dtype='object')

In [181]:
categories = ['Pasien', 'Usia pasien', 'Penyakit', 'Gejala', 'Kapan',
              'Periode', 'Tindakan', 'Outcome', 'Pertanyaan', 'Pembuka', 'Penyebab',
              'Prakondisi', 'Objek', 'Penutup', 'Referensi', 'Artikel', 'Pengukuran',
              'Predikat']

# next iterasi bisa coba pakai stratified shuffle split (hlm. 55)
# split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.33)
# for train_index, test_index in split.split()
train, test = train_test_split(df, random_state=42, test_size=0.2, shuffle=True)
X_train = train.text
X_test = test.text
print(X_train.shape)
print(X_test.shape)

(1183,)
(296,)


In [182]:
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier
        (
        MultinomialNB(fit_prior=True, class_prior=None)
         )
     )
])

In [183]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    print(test[category].head())
    print(prediction[-len(test[category].head()) : ])
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

    # print('Test precision is {}'.format(precision_score(test[category], prediction, zero_division=0)))
    # print('Test recall is {}'.format(recall_score(test[category], prediction, zero_division=0)))
    # print('Test f1_score is {}'.format(f1_score(test[category], prediction, zero_division=0)))
    print()

... Processing Pasien
661    0
274    0
394    0
218    0
922    0
Name: Pasien, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.8614864864864865

... Processing Usia pasien
661    0
274    0
394    0
218    0
922    0
Name: Usia pasien, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.9459459459459459

... Processing Penyakit
661    0
274    0
394    1
218    1
922    0
Name: Penyakit, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.8817567567567568

... Processing Gejala
661    0
274    0
394    0
218    0
922    0
Name: Gejala, dtype: int64
[0 0 0 1 0]
Test accuracy is 0.8581081081081081

... Processing Kapan
661    0
274    0
394    0
218    0
922    0
Name: Kapan, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.9763513513513513

... Processing Periode
661    0
274    0
394    0
218    0
922    0
Name: Periode, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.972972972972973

... Processing Tindakan
661    0
274    0
394    1
218    0
922    1
Name: Tindakan, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.



In [189]:
category = 'Pasien'

print('... Processing {}'.format(category))
# train the model using X_dtm & y
NB_pipeline.fit(X_train, train[category])
# compute the testing accuracy
prediction = NB_pipeline.predict(X_test)
print(test[category].head())
print(prediction[-len(test[category].head()) : ])
print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
print(test[category], prediction)

# print('Test precision is {}'.format(precision_score(test[category], prediction, zero_division=0)))
# print('Test recall is {}'.format(recall_score(test[category], prediction, zero_division=0)))
# print('Test f1_score is {}'.format(f1_score(test[category], prediction, zero_division=0)))
print()

... Processing Pasien
661    0
274    0
394    0
218    0
922    0
Name: Pasien, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.8614864864864865
661     0
274     0
394     0
218     0
922     0
       ..
324     0
1467    0
1006    0
1229    0
316     0
Name: Pasien, Length: 296, dtype: int64 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]



In [184]:
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])

In [185]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print(test[category].head())
    print(prediction[-len(test[category].head()) : ])
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print()

... Processing Pasien
661    0
274    0
394    0
218    0
922    0
Name: Pasien, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.8682432432432432

... Processing Usia pasien
661    0
274    0
394    0
218    0
922    0
Name: Usia pasien, dtype: int64
[0 0 1 0 0]
Test accuracy is 0.9763513513513513

... Processing Penyakit
661    0
274    0
394    1
218    1
922    0
Name: Penyakit, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.9391891891891891

... Processing Gejala
661    0
274    0
394    0
218    0
922    0
Name: Gejala, dtype: int64
[0 0 0 1 0]
Test accuracy is 0.9256756756756757

... Processing Kapan
661    0
274    0
394    0
218    0
922    0
Name: Kapan, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.9831081081081081

... Processing Periode
661    0
274    0
394    0
218    0
922    0
Name: Periode, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.9763513513513513

... Processing Tindakan
661    0
274    0
394    1
218    0
922    1
Name: Tindakan, dtype: int64
[1 0 0 0 0]
Test accuracy is 0



In [186]:
LogReg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])

In [187]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    print(test[category].head())
    print(prediction[-len(test[category].head()) : ])
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print()

... Processing Pasien
661    0
274    0
394    0
218    0
922    0
Name: Pasien, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.8648648648648649

... Processing Usia pasien
661    0
274    0
394    0
218    0
922    0
Name: Usia pasien, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.9493243243243243

... Processing Penyakit
661    0
274    0
394    1
218    1
922    0
Name: Penyakit, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.902027027027027

... Processing Gejala
661    0
274    0
394    0
218    0
922    0
Name: Gejala, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.8817567567567568

... Processing Kapan
661    0
274    0
394    0
218    0
922    0
Name: Kapan, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.9763513513513513

... Processing Periode
661    0
274    0
394    0
218    0
922    0
Name: Periode, dtype: int64
[0 0 0 0 0]
Test accuracy is 0.972972972972973

... Processing Tindakan
661    0
274    0
394    1
218    0
922    1
Name: Tindakan, dtype: int64
[1 0 0 0 0]
Test accuracy is 0.8

