In [297]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit

In [298]:
df = pd.read_csv('dataset_with_split.csv', encoding='utf8')

In [299]:
df.head()

Unnamed: 0,id,text,annotator,annotation_id,created_at,updated_at,lead_time,Pasien,Usia pasien,Penyakit,...,Pertanyaan,Pembuka,Penyebab,Prakondisi,Objek,Penutup,Referensi,Artikel,Pengukuran,Predikat
0,821-1-0,P,1,821,2022-11-16T05:55:53.144042Z,2022-11-16T05:55:53.144067Z,1.258,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,816-1-0,Salam,1,816,2022-11-15T13:58:27.795105Z,2022-11-15T13:58:27.795139Z,17.363,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,815-1-0,Semoga membantu ya,1,815,2022-11-15T13:58:09.067035Z,2022-11-15T13:58:09.067086Z,6.97,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,814-1-0,Hindari konsumsi Alkohol dan rokok,1,814,2022-11-15T13:57:53.571838Z,2022-11-15T13:58:00.947210Z,6.827,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,813-1-0,Olahraga rutin,1,813,2022-11-15T13:57:50.069477Z,2022-11-15T13:57:50.069509Z,1.72,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [300]:
df.columns

Index(['id', 'text', 'annotator', 'annotation_id', 'created_at', 'updated_at',
       'lead_time', 'Pasien', 'Usia pasien', 'Penyakit', 'Gejala', 'Kapan',
       'Periode', 'Tindakan', 'Outcome', 'Pertanyaan', 'Pembuka', 'Penyebab',
       'Prakondisi', 'Objek', 'Penutup', 'Referensi', 'Artikel', 'Pengukuran',
       'Predikat'],
      dtype='object')

In [301]:
categories = ['Pasien', 'Usia pasien', 'Penyakit', 'Gejala', 'Kapan',
              'Periode', 'Tindakan', 'Outcome', 'Pertanyaan', 'Pembuka', 'Penyebab',
              'Prakondisi', 'Objek', 'Penutup', 'Referensi', 'Artikel', 'Pengukuran',
              'Predikat']

# next iterasi bisa coba pakai stratified shuffle split (hlm. 55)
# split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.33)
# for train_index, test_index in split.split()
train, test = train_test_split(df, random_state=42, test_size=0.2, shuffle=True)
X_train = train.text
X_test = test.text
y_train = train[categories]
y_test = test[categories]
print(X_train.shape)
print(X_test.shape)

(1183,)
(296,)


In [302]:
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier
        (
        MultinomialNB(fit_prior=True, class_prior=None)
    )
     )
])

In [303]:
NB_pipeline.fit(X_train, y_train)
prediction = NB_pipeline.predict(X_test)



In [304]:
print('precision score :' , precision_score(prediction, y_test, average='weighted', zero_division=0))
print('recall score :', recall_score(prediction, y_test, average='weighted', zero_division=0))
print('f1 score :',f1_score(prediction, y_test, average='weighted', zero_division=0))

precision score : 0.39766806130227683
recall score : 0.9347826086956522
f1 score : 0.5512729071261449


In [305]:
NB_pipeline.predict(["ADHD"])==1

array([[False, False,  True, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False]])

In [306]:
categories

['Pasien',
 'Usia pasien',
 'Penyakit',
 'Gejala',
 'Kapan',
 'Periode',
 'Tindakan',
 'Outcome',
 'Pertanyaan',
 'Pembuka',
 'Penyebab',
 'Prakondisi',
 'Objek',
 'Penutup',
 'Referensi',
 'Artikel',
 'Pengukuran',
 'Predikat']

In [307]:
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])

In [308]:
SVC_pipeline.fit(X_train, y_train)
prediction = SVC_pipeline.predict(X_test)



In [309]:
print('precision score :' , precision_score(prediction, y_test, average='weighted', zero_division=0))
print('recall score :', recall_score(prediction, y_test, average='weighted', zero_division=0))
print('f1 score :',f1_score(prediction, y_test, average='weighted', zero_division=0))

precision score : 0.6398239684526112
recall score : 0.7847682119205298
f1 score : 0.695571392446023


In [310]:
LogReg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])

In [311]:
LogReg_pipeline.fit(X_train, y_train)
prediction = LogReg_pipeline.predict(X_test)



In [312]:
print('precision score :' , precision_score(prediction, y_test, average='weighted', zero_division=0))
print('recall score :', recall_score(prediction, y_test, average='weighted', zero_division=0))
print('f1 score :',f1_score(prediction, y_test, average='weighted', zero_division=0))

precision score : 0.4970203407204738
recall score : 0.9090909090909091
f1 score : 0.6319075905602703
