In [None]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Load the dataset from a CSV file (path relative to the working directory),
# decoding it as UTF-8.
df = pd.read_csv('dataset_with_split.csv', encoding='utf8')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the column names; the `text` column and the label columns selected
# later in the notebook must all be present here.
df.columns

In [None]:
# Names of the label columns; together they form the multi-output target
# handed to the classifiers (presumably 0/1 indicator columns -- confirm).
categories = [
    'Pasien', 'Usia pasien', 'Penyakit', 'Gejala', 'Kapan', 'Periode',
    'Tindakan', 'Outcome', 'Pertanyaan', 'Pembuka', 'Penyebab', 'Prakondisi',
    'Objek', 'Penutup', 'Referensi', 'Artikel', 'Pengukuran', 'Predikat',
]

# TODO: a later iteration could try a stratified shuffle split instead (p. 55):
#   split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.33)
#   for train_index, test_index in split.split()

# Shuffled 80/20 hold-out split with a fixed seed so the partition is repeatable.
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Features are the raw `text` column; targets are the label columns above.
X_train, X_test = train.text, test.text
y_train, y_test = train[categories], test[categories]

# Report the size of each partition (train first, then test).
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Naive Bayes baseline: TF-IDF features feeding one MultinomialNB per label
# via a one-vs-rest wrapper. The NB arguments are spelled out explicitly
# (class priors are learned from the data, none are supplied).
NB_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])

In [None]:
# Train the Naive Bayes pipeline on the training split, then predict the
# labels of the held-out texts (`fit` returns the fitted pipeline itself).
prediction = NB_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Support-weighted precision / recall / F1 for the Naive Bayes predictions.
# scikit-learn metrics take (y_true, y_pred) in that order: with the arguments
# swapped, per-label precision and recall are exchanged and the 'weighted'
# average is weighted by predicted counts instead of true-label support.
# zero_division=0 scores labels with an undefined metric as 0 without warning.
print('precision score :', precision_score(y_test, prediction, average='weighted', zero_division=0))
print('recall score :', recall_score(y_test, prediction, average='weighted', zero_division=0))
print('f1 score :', f1_score(y_test, prediction, average='weighted', zero_division=0))

In [None]:
# Spot-check a single one-word document: the comparison yields a boolean array
# marking which label outputs equal 1 (column order presumably follows
# `categories`, since the targets were selected with that list -- confirm).
NB_pipeline.predict(["ADHD"])==1

In [None]:
# Display the label names for reference when reading per-label predictions.
categories

In [None]:
# Linear SVM variant: TF-IDF features (with stop-word removal) feeding one
# LinearSVC per label via a one-vs-rest wrapper.
#
# `stop_words` must be 'english', a list, or None; recent scikit-learn releases
# reject a set with InvalidParameterError at fit time, so the module-level NLTK
# set is converted to a sorted list (sorted for a deterministic parameter value).
# NOTE(review): the label names look Indonesian; if the text is Indonesian too,
# an English stop-word list may have little effect -- confirm.
#
# random_state pins LinearSVC's internal randomness so results are repeatable,
# using the same seed value as the train/test split.
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=1)),
])

In [None]:
# Train the linear-SVM pipeline on the training split, then predict the labels
# of the held-out texts (`fit` returns the fitted pipeline itself).
prediction = SVC_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Support-weighted precision / recall / F1 for the predictions currently held
# in `prediction` (the SVC_pipeline output when cells are run in order).
# scikit-learn metrics take (y_true, y_pred) in that order: with the arguments
# swapped, per-label precision and recall are exchanged and the 'weighted'
# average is weighted by predicted counts instead of true-label support.
# zero_division=0 scores labels with an undefined metric as 0 without warning.
print('precision score :', precision_score(y_test, prediction, average='weighted', zero_division=0))
print('recall score :', recall_score(y_test, prediction, average='weighted', zero_division=0))
print('f1 score :', f1_score(y_test, prediction, average='weighted', zero_division=0))

In [None]:
# Logistic-regression variant: TF-IDF features feeding one LogisticRegression
# per label via a one-vs-rest wrapper.
# The 'sag' solver is stochastic, so random_state is fixed to make the fitted
# model (and the reported scores) repeatable across runs; the seed value
# matches the one used for the train/test split.
LogReg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=42), n_jobs=1)),
])

In [None]:
# Train the logistic-regression pipeline on the training split, then predict
# the labels of the held-out texts (`fit` returns the fitted pipeline itself).
prediction = LogReg_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Support-weighted precision / recall / F1 for the predictions currently held
# in `prediction` (the LogReg_pipeline output when cells are run in order).
# scikit-learn metrics take (y_true, y_pred) in that order: with the arguments
# swapped, per-label precision and recall are exchanged and the 'weighted'
# average is weighted by predicted counts instead of true-label support.
# zero_division=0 scores labels with an undefined metric as 0 without warning.
print('precision score :', precision_score(y_test, prediction, average='weighted', zero_division=0))
print('recall score :', recall_score(y_test, prediction, average='weighted', zero_division=0))
print('f1 score :', f1_score(y_test, prediction, average='weighted', zero_division=0))