In [None]:
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

stop_words = set(stopwords.words('english'))


In [None]:
# Read the dataset from a CSV file in the working directory, decoded as UTF-8.
df = pd.read_csv(
    'dataset_with_split.csv',
    encoding='utf8',
)

In [None]:
# Preview only the first rows; rendering the whole frame bloats the notebook
# output and can expose more raw text than a sanity check needs.
df.head()

In [None]:
# Show the column labels of the loaded frame (quick check of what is available).
df.columns

In [None]:
# Label columns; the classifier cells further down fit and score one model per entry.
categories = [
    'Pasien',
    'Usia pasien',
    'Penyakit',
    'Gejala',
    'Kapan',
    'Periode',
    'Tindakan',
    'Outcome',
    'Pertanyaan',
    'Pembuka',
    'Penyebab',
    'Prakondisi',
    'Objek',
    'Penutup',
    'Referensi',
    'Artikel',
    'Pengukuran',
    'Predikat',
]

# TODO: a later iteration could try a stratified shuffle split (p. 55), e.g.
# split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.33)
# for train_index, test_index in split.split()

# Shuffled hold-out split with a fixed seed: 20% of the rows go to the test set.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# The `text` column is the model input for both splits.
X_train = train['text']
X_test = test['text']

# Report the size of each split, one shape per line.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Naive Bayes text classifier: TF-IDF features feeding a one-vs-rest wrapper
# around a multinomial Naive Bayes estimator.
NB_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    (
        'clf',
        OneVsRestClassifier(
            estimator=MultinomialNB(fit_prior=True, class_prior=None),
        ),
    ),
])

In [None]:
# Fit and score the Naive Bayes pipeline independently for every label column.
for category in categories:
    print('... Processing {}'.format(category))
    # Fit the whole pipeline (vectorizer + classifier) on the raw training text
    # against this category's labels.
    NB_pipeline.fit(X_train, train[category])
    # Predict on the held-out text; the result is positional, in X_test row order.
    prediction = NB_pipeline.predict(X_test)
    # Show the first few true labels together with the predictions for the SAME
    # rows. (The previous slice took the *last* rows of `prediction`, so the two
    # printouts described different samples.)
    actual_head = test[category].head()
    print(actual_head)
    print(prediction[:len(actual_head)])
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

    # NOTE(review): extra metrics were disabled in the original cell; with their
    # default averaging they presumably require binary labels -- confirm before enabling.
    # print('Test precision is {}'.format(precision_score(test[category], prediction, zero_division=0)))
    # print('Test recall is {}'.format(recall_score(test[category], prediction, zero_division=0)))
    # print('Test f1_score is {}'.format(f1_score(test[category], prediction, zero_division=0)))
    print()

In [None]:
# Closer look at a single label column: accuracy plus the confusion matrix.
# `confusion_matrix` is imported with the other dependencies in the first cell.
category = 'Pasien'

print('... Processing {}'.format(category))
# Fit the whole pipeline (vectorizer + classifier) on the raw training text
# against this category's labels.
NB_pipeline.fit(X_train, train[category])
# Predict on the held-out text; the result is positional, in X_test row order.
prediction = NB_pipeline.predict(X_test)
# Show the first few true labels together with the predictions for the SAME
# rows. (The previous slice took the *last* rows of `prediction`, so the two
# printouts described different samples.)
actual_head = test[category].head()
print(actual_head)
print(prediction[:len(actual_head)])
print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(test[category], prediction))

# NOTE(review): extra metrics were disabled in the original cell; with their
# default averaging they presumably require binary labels -- confirm before enabling.
# print('Test precision is {}'.format(precision_score(test[category], prediction, zero_division=0)))
# print('Test recall is {}'.format(recall_score(test[category], prediction, zero_division=0)))
# print('Test f1_score is {}'.format(f1_score(test[category], prediction, zero_division=0)))
print()

In [None]:
# Linear SVM text classifier: TF-IDF features (with the NLTK English stop words
# removed) feeding a one-vs-rest wrapper around LinearSVC.
# NOTE(review): the label names look Indonesian; if the text is as well, an
# English stop-word list will filter very little -- confirm the corpus language.
SVC_pipeline = Pipeline([
    # scikit-learn documents `stop_words` as 'english', a list, or None, and
    # recent releases reject a set at fit time. Sorting gives a deterministic list.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    # Seeded like the train/test split so repeated runs give the same model.
    ('clf', OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=1)),
])

In [None]:
# Fit and score the linear SVM pipeline independently for every label column.
for category in categories:
    print('... Processing {}'.format(category))
    # Fit the whole pipeline (vectorizer + classifier) on the raw training text
    # against this category's labels.
    SVC_pipeline.fit(X_train, train[category])
    # Predict on the held-out text; the result is positional, in X_test row order.
    prediction = SVC_pipeline.predict(X_test)
    # Show the first few true labels together with the predictions for the SAME
    # rows. (The previous slice took the *last* rows of `prediction`, so the two
    # printouts described different samples.)
    actual_head = test[category].head()
    print(actual_head)
    print(prediction[:len(actual_head)])
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print()

In [None]:
# Logistic-regression text classifier: TF-IDF features feeding a one-vs-rest
# wrapper around LogisticRegression with the 'sag' solver.
LogReg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # 'sag' is a stochastic solver; seed it (same seed as the train/test split)
    # so the reported accuracies are reproducible across runs.
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=42), n_jobs=1)),
])

In [None]:
# Fit and score the logistic-regression pipeline independently for every label column.
for category in categories:
    print('... Processing {}'.format(category))
    # Fit the whole pipeline (vectorizer + classifier) on the raw training text
    # against this category's labels.
    LogReg_pipeline.fit(X_train, train[category])
    # Predict on the held-out text; the result is positional, in X_test row order.
    prediction = LogReg_pipeline.predict(X_test)
    # Show the first few true labels together with the predictions for the SAME
    # rows. (The previous slice took the *last* rows of `prediction`, so the two
    # printouts described different samples.)
    actual_head = test[category].head()
    print(actual_head)
    print(prediction[:len(actual_head)])
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print()