In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

In [2]:
np.random.seed(500)

## Definisikan Classifier

In [3]:
# Menggunakan SVM dengan kernel linear sebagai classifier
# random_state merupakan pseudo randomization, supaya hasil yang didapat akan tetap sama setiap kali eksperimen diulang
clf = svm.SVC(kernel='linear', random_state=42)

#### opsi lain yang bisa dipertimbangkan:
<code> clf = svm.SVC(kernel='linear', C=0.9, random_state=42) </code>
<blockquote> C = regularization, default=1 </blockquote> 
<code> clf = svm.SVC(C=500.0, kernel='poly', degree=4, coef0=0, gamma=1.) </code>
<blockquote> poly kernel for multiclass labeling </blockquote> 

## Persiapan Input
**Corpus** memuat teks yang sudah dibersihkan (di tahap prapengolahan).
**LabelInset** memuat label teks dengan leksikon `InSet`.
**LabelSenti** memuat label teks dengan leksikon `sentiwords_id` dari sentistrength_id.

In [4]:
import os

# Pastikan untuk mengganti path dengan absolute path direktorimu jika baris berikut dijalankan ulang, atau restart kernel.
os.chdir('output')
base = 'prastyo-sentiment_posneg-clean-slang-stop-dup.txt'
lb_inset = 'prastyo-sentiment_posneg-clean-slang-stop-lb-inset.txt'
lb_senti = 'prastyo-sentiment_posneg-clean-slang-stop-lb-senti.txt'

Corpus = pd.read_csv(base, encoding='latin-1', header=None, sep='\t', names=['text', 'label'], dtype=str)
LabelInset = pd.read_csv(lb_inset, encoding='latin-1', header=None, names=['label'], dtype=str)
LabelSenti = pd.read_csv(lb_senti, encoding='latin-1', header=None, names=['label'], dtype=str)

In [5]:
# Jumlah positif dan negatif dari setiap jenis pelabelan
neg0, pos0 = (Corpus['label'][Corpus['label']=='neg']).count(), (Corpus['label'][Corpus['label']=='pos']).count()
neg1, pos1 = (LabelInset['label'][LabelInset['label']=='neg']).count(), (LabelInset['label'][LabelInset['label']=='pos']).count()
neg2, pos2 = (LabelSenti['label'][LabelSenti['label']=='neg']).count(), (LabelSenti['label'][LabelSenti['label']=='pos']).count()
print('neg:', neg0, '(', '{0:.2f}'.format(neg0/(neg0+pos0)*100), '%)','\t', 'pos:', pos0, '(', '{0:.2f}'.format(pos0/(neg0+pos0)*100),'%)',' | actual label')
print('neg:', neg1, '(', '{0:.2f}'.format(neg1/(neg1+pos1)*100), '%)','\t', 'pos:', pos1, '(', '{0:.2f}'.format(pos1/(neg1+pos1)*100),'%)',' | inset')
print('neg:', neg2, '(', '{0:.2f}'.format(neg2/(neg2+pos2)*100), '%)','\t', 'pos:', pos2, '(', '{0:.2f}'.format(pos2/(neg2+pos2)*100),'%)',' | senti')

neg: 900 ( 54.22 %) 	 pos: 760 ( 45.78 %)  | actual label
neg: 1197 ( 72.11 %) 	 pos: 463 ( 27.89 %)  | inset
neg: 1114 ( 67.11 %) 	 pos: 546 ( 32.89 %)  | senti


### **\*Perhatian:** pilih salah satu jenis label sebagai *baseline* untuk proses selanjutnya
`LLmark` akan digunakan nanti sebagai pembeda nama file saat menyimpan skor akurasi ke file

In [6]:
## Menggunakan label aktual sebagai baseline terhadap dirinya sendiri
# LL = Corpus[['label']]
# LLmark = 0

## Menggunakan pelabelan dari InSet
LL = LabelInset
LLmark = 1

## Menggunakan pelabelan dari sentiwords_id
# LL = LabelSenti
# LLmark = 2

In [7]:
print(Corpus[:3], '\n\n', LL[:3])

                                                text label
0  ya utang pemerintah utang bangsa indonesia hut...   neg
1  yuk kawal kebijakan pemerintah disalah oknum b...   pos
2  yuk bahu membahu membantuu pemerintah memutus ...   pos 

   label
0   neg
1   pos
2   pos


## Tokenisasi Teks

In [8]:
# Step - a : Menghapus baris kosong, jika ada.
Corpus['text'].dropna(inplace=True)
# # Step - b : Mengganti semua teks ke karakter kecil karena 'oke' dan 'OKE' diinterpretasikan berbeda
# Corpus['text'] = [entry.lower() for entry in Corpus['text']] # we've done this in '[1] text cleaning.ipynb'
# Step - c : Tokenisasi : Setiap kalimat di dalam korpus akan dipecah menjadi daftar kata/string
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

for index,entry in enumerate(Corpus['text']):
    # Mendeklarasikan list kosong untuk menyimpan daftar kata yang sesuai dengan aturan yang dibuat
    Final_words = []
    for word in entry:
        # Kondisi di bawah adalah untuk mengecek/mempertimbangkan alfabet saja
        if word.isalpha():
            word_Final = word
            Final_words.append(word_Final)
    Corpus.loc[index,'text_final'] = str(Final_words)

<blockquote>Ref: <i>https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34</i></blockquote>

In [9]:
print(Corpus[:3])

                                                text label  \
0  [ya, utang, pemerintah, utang, bangsa, indones...   neg   
1  [yuk, kawal, kebijakan, pemerintah, disalah, o...   pos   
2  [yuk, bahu, membahu, membantuu, pemerintah, me...   pos   

                                          text_final  
0  ['ya', 'utang', 'pemerintah', 'utang', 'bangsa...  
1  ['yuk', 'kawal', 'kebijakan', 'pemerintah', 'd...  
2  ['yuk', 'bahu', 'membahu', 'membantuu', 'pemer...  


## Split Data

In [10]:
# Membagi set data latih dan data uji dengan rasio 70:30
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],LL['label'],test_size=0.3, random_state=42)
Train_Y_Actual, Test_Y_Actual = model_selection.train_test_split(Corpus['label'],test_size=0.3, random_state=42)

In [11]:
print(Train_X.size, Train_X.size/(Test_X.size+Train_X.size),'%','\n',
      Test_X.size, Test_X.size/(Test_X.size+Train_X.size),'%')

1162 0.7 % 
 498 0.3 %


In [12]:
# Encoding label menjadi nilai antara 0 and kelas_n-1
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
Train_Y_Actual = Encoder.fit_transform(Train_Y_Actual)
Test_Y_Actual = Encoder.fit_transform(Test_Y_Actual)

In [13]:
# print('TRAIN_X'+'\n', Train_X, '\n')
# print('TEST_X'+'\n', Test_X, '\n')
# print('TRAIN_Y'+'\n', Train_Y, '\n')
# print('TEST_Y'+'\n', Test_Y, '\n')
# # with np.printoptions():
# #     print(Test_X[:17])

# EKSTRAKSI FITUR: *Term presence*

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True artinya tidak mempertimbangkan frekuensi
vectorizerTP = CountVectorizer(binary=True)
X = vectorizerTP.fit_transform(Corpus['text_final'])

In [15]:
# # print 16 nama fitur pertama dan terakhir
# print(vectorizerTP.get_feature_names()[:16],'...',
#       vectorizerTP.get_feature_names()[-16:])
# # print 16 vektor term presence pertama dan terakhir untuk 6 baris/kalimat
# with np.printoptions(edgeitems=16):
#     print(X.toarray()[:6])

# # print(X.shape, type(X))
# # # Jika kita ingin melihat vektor dari suatu kata:
# # print('Vector abai: ')
# # with np.printoptions(edgeitems=10):
# #     print(X.transform(['abai']).toarray())

# # print(vectorizer.vocabulary_)
# import reprlib
# print(reprlib.repr(vectorizerTP.vocabulary_))

In [16]:
# Transform Train_X dan Test_X ke vektor term presence
Train_X_TP = vectorizerTP.transform(Train_X)
Test_X_TP = vectorizerTP.transform(Test_X)

In [17]:
# print(Train_X_TP)

### KLASIFIKASI dengan *term presence*

In [18]:
# fitting data latih pada classifier
clf.fit(Train_X_TP,Train_Y)
# memprediksi label pada set data uji
predictions_SVM_TP = clf.predict(Test_X_TP)

# Menggunakan fungsi accuracy_score untuk mendapat nilai akurasi
accuracy_tp = accuracy_score(Test_Y_Actual, predictions_SVM_TP)*100
print('SVM Accuracy Score -> ', accuracy_tp)

SVM Accuracy Score ->  60.44176706827309


In [19]:
# # Membandingkan Nilai Leksikon dengan Nilai Prediksi
# df = pd.DataFrame({'Lexicon Values':Test_Y, 'Predicted Values':predictions_SVM_TP})
# df

In [20]:
# print(predictions_SVM_TP)

# EKSTRAKSI FITUR: *BoW*

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(Corpus['text_final'])

In [22]:
# # print 16 nama fitur pertama dan terakhir
# print(vectorizer.get_feature_names()[:16],'...',
#       vectorizer.get_feature_names()[-16:])
# # print 16 vektor BoW pertama dan terakhir untuk 6 baris/kalimat
# with np.printoptions(edgeitems=16):
#     print(X.toarray()[:6])

# print(X.shape, type(X))
# # # Jika kita ingin melihat vektor dari suatu kata:
# # print('Vector abai: ')
# # with np.printoptions(edgeitems=10):
# #     print(X.transform(['abai']).toarray())

# # print(vectorizer.vocabulary_)
# import reprlib
# print(reprlib.repr(vectorizer.vocabulary_))

In [23]:
# Transform Train_X dan Test_X ke vektor BoW
Train_X_BoW = vectorizer.transform(Train_X)
Test_X_BoW = vectorizer.transform(Test_X)

In [24]:
# print(Train_X_BoW)

### KLASIFIKASI dengan *BoW*

In [25]:
# fitting data latih pada classifier
clf.fit(Train_X_BoW,Train_Y)
# memprediksi label pada set data uji
predictions_SVM_BoW = clf.predict(Test_X_BoW)

# Menggunakan fungsi accuracy_score untuk mendapat nilai akurasi
accuracy_bow = accuracy_score(Test_Y_Actual, predictions_SVM_BoW)*100
print('SVM Accuracy Score -> ',accuracy_bow)

SVM Accuracy Score ->  59.63855421686747


In [26]:
# print(predictions_SVM_BoW)

# EKSTRAKSI FITUR: *TF-IDF*

In [27]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(Corpus['text_final'])

X = Tfidf_vect.fit_transform(Corpus['text_final'])

#### opsi lain yang bisa dipertimbangkan:

<code> Tfidf_vect = TfidfVectorizer(max_features=None).fit(Corpus['text_final']) <code>
<code> Tfidf_vect = TfidfVectorizer(max_features=5000).fit(Corpus['text_final']) <code>
<code> Tfidf_vect = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, \
                    use_idf=True).fit(Corpus['text_final']) <code>

In [28]:
# # print 16 nama fitur pertama dan terakhir
# print(Tfidf_vect.get_feature_names()[:16],'\n',
#       Tfidf_vect.get_feature_names()[-16:])
# # print 16 vektor TF-IDF pertama dan terakhir untuk 6 baris/kalimat
# with np.printoptions(edgeitems=16):
#     print(X.toarray()[:6])

# # print(Tfidf_vect.vocabulary_)
# import reprlib
# print(reprlib.repr(Tfidf_vect.vocabulary_))

# # # Jika kita ingin melihat vektor dari suatu kata:
# # # sebagai contoh kata di array[22]:
# # val = list(Tfidf_vect.vocabulary_)[22]
# # print(val)

In [29]:
# Transform Train_X dan Test_X ke vektor TF-IDF
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [30]:
# print(Train_X_Tfidf)

### KLASIFIKASI dengan TF-IDF

In [31]:
# fitting data latih pada classifier
clf.fit(Train_X_Tfidf,Train_Y)
# memprediksi label pada set data uji
predictions_SVM_Tfidf = clf.predict(Test_X_Tfidf)

# Menggunakan fungsi accuracy_score untuk mendapat nilai akurasi
accuracy_tfidf = accuracy_score(Test_Y_Actual, predictions_SVM_Tfidf)*100
print('SVM Accuracy Score -> ',accuracy_tfidf)

SVM Accuracy Score ->  62.65060240963856


In [32]:
# # Simpan skor akurasi ke file
# if LLmark == 1:
#     output = 'svm_acc_lb1.txt'
#     with open(output, 'w') as f:
#         f.write(str(accuracy_tp)+str('\n')+str(accuracy_bow)+str('\n')+str(accuracy_tfidf))
# elif LLmark == 2:
#     output = 'svm_acc_lb2.txt'
#     with open(output, 'w') as f:
#         f.write(str(accuracy_tp)+str('\n')+str(accuracy_bow)+str('\n')+str(accuracy_tfidf))
# else:
#     output = 'svm_acc_lb0.txt'
#     with open(output, 'w') as f:
#         f.write(str(accuracy_tp)+str('\n')+str(accuracy_bow)+str('\n')+str(accuracy_tfidf))

In [33]:
# print(predictions_SVM_Tfidf)

#### **Gambaran Data:**

In [34]:
# Gambaran rasio data latih dan data uji dari teks awal dan dari vektor ekstraksi fitur.
print('X:', Corpus['text'].size,
      '\nTrain\tTest\t\t%train\t%test\n',
#       Train_Y.size, '\t', Test_Y.size, '\ty\t', '{:.2f}'.format(Train_Y.size/(Test_Y.size+Train_Y.size)*100), '\t', '{:.2f}'.format(Test_Y.size/(Test_Y.size+Train_Y.size)*100), '\n',
      Train_X.size, '\t', Test_X.size, '\tX\t', '{:.2f}'.format(Train_X.size/(Test_X.size+Train_X.size)*100), '\t', '{:.2f}'.format(Test_X.size/(Test_X.size+Train_X.size)*100), '\n',
      Train_X_TP.size, '\t', Test_X_TP.size, '\tTP\t', '{:.2f}'.format(Train_X_TP.size/(Test_X_TP.size+Train_X_TP.size)*100), '\t', '{:.2f}'.format(Test_X_TP.size/(Test_X_TP.size+Train_X_TP.size)*100), '\n',
      Train_X_BoW.size, '\t', Test_X_BoW.size, '\tBoWs\t', '{:.2f}'.format(Train_X_BoW.size/(Test_X_BoW.size+Train_X_BoW.size)*100), '\t', '{:.2f}'.format(Test_X_BoW.size/(Test_X_BoW.size+Train_X_BoW.size)*100), '\n',
      Train_X_Tfidf.size, '\t', Test_X_Tfidf.size, '\tTF-IDF\t', '{:.2f}'.format(Train_X_Tfidf.size/(Test_X_Tfidf.size+Train_X_Tfidf.size)*100), '\t', '{:.2f}'.format(Test_X_Tfidf.size/(Test_X_Tfidf.size+Train_X_Tfidf.size)*100))

# Gambaran jumlah kelas negatif dan positif dari data uji dengan label aktual, label leksikon, and label prediksi;
# Label aktual dan label leksikon disimbolkan sebagai 'Test_Y_Actual' dan 'Test_Y';
# Jika kamu memakai label aktual sebagai baseline, maka label leksikon di sini adalah label aktual itu sendiri.
print('\nneg\tpos\t\tsum\n',
      (Test_Y_Actual==0).sum(), '\t', (Test_Y_Actual==1).sum(), '\t', Test_Y_Actual.size, '\tTest_Y_Actual\n',
      (Test_Y==0).sum(), '\t', (Test_Y==1).sum(), '\t', Test_Y.size, '\tTest_Y\n',
      (predictions_SVM_TP==0).sum(),    '\t', (predictions_SVM_TP==1).sum(),    '\t', predictions_SVM_TP.size,    '\tP_TP\n',
      (predictions_SVM_BoW==0).sum(),   '\t', (predictions_SVM_BoW==1).sum(),   '\t', predictions_SVM_BoW.size,   '\tP_BoWs\n',
      (predictions_SVM_Tfidf==0).sum(), '\t', (predictions_SVM_Tfidf==1).sum(), '\t', predictions_SVM_Tfidf.size, '\tP_Tfidf')

X: 1660 
Train	Test		%train	%test
 1162 	 498 	X	 70.00 	 30.00 
 19529 	 8560 	TP	 69.53 	 30.47 
 19529 	 8560 	BoWs	 69.53 	 30.47 
 19529 	 8560 	TF-IDF	 69.53 	 30.47

neg	pos		sum
 301 	 197 	 498 	Test_Y_Actual
 347 	 151 	 498 	Test_Y
 394 	 104 	 498 	P_TP
 384 	 114 	 498 	P_BoWs
 449 	 49 	 498 	P_Tfidf


# EVALUASI / VALIDASI 

## Confusion Matrix

In [35]:
# Membuat confusion matrix dari label prediksi terhadap label aktual
from sklearn.metrics import confusion_matrix


y_true = Test_Y_Actual

## Term presence ##
print('Confusion Matrix - Term presence')
y_pred = predictions_SVM_TP
conf_matrix = confusion_matrix(y_true, y_pred, labels=[1,0])
print(conf_matrix, conf_matrix.sum())

## BoW ##
print('\nConfusion Matrix - BoW')
y_pred = predictions_SVM_BoW
conf_matrix = confusion_matrix(y_true, y_pred, labels=[1,0])
print(conf_matrix, conf_matrix.sum())

## TF-IDF ##
print('\nConfusion Matrix - TF-IDF')
y_pred = predictions_SVM_Tfidf
conf_matrix = confusion_matrix(y_true, y_pred, labels=[1,0])
print(conf_matrix, conf_matrix.sum())

Confusion Matrix - Term presence
[[ 52 145]
 [ 52 249]] 498

Confusion Matrix - BoW
[[ 55 142]
 [ 59 242]] 498

Confusion Matrix - TF-IDF
[[ 30 167]
 [ 19 282]] 498


## Classification Report: dengan *imbalanced data*
Karena kelas kata (positif & negatif) di semua jenis label—*actual label*, *label by InSet*, maupun *label by sentiwords_id*—tidak terdistribusi secara berimbang, data kita memuat *imbalanced class*. Ini bisa menyebabkan misklasifikasi pada model yang kita buat, menyebabkan penilaian yang tidak akurat. Kita akan coba melatih data *imbalance* ini dan mengevaluasinya nanti.

In [36]:
# Membuat classification report dengan imbalanced data
from sklearn.metrics import classification_report


## Term presence ##
print('Imbalanced data - Term presence\n',
      classification_report(Test_Y_Actual, predictions_SVM_TP))
## BoW ##
print('Imbalanced data - BoW\n',
      classification_report(Test_Y_Actual, predictions_SVM_BoW))
## TF-IDF ##
print('Imbalanced data - TF-IDF\n',
      classification_report(Test_Y_Actual, predictions_SVM_Tfidf))

Imbalanced data - Term presence
               precision    recall  f1-score   support

           0       0.63      0.83      0.72       301
           1       0.50      0.26      0.35       197

    accuracy                           0.60       498
   macro avg       0.57      0.55      0.53       498
weighted avg       0.58      0.60      0.57       498

Imbalanced data - BoW
               precision    recall  f1-score   support

           0       0.63      0.80      0.71       301
           1       0.48      0.28      0.35       197

    accuracy                           0.60       498
   macro avg       0.56      0.54      0.53       498
weighted avg       0.57      0.60      0.57       498

Imbalanced data - TF-IDF
               precision    recall  f1-score   support

           0       0.63      0.94      0.75       301
           1       0.61      0.15      0.24       197

    accuracy                           0.63       498
   macro avg       0.62      0.54      0.50   

#### **\*catatan:**
Kode di bawah memiliki fungsi yang sama dengan kode di atas. Kode ini sengaja tetap disimpan untuk menunjukkan proses asli yang perlu dikerjakan jika kita tidak membuat variabel tersendiri dari hasil prediksi.

In [37]:
# # Make classification report using 'imbalanced' data
# from sklearn.metrics import classification_report


# ## Term presence ##
# X_train = Train_X_TP
# X_test = Test_X_TP
# clf.fit(X_train, Train_Y)
# print('Imbalanced data - Term presence\n',
#       classification_report(Test_Y_Actual, clf.predict(X_test)))

# ## BoW ##
# X_train = Train_X_BoW
# X_test = Test_X_BoW
# clf.fit(X_train, Train_Y)
# print('Imbalanced data - BoW\n',
#       classification_report(Test_Y_Actual, clf.predict(X_test)))

# ## TF-IDF ##
# X_train = Train_X_Tfidf
# X_test = Test_X_Tfidf
# clf.fit(X_train, Train_Y)
# print('Imbalanced data - TF-IDF\n',
#       classification_report(Test_Y_Actual, clf.predict(X_test)))

## Classification Report: dengan *oversampled data*
Di sini, kita akan melatih *imbalanced data* kita dengan metode **oversampling** dan mengevaluasinya.

In [38]:
# Fungsi untuk menyimpan skor akurasi ke file
def acc_oversampled(LLmark, accuracy):
    if LLmark == 1:
        output = 'svm_acc_o_lb1.txt'
        with open(output, 'a') as f:
            f.write(str(accuracy)+str('\n'))
    elif LLmark == 2:
        output = 'svm_acc_o_lb2.txt'
        with open(output, 'a') as f:
            f.write(str(accuracy)+str('\n'))
    else:
        output = 'svm_acc_o_lb0.txt'
        with open(output, 'a') as f:
            f.write(str(accuracy)+str('\n'))

In [39]:
# Membuat classification report dengan 'oversampled' data
from imblearn.over_sampling import SVMSMOTE


# svmsmote = SVMSMOTE(random_state=None)
svmsmote = SVMSMOTE(random_state = 500)

y_train = Train_Y
y_test = Test_Y_Actual

## Term presence ##
X_train = Train_X_TP
X_test = Test_X_TP
X_oversample_svm, y_oversample_svm = svmsmote.fit_resample(X_train, y_train)
# melatih classifier dengan oversampled data menggunakan borderline-SMOTE SVM (SVM SMOTE)
clf.fit(X_oversample_svm, y_oversample_svm)
# acc_oversampled(LLmark, accuracy_score(y_test, clf.predict(X_test))*100) # print to file
print('Oversampled data - Term presence\n', classification_report(y_test, clf.predict(X_test)))

## BoW ##
X_train = Train_X_BoW
X_test = Test_X_BoW
X_oversample_svm, y_oversample_svm = svmsmote.fit_resample(X_train, y_train)
# melatih classifier dengan oversampled data menggunakan borderline-SMOTE SVM (SVM SMOTE)
clf.fit(X_oversample_svm, y_oversample_svm)
# acc_oversampled(LLmark, accuracy_score(y_test, clf.predict(X_test))*100) # print to file
print('Oversampled data - BoW\n', classification_report(y_test, clf.predict(X_test)))

## TF-IDF ##
X_train = Train_X_Tfidf
X_test = Test_X_Tfidf
X_oversample_svm, y_oversample_svm = svmsmote.fit_resample(X_train, y_train)
# melatih classifier dengan oversampled data menggunakan borderline-SMOTE SVM (SVM SMOTE)
clf.fit(X_oversample_svm, y_oversample_svm)
# acc_oversampled(LLmark, accuracy_score(y_test, clf.predict(X_test))*100) # print to file
print('Oversampled data - TF-IDF\n', classification_report(y_test, clf.predict(X_test)))

Oversampled data - Term presence
               precision    recall  f1-score   support

           0       0.62      0.71      0.66       301
           1       0.43      0.32      0.37       197

    accuracy                           0.56       498
   macro avg       0.52      0.52      0.52       498
weighted avg       0.54      0.56      0.55       498

Oversampled data - BoW
               precision    recall  f1-score   support

           0       0.62      0.70      0.66       301
           1       0.42      0.33      0.37       197

    accuracy                           0.55       498
   macro avg       0.52      0.52      0.51       498
weighted avg       0.54      0.55      0.54       498

Oversampled data - TF-IDF
               precision    recall  f1-score   support

           0       0.63      0.88      0.74       301
           1       0.55      0.22      0.31       197

    accuracy                           0.62       498
   macro avg       0.59      0.55      0.53

<blockquote><i>"The purpose of oversampling is ... to have a better prediction model. This technique was not created for any analysis purposes as every data created is synthetic, so that is a reminder."</i></blockquote>

<blockquote><i>"... <b>you should only oversample your training data and not the whole data</b> except if you would use the entire data as your training data. <b>In case you want to split the data, you should split the data first</b> before oversampled the training data."</i></blockquote>

<blockquote>Ref: <i>https://towardsdatascience.com/5-smote-techniques-for-oversampling-your-imbalance-data-b8155bdbe2b5?gi=67231aa6fa80</i></blockquote>

## Validasi dengan k-Fold cv
Ubah nilai `n_splits` sesuai dengan kebutuhan. Sebagai contoh, 5 splits berarti bahwa data (X dan y oversample) dibagi menjadi *4 porsi* untuk set latih baru dan *1 porsi* untuk set uji baru. Jika `shuffle` bernilai True, maka validasi silang dilakukan dengan kombinasi data yang berbeda di setiap iterasi. Kemudian validasi silang dilakukan dalam 5 iterasi.

In [40]:
# Simpan skor akurasi ke file
def acc_oversampled(LLmark, featExt, accuracy):
    if LLmark == 1:
        output = 'svm_acc_ov_lb1_'+str(featExt)+'_kfold.txt'
        with open(output, 'a') as f:
            f.write(str(accuracy)+'\n')
    elif LLmark == 2:
        output = 'svm_acc_ov_lb2_'+str(featExt)+'_kfold.txt'
        with open(output, 'a') as f:
            f.write(str(accuracy)+'\n')
    else:
        output = 'svm_acc_ov_lb0_'+str(featExt)+'_kfold.txt'
        with open(output, 'a') as f:
            f.write(str(accuracy)+'\n')

In [41]:
# Simpan nilai presisi, recall, dan f1-score ke file
def cr_oversampled(LLmark, featExt, precision, recall, f1):
    if LLmark == 1:
        output = 'svm_cr_ov_lb1_'+str(featExt)+'_kfold.txt'
        with open(output, 'a') as f:
            f.write(str(precision)+'\t'+str(recall)+'\t'+str(f1)+'\n')
    elif LLmark == 2:
        output = 'svm_cr_ov_lb2_'+str(featExt)+'_kfold.txt'
        with open(output, 'a') as f:
            f.write(str(precision)+'\t'+str(recall)+'\t'+str(f1)+'\n')
    else:
        output = 'svm_cr_ov_lb0_'+str(featExt)+'_kfold.txt'
        with open(output, 'a') as f:
            f.write(str(precision)+'\t'+str(recall)+'\t'+str(f1)+'\n')

In [42]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import KFold

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

y_true = Corpus[['label']]
y_true = Encoder.fit_transform(y_true)
y = LL
y = Encoder.fit_transform(y)

  return f(*args, **kwargs)


#### **Step 1:** Validasi dengan `term presence` sebagai metode ekstraksi fitur. Lalu, meng-**oversample** model di setiap iterasi k-Fold dengan `SVM SMOTE`

In [43]:
featExt = 'tp'

X = vectorizerTP.fit_transform(Corpus['text_final'])
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    y_true_train, y_true_test = y_true[train_index], y_true[test_index]
    X_train_oversampled, y_train_oversampled = svmsmote.fit_resample(X_train, y_train.ravel())
    
    clf.fit(X_train_oversampled, y_train_oversampled)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_true_test, y_pred)
    precision = precision_score(y_true_test, y_pred)
    recall = recall_score(y_true_test, y_pred)
    f1 = f1_score(y_true_test, y_pred)
    
    #print to file
#     acc_oversampled(LLmark, featExt, accuracy)
#     cr_oversampled(LLmark, featExt, precision, recall, f1)

    print(f'# Fold {fold}:')
    print(classification_report(y_true_test, y_pred), "\n")
#     print(f'accuracy: {accuracy}')
#     print(f'precision: {precision}')
#     print(f'recall: {recall}')
#     print(f'f-score: {f1}')

# Fold 1:
              precision    recall  f1-score   support

           0       0.60      0.67      0.63       204
           1       0.36      0.30      0.33       128

    accuracy                           0.53       332
   macro avg       0.48      0.49      0.48       332
weighted avg       0.51      0.53      0.52       332
 

# Fold 2:
              precision    recall  f1-score   support

           0       0.57      0.73      0.64       177
           1       0.54      0.36      0.43       155

    accuracy                           0.56       332
   macro avg       0.56      0.55      0.54       332
weighted avg       0.56      0.56      0.54       332
 

# Fold 3:
              precision    recall  f1-score   support

           0       0.58      0.70      0.64       178
           1       0.55      0.42      0.48       154

    accuracy                           0.57       332
   macro avg       0.57      0.56      0.56       332
weighted avg       0.57      0.57      0

#### **Step 2:** Validasi dengan `BoW` sebagai metode ekstraksi fitur. Lalu, meng-**oversample** model di setiap iterasi k-Fold dengan `SVM SMOTE`

In [44]:
featExt = 'bow'

X = vectorizer.fit_transform(Corpus['text_final'])
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    y_true_train, y_true_test = y_true[train_index], y_true[test_index]
    X_train_oversampled, y_train_oversampled = svmsmote.fit_resample(X_train, y_train.ravel())
    
    clf.fit(X_train_oversampled, y_train_oversampled)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_true_test, y_pred)
    precision = precision_score(y_true_test, y_pred)
    recall = recall_score(y_true_test, y_pred)
    f1 = f1_score(y_true_test, y_pred)

    #print to file
#     acc_oversampled(LLmark, featExt, accuracy)
#     cr_oversampled(LLmark, featExt, precision, recall, f1)

    print(f'# Fold {fold}:')
    print(classification_report(y_true_test, y_pred), "\n")
#     print(f'accuracy: {accuracy}')
#     print(f'precision: {precision}')
#     print(f'recall: {recall}')
#     print(f'f-score: {f1}')

# Fold 1:
              precision    recall  f1-score   support

           0       0.61      0.67      0.64       204
           1       0.39      0.33      0.35       128

    accuracy                           0.54       332
   macro avg       0.50      0.50      0.50       332
weighted avg       0.53      0.54      0.53       332
 

# Fold 2:
              precision    recall  f1-score   support

           0       0.55      0.69      0.62       177
           1       0.50      0.35      0.42       155

    accuracy                           0.54       332
   macro avg       0.53      0.52      0.52       332
weighted avg       0.53      0.54      0.52       332
 

# Fold 3:
              precision    recall  f1-score   support

           0       0.57      0.69      0.62       178
           1       0.53      0.40      0.46       154

    accuracy                           0.55       332
   macro avg       0.55      0.54      0.54       332
weighted avg       0.55      0.55      0

#### **Step 3:** Validasi dengan `TF-IDF` sebagai metode ekstraksi fitur. Lalu, meng-**oversample** model di setiap iterasi k-Fold dengan `SVM SMOTE`

In [45]:
featExt = 'tfidf'

X = Tfidf_vect.fit_transform(Corpus['text_final'])
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    y_true_train, y_true_test = y_true[train_index], y_true[test_index]
    X_train_oversampled, y_train_oversampled = svmsmote.fit_resample(X_train, y_train.ravel())
    
    clf.fit(X_train_oversampled, y_train_oversampled)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_true_test, y_pred)
    precision = precision_score(y_true_test, y_pred)
    recall = recall_score(y_true_test, y_pred)
    f1 = f1_score(y_true_test, y_pred)

    #print to file
#     acc_oversampled(LLmark, featExt, accuracy)
#     cr_oversampled(LLmark, featExt, precision, recall, f1)

    print(f'# Fold {fold}:')
    print(classification_report(y_true_test, y_pred), "\n")
#     print(f'accuracy: {accuracy}')
#     print(f'precision: {precision}')
#     print(f'recall: {recall}')
#     print(f'f-score: {f1}')

# Fold 1:
              precision    recall  f1-score   support

           0       0.64      0.84      0.72       204
           1       0.48      0.23      0.31       128

    accuracy                           0.61       332
   macro avg       0.56      0.54      0.52       332
weighted avg       0.57      0.61      0.57       332
 

# Fold 2:
              precision    recall  f1-score   support

           0       0.57      0.89      0.69       177
           1       0.64      0.22      0.33       155

    accuracy                           0.58       332
   macro avg       0.60      0.56      0.51       332
weighted avg       0.60      0.58      0.52       332
 

# Fold 3:
              precision    recall  f1-score   support

           0       0.59      0.87      0.70       178
           1       0.67      0.30      0.41       154

    accuracy                           0.61       332
   macro avg       0.63      0.58      0.56       332
weighted avg       0.63      0.61      0

<blockquote>Ref: <i>https://stackoverflow.com/questions/55591063/how-to-perform-smote-with-cross-validation-in-sklearn-in-python</i></blockquote>