In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

In [2]:
# Seed NumPy's global random number generator once, before any other work.
np.random.seed(500)

## Define Classifier

In [3]:
# SVM with a linear kernel; this single `clf` object is re-fitted by every
# classification cell further down.
# random_state is fixed so that re-runs give the same (or very close) results.
# NOTE(review): confirm in the SVC documentation which parts of fitting
# random_state actually influences.
clf = svm.SVC(kernel='linear', random_state=42)

#### other options to consider:
<code> clf = svm.SVC(kernel='linear', C=0.9, random_state=42) </code>
<blockquote> C = regularization, default=1 </blockquote> 
<code> clf = svm.SVC(C=500.0, kernel='poly', degree=4, coef0=0, gamma=1.) </code>
<blockquote> poly kernel for multiclass labeling </blockquote> 

## Preparing Input
**Corpus** contains text that has been preprocessed/cleaned.
**LabelInset** contains the text label using `InSet` lexicon.
**LabelSenti** contains the text label using `sentiwords_id` lexicon from sentistrength_id.

In [4]:
import os

# The input files live in the 'output' directory and later cells write their
# score files relative to the working directory, so the notebook works from
# inside it. Only change directory when we are not already there, so that
# re-running this cell does not fail trying to enter 'output/output'.
if os.path.basename(os.getcwd()) != 'output':
    os.chdir('output')
base = 'prastyo-sentiment_posneg-clean-slang-stop-dup.txt'
lb_inset = 'prastyo-sentiment_posneg-clean-slang-stop-lb-inset.txt'
lb_senti = 'prastyo-sentiment_posneg-clean-slang-stop-lb-senti.txt'

# Corpus: tab-separated text + actual label. The two label files hold a single
# 'label' column each. Everything is read as plain strings.
Corpus = pd.read_csv(base, encoding='latin-1', header=None, sep='\t', names=['text', 'label'], dtype=str)
LabelInset = pd.read_csv(lb_inset, encoding='latin-1', header=None, names=['label'], dtype=str)
LabelSenti = pd.read_csv(lb_senti, encoding='latin-1', header=None, names=['label'], dtype=str)

In [5]:
# Preview how many 'neg' / 'pos' labels each labeling contains.
neg0, pos0 = (Corpus['label'] == 'neg').sum(), (Corpus['label'] == 'pos').sum()
neg1, pos1 = (LabelInset['label'] == 'neg').sum(), (LabelInset['label'] == 'pos').sum()
neg2, pos2 = (LabelSenti['label'] == 'neg').sum(), (LabelSenti['label'] == 'pos').sum()

# One summary line per labeling: count and share of each class.
for n_neg, n_pos, source_tag in (
    (neg0, pos0, ' | actual label'),
    (neg1, pos1, ' | inset'),
    (neg2, pos2, ' | senti'),
):
    neg_share = '{0:.2f}'.format(n_neg/(n_neg+n_pos)*100)
    pos_share = '{0:.2f}'.format(n_pos/(n_neg+n_pos)*100)
    print('neg:', n_neg, '(', neg_share, '%)', '\t', 'pos:', n_pos, '(', pos_share, '%)', source_tag)

neg: 900 ( 54.22 %) 	 pos: 760 ( 45.78 %)  | actual label
neg: 1197 ( 72.11 %) 	 pos: 463 ( 27.89 %)  | inset
neg: 1114 ( 67.11 %) 	 pos: 546 ( 32.89 %)  | senti


### **\*Attention:** choose one labeling as the baseline for the rest of the algorithms
`LLmark` will be used later for filename differentiation when saving accuracy score to file

In [6]:
# Choose ONE labeling to train on. `LL` must be a frame with a 'label' column
# and `LLmark` selects the file names used when scores are saved.
#   0 -> actual label (baseline compared to itself)
#   1 -> labeling by InSet
#   2 -> labeling by sentiwords_id
# Previously every option was commented out, so a fresh "Run All" stopped with
# a NameError on `LL`.
# NOTE(review): the default of 2 is a guess - the recorded Test_Y class balance
# below looks closest to the sentiwords_id labeling. TODO confirm.
LLmark = 2

# Any value other than 1 or 2 falls back to the actual labels, matching how the
# score-saving cells treat LLmark.
LL = {1: LabelInset, 2: LabelSenti}.get(LLmark, Corpus[['label']])

In [7]:
# Show the first three corpus rows next to the first three rows of the chosen labeling.
print(Corpus[:3], '\n\n', LL[:3])

                                                text label
0  ya utang pemerintah utang bangsa indonesia hut...   neg
1  yuk kawal kebijakan pemerintah disalah oknum b...   pos
2  yuk bahu membahu membantuu pemerintah memutus ...   pos 

   label
0   neg
1   pos
2   neg


## Text Tokenization

In [8]:
# Step - a : Neutralise blank rows if any.
# Calling dropna(inplace=True) on the column does not remove rows from the
# frame, so a missing text would still reach the tokenizer and crash it.
# Rows are kept (as empty text) rather than dropped because the label frames
# are matched to Corpus by row position when the data is split.
Corpus['text'] = Corpus['text'].fillna('')
# Step - b : Lower-casing is not repeated here; it was done in '[1] text cleaning.ipynb'.
# Step - c : Tokenization : Each entry in the corpus will be broken into a list of words
Corpus['text'] = [word_tokenize(entry) for entry in Corpus['text']]

# Step - d : Keep alphabetic tokens only and store the token list as its string
# representation, which is the form the vectorizers below are fitted on.
# Built in one pass instead of growing the column row by row with .loc.
Corpus['text_final'] = [
    str([word for word in entry if word.isalpha()])
    for entry in Corpus['text']
]

<blockquote>Ref: <i>https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34</i></blockquote>

In [9]:
# Show the first three rows: tokenized 'text' next to the 'text_final' string.
print(Corpus[:3])

                                                text label  \
0  [ya, utang, pemerintah, utang, bangsa, indones...   neg   
1  [yuk, kawal, kebijakan, pemerintah, disalah, o...   pos   
2  [yuk, bahu, membahu, membantuu, pemerintah, me...   pos   

                                          text_final  
0  ['ya', 'utang', 'pemerintah', 'utang', 'bangsa...  
1  ['yuk', 'kawal', 'kebijakan', 'pemerintah', 'd...  
2  ['yuk', 'bahu', 'membahu', 'membantuu', 'pemer...  


## Split Data

In [10]:
# Split the training and test sets with ratio 70:30.
# Features, lexicon labels and actual labels are split in ONE call so the three
# are guaranteed to share the same rows, instead of relying on two separate
# calls happening to use the same random_state.
Train_X, Test_X, Train_Y, Test_Y, Train_Y_Actual, Test_Y_Actual = model_selection.train_test_split(
    Corpus['text_final'], LL['label'], Corpus['label'], test_size=0.3, random_state=42)

In [11]:
# Report the size of each split and its share of the data as a real percentage
# (the fraction used to be printed next to a '%' sign, e.g. "0.7 %").
n_documents = Train_X.size + Test_X.size
print(Train_X.size, '{:.2f}'.format(Train_X.size/n_documents*100), '%', '\n',
      Test_X.size, '{:.2f}'.format(Test_X.size/n_documents*100), '%')

1162 0.7 % 
 498 0.3 %


In [12]:
# Encoding the labels into value between 0 and n_classes-1.
# The encoder is fitted ONCE on every label that occurs in any split, then only
# applied. Re-fitting it per array (as before) lets each array get its own
# label -> integer mapping, which silently disagrees whenever one split is
# missing a class.
Encoder = LabelEncoder()
Encoder.fit(np.concatenate([Train_Y, Test_Y, Train_Y_Actual, Test_Y_Actual]))
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)
Train_Y_Actual = Encoder.transform(Train_Y_Actual)
Test_Y_Actual = Encoder.transform(Test_Y_Actual)

In [13]:
# print('TRAIN_X'+'\n', Train_X, '\n')
# print('TEST_X'+'\n', Test_X, '\n')
# print('TRAIN_Y'+'\n', Train_Y, '\n')
# print('TEST_Y'+'\n', Test_Y, '\n')
# # with np.printoptions():
# #     print(Test_X[:17])

# FEATURE EXTRACTION: Term presence

In [14]:
# CountVectorizer is already imported in the first cell, so it is not imported again here.
# binary=True means frequency isn't considered: a term is marked present or absent.
vectorizerTP = CountVectorizer(binary=True)
X = vectorizerTP.fit_transform(Corpus['text_final'])

In [15]:
# # print first and last 16 feature names
# print(vectorizerTP.get_feature_names()[:16],'...',
#       vectorizerTP.get_feature_names()[-16:])
# # print first and last 16 term presence vector for 6 rows/sentences
# with np.printoptions(edgeitems=16):
#     print(X.toarray()[:6])

# # print(X.shape, type(X))
# # # Or if we wanted to get the vector for one word:
# # print('Vector abai: ')
# # with np.printoptions(edgeitems=10):
# #     print(X.transform(['abai']).toarray())

# # print(vectorizer.vocabulary_)
# import reprlib
# print(reprlib.repr(vectorizerTP.vocabulary_))

In [16]:
# Turn the train and test documents into term-presence vectors, using the
# vectorizer fitted in the previous code cell.
Train_X_TP = vectorizerTP.transform(Train_X)
Test_X_TP = vectorizerTP.transform(Test_X)

In [17]:
# print(Train_X_TP)

### CLASSIFICATION with term presence

In [18]:
# Train on the term-presence vectors, then label the held-out documents.
predictions_SVM_TP = clf.fit(Train_X_TP, Train_Y).predict(Test_X_TP)

# Accuracy in percent, measured against the actual labels of the test split.
accuracy = 100 * accuracy_score(Test_Y_Actual, predictions_SVM_TP)
print('SVM Accuracy Score -> ', accuracy)

SVM Accuracy Score ->  60.44176706827309


In [19]:
# Save accuracy score to file. The file name depends on the chosen labeling
# (LLmark 1 or 2; anything else uses the lb0 file). Mode 'w' starts the file
# afresh, so this first score replaces any earlier content.
if LLmark == 1:
    output = 'svm_acc_lb1.txt'
elif LLmark == 2:
    output = 'svm_acc_lb2.txt'
else:
    output = 'svm_acc_lb0.txt'

with open(output, 'w') as f:
    f.write(str(accuracy))

In [20]:
# # Comparing the Lexicon Values with Predicted Values
# df = pd.DataFrame({'Lexicon Values':Test_Y, 'Predicted Values':predictions_SVM_TP})
# df

In [21]:
# print(predictions_SVM_TP)

# FEATURE EXTRACTION: BoW

In [22]:
# CountVectorizer is already imported in the first cell, so it is not imported again here.
# Default settings: each term is represented by its count in the document (bag of words).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(Corpus['text_final'])

In [23]:
# # print first and last 16 feature names
# print(vectorizer.get_feature_names()[:16],'...',
#       vectorizer.get_feature_names()[-16:])
# # print first and last 16 BoW vector for 6 rows/sentences
# with np.printoptions(edgeitems=16):
#     print(X.toarray()[:6])

# print(X.shape, type(X))
# # # Or if we wanted to get the vector for one word:
# # print('Vector abai: ')
# # with np.printoptions(edgeitems=10):
# #     print(X.transform(['abai']).toarray())

# # print(vectorizer.vocabulary_)
# import reprlib
# print(reprlib.repr(vectorizer.vocabulary_))

In [24]:
# Turn the train and test documents into BoW vectors, using the vectorizer
# fitted in the previous code cell.
Train_X_BoW = vectorizer.transform(Train_X)
Test_X_BoW = vectorizer.transform(Test_X)

In [25]:
# print(Train_X_BoW)

### CLASSIFICATION with BoW

In [26]:
# Train on the BoW vectors, then label the held-out documents.
predictions_SVM_BoW = clf.fit(Train_X_BoW, Train_Y).predict(Test_X_BoW)

# Accuracy in percent, measured against the actual labels of the test split.
accuracy = 100 * accuracy_score(Test_Y_Actual, predictions_SVM_BoW)
print('SVM Accuracy Score -> ', accuracy)

SVM Accuracy Score ->  59.63855421686747


In [27]:
# Save accuracy score to file: appended on a new line after the score that is
# already in the file for the chosen labeling (LLmark 1 or 2; anything else
# uses the lb0 file).
if LLmark == 1:
    output = 'svm_acc_lb1.txt'
elif LLmark == 2:
    output = 'svm_acc_lb2.txt'
else:
    output = 'svm_acc_lb0.txt'

with open(output, 'a') as f:
    f.write('\n' + str(accuracy))

In [28]:
# print(predictions_SVM_BoW)

# FEATURE EXTRACTION: TF-IDF

In [29]:
# fit_transform already fits the vectorizer, so the separate fit() call that
# used to precede it did the same work twice and has been removed.
# NOTE(review): the vectorizer is fitted on the whole corpus, test rows
# included - confirm that this is intended for the TF-IDF weights.
Tfidf_vect = TfidfVectorizer()
X = Tfidf_vect.fit_transform(Corpus['text_final'])

#### another option to consider

<code> Tfidf_vect = TfidfVectorizer(max_features=None).fit(Corpus['text_final']) </code>
<code> Tfidf_vect = TfidfVectorizer(max_features=5000).fit(Corpus['text_final']) </code>
<code> Tfidf_vect = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, \
                    use_idf=True).fit(Corpus['text_final']) </code>

In [30]:
# # print first and last 16 feature names
# print(Tfidf_vect.get_feature_names()[:16],'\n',
#       Tfidf_vect.get_feature_names()[-16:])
# # print first and last 16 TF-IDF vector for 6 rows/sentences
# with np.printoptions(edgeitems=16):
#     print(X.toarray()[:6])

# # print(Tfidf_vect.vocabulary_)
# import reprlib
# print(reprlib.repr(Tfidf_vect.vocabulary_))

# # # Or if we wanted to get the vector for one word:
# # # for example word in array[22]:
# # val = list(Tfidf_vect.vocabulary_)[22]
# # print(val)

In [31]:
# Turn the train and test documents into TF-IDF vectors, using the vectorizer
# fitted in the previous code cell.
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [32]:
# print(Train_X_Tfidf)

### CLASSIFICATION with TF-IDF

In [33]:
# Train on the TF-IDF vectors, then label the held-out documents.
predictions_SVM_Tfidf = clf.fit(Train_X_Tfidf, Train_Y).predict(Test_X_Tfidf)

# Accuracy in percent, measured against the actual labels of the test split.
accuracy = 100 * accuracy_score(Test_Y_Actual, predictions_SVM_Tfidf)
print('SVM Accuracy Score -> ', accuracy)

SVM Accuracy Score ->  64.45783132530121


In [34]:
# Save accuracy score to file: appended on a new line after the scores that
# are already in the file for the chosen labeling (LLmark 1 or 2; anything
# else uses the lb0 file).
if LLmark == 1:
    output = 'svm_acc_lb1.txt'
elif LLmark == 2:
    output = 'svm_acc_lb2.txt'
else:
    output = 'svm_acc_lb0.txt'

with open(output, 'a') as f:
    f.write('\n' + str(accuracy))

In [35]:
# print(predictions_SVM_Tfidf)

#### **Data Insight:**

In [36]:
# Compare the train/test proportions of the raw documents (row 'X') with those
# of the three vectorized representations (rows 'TP', 'BoWs', 'TF-IDF').
# NOTE(review): for the vectorized matrices `.size` is not the number of rows
# (the recorded output shows 19529 against 1162 documents); presumably it is
# the number of stored non-zero entries of the sparse matrix - TODO confirm
# that this is the quantity meant to be compared.
print('X:', Corpus['text'].size,
      '\nTrain\tTest\t\t%train\t%test\n',
#       Train_Y.size, '\t', Test_Y.size, '\ty\t', '{:.2f}'.format(Train_Y.size/(Test_Y.size+Train_Y.size)*100), '\t', '{:.2f}'.format(Test_Y.size/(Test_Y.size+Train_Y.size)*100), '\n',
      Train_X.size, '\t', Test_X.size, '\tX\t', '{:.2f}'.format(Train_X.size/(Test_X.size+Train_X.size)*100), '\t', '{:.2f}'.format(Test_X.size/(Test_X.size+Train_X.size)*100), '\n',
      Train_X_TP.size, '\t', Test_X_TP.size, '\tTP\t', '{:.2f}'.format(Train_X_TP.size/(Test_X_TP.size+Train_X_TP.size)*100), '\t', '{:.2f}'.format(Test_X_TP.size/(Test_X_TP.size+Train_X_TP.size)*100), '\n',
      Train_X_BoW.size, '\t', Test_X_BoW.size, '\tBoWs\t', '{:.2f}'.format(Train_X_BoW.size/(Test_X_BoW.size+Train_X_BoW.size)*100), '\t', '{:.2f}'.format(Test_X_BoW.size/(Test_X_BoW.size+Train_X_BoW.size)*100), '\n',
      Train_X_Tfidf.size, '\t', Test_X_Tfidf.size, '\tTF-IDF\t', '{:.2f}'.format(Train_X_Tfidf.size/(Test_X_Tfidf.size+Train_X_Tfidf.size)*100), '\t', '{:.2f}'.format(Test_X_Tfidf.size/(Test_X_Tfidf.size+Train_X_Tfidf.size)*100))

# Count the negatives (encoded 0) and positives (encoded 1) in the test split
# for the actual labels, the lexicon labels and each set of predictions.
# 'Test_Y_Actual' holds the actual labels and 'Test_Y' the lexicon labels;
# if the actual label is used as the baseline, the two are the same labels.
print('\nneg\tpos\t\tsum\n',
      (Test_Y_Actual==0).sum(), '\t', (Test_Y_Actual==1).sum(), '\t', Test_Y_Actual.size, '\tTest_Y_Actual\n',
      (Test_Y==0).sum(), '\t', (Test_Y==1).sum(), '\t', Test_Y.size, '\tTest_Y\n',
      (predictions_SVM_TP==0).sum(),    '\t', (predictions_SVM_TP==1).sum(),    '\t', predictions_SVM_TP.size,    '\tP_TP\n',
      (predictions_SVM_BoW==0).sum(),   '\t', (predictions_SVM_BoW==1).sum(),   '\t', predictions_SVM_BoW.size,   '\tP_BoWs\n',
      (predictions_SVM_Tfidf==0).sum(), '\t', (predictions_SVM_Tfidf==1).sum(), '\t', predictions_SVM_Tfidf.size, '\tP_Tfidf')

X: 1660 
Train	Test		%train	%test
 1162 	 498 	X	 70.00 	 30.00 
 19529 	 8560 	TP	 69.53 	 30.47 
 19529 	 8560 	BoWs	 69.53 	 30.47 
 19529 	 8560 	TF-IDF	 69.53 	 30.47

neg	pos		sum
 301 	 197 	 498 	Test_Y_Actual
 330 	 168 	 498 	Test_Y
 346 	 152 	 498 	P_TP
 334 	 164 	 498 	P_BoWs
 402 	 96 	 498 	P_Tfidf


# EVALUATION / VALIDATION 

## Confusion Matrix

In [37]:
# One confusion matrix per feature-extraction method, comparing the predicted
# labels with the actual/original labels of the test split.
from sklearn.metrics import confusion_matrix


y_true = Test_Y_Actual

for heading, y_pred in (
    ('Confusion Matrix - Term presence', predictions_SVM_TP),
    ('\nConfusion Matrix - BoW', predictions_SVM_BoW),
    ('\nConfusion Matrix - TF-IDF', predictions_SVM_Tfidf),
):
    print(heading)
    # labels=[1,0] puts class 1 in the first row/column and class 0 in the second
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[1,0])
    # the matrix total is printed next to it as a check on the number of test rows
    print(conf_matrix, conf_matrix.sum())

Confusion Matrix - Term presence
[[ 76 121]
 [ 76 225]] 498

Confusion Matrix - BoW
[[ 80 117]
 [ 84 217]] 498

Confusion Matrix - TF-IDF
[[ 58 139]
 [ 38 263]] 498


## Classification Report: *using imbalanced data*
Since our label classes—whether *actual label*, *label by InSet*, or *label by sentiwords_id*—are not distributed equally (or even close to equally), our data is class-imbalanced. This can cause the classification model to misclassify, resulting in inaccurate scores. We'll first train on the imbalanced data as it is and evaluate the result.

In [38]:
# Classification report per feature-extraction method for the models trained on
# the imbalanced data, scored against the actual labels of the test split.
from sklearn.metrics import classification_report


for feature_name, feature_predictions in (
    ('Term presence', predictions_SVM_TP),
    ('BoW', predictions_SVM_BoW),
    ('TF-IDF', predictions_SVM_Tfidf),
):
    print('Imbalanced data - ' + feature_name + '\n',
          classification_report(Test_Y_Actual, feature_predictions))

Imbalanced data - Term presence
               precision    recall  f1-score   support

           0       0.65      0.75      0.70       301
           1       0.50      0.39      0.44       197

    accuracy                           0.60       498
   macro avg       0.58      0.57      0.57       498
weighted avg       0.59      0.60      0.59       498

Imbalanced data - BoW
               precision    recall  f1-score   support

           0       0.65      0.72      0.68       301
           1       0.49      0.41      0.44       197

    accuracy                           0.60       498
   macro avg       0.57      0.56      0.56       498
weighted avg       0.59      0.60      0.59       498

Imbalanced data - TF-IDF
               precision    recall  f1-score   support

           0       0.65      0.87      0.75       301
           1       0.60      0.29      0.40       197

    accuracy                           0.64       498
   macro avg       0.63      0.58      0.57   

#### **\*note:**
The code below serves the same purpose as the code above. It is preserved to show the original process, in which the prediction results are not stored in variables first.

In [39]:
# # Make classification report using 'imbalanced' data
# from sklearn.metrics import classification_report


# ## Term presence ##
# X_train = Train_X_TP
# X_test = Test_X_TP
# clf.fit(X_train, Train_Y)
# print('Imbalanced data - Term presence\n',
#       classification_report(Test_Y_Actual, clf.predict(X_test)))

# ## BoW ##
# X_train = Train_X_BoW
# X_test = Test_X_BoW
# clf.fit(X_train, Train_Y)
# print('Imbalanced data - BoW\n',
#       classification_report(Test_Y_Actual, clf.predict(X_test)))

# ## TF-IDF ##
# X_train = Train_X_Tfidf
# X_test = Test_X_Tfidf
# clf.fit(X_train, Train_Y)
# print('Imbalanced data - TF-IDF\n',
#       classification_report(Test_Y_Actual, clf.predict(X_test)))

## Classification Report: *using oversampled data*
Here we'll **oversample** our imbalanced training data, train on it, and evaluate the result.

In [40]:
# Save accuracy score to file
def acc_oversampled(LLmark, accuracy):
    """Append one accuracy score, followed by a newline, to a score file.

    The file is chosen by ``LLmark``: 1 -> 'svm_acc_o_lb1.txt',
    2 -> 'svm_acc_o_lb2.txt', any other value -> 'svm_acc_o_lb0.txt'.
    The file is opened in append mode in the current working directory.
    """
    if LLmark == 1:
        output = 'svm_acc_o_lb1.txt'
    elif LLmark == 2:
        output = 'svm_acc_o_lb2.txt'
    else:
        output = 'svm_acc_o_lb0.txt'

    with open(output, 'a') as f:
        f.write(str(accuracy) + '\n')

In [41]:
# Make classification report using 'oversampled' data
from imblearn.over_sampling import SVMSMOTE


# A fixed random_state keeps the oversampling repeatable; this sampler object
# is also the one used by the k-fold cells further down.
svmsmote = SVMSMOTE(random_state = 500)

y_train = Train_Y          # labels the classifier is trained on
y_test = Test_Y_Actual     # actual labels the predictions are scored against

# The same steps are applied to each feature-extraction method, so they run in
# one loop instead of three copy-pasted stanzas.
for feature_name, X_train, X_test in (
    ('Term presence', Train_X_TP, Test_X_TP),
    ('BoW', Train_X_BoW, Test_X_BoW),
    ('TF-IDF', Train_X_Tfidf, Test_X_Tfidf),
):
    # Oversample the training split only; the test split is left untouched.
    X_oversample_svm, y_oversample_svm = svmsmote.fit_resample(X_train, y_train)
    # train the classifier with oversampled data using borderline-SMOTE SVM (SVM SMOTE)
    clf.fit(X_oversample_svm, y_oversample_svm)
    # Predict once and reuse the result for both the saved accuracy and the
    # printed report (it used to be computed twice per method).
    oversampled_predictions = clf.predict(X_test)
    acc_oversampled(LLmark, accuracy_score(y_test, oversampled_predictions)*100)
    print('Oversampled data - ' + feature_name + '\n',
          classification_report(y_test, oversampled_predictions))

Oversampled data - Term presence
               precision    recall  f1-score   support

           0       0.61      0.61      0.61       301
           1       0.40      0.40      0.40       197

    accuracy                           0.52       498
   macro avg       0.50      0.50      0.50       498
weighted avg       0.52      0.52      0.52       498

Oversampled data - BoW
               precision    recall  f1-score   support

           0       0.61      0.60      0.61       301
           1       0.40      0.41      0.40       197

    accuracy                           0.53       498
   macro avg       0.51      0.51      0.51       498
weighted avg       0.53      0.53      0.53       498

Oversampled data - TF-IDF
               precision    recall  f1-score   support

           0       0.65      0.80      0.72       301
           1       0.53      0.35      0.42       197

    accuracy                           0.62       498
   macro avg       0.59      0.57      0.57

<blockquote><i>"The purpose of oversampling is ... to have a better prediction model. This technique was not created for any analysis purposes as every data created is synthetic, so that is a reminder."</i></blockquote>

<blockquote><i>"... <b>you should only oversample your training data and not the whole data</b> except if you would use the entire data as your training data. <b>In case you want to split the data, you should split the data first</b> before oversampled the training data."</i></blockquote>

<blockquote>Ref: <i>https://towardsdatascience.com/5-smote-techniques-for-oversampling-your-imbalance-data-b8155bdbe2b5?gi=67231aa6fa80</i></blockquote>

## Validation using k-Fold cv
Feel free to change the `n_splits` value based on your needs. For example, 5 splits means that the data (X and y) is split into 5 portions: *4 portions* form the new training set and *1 portion* the new test set, and only the training portions are oversampled. If `shuffle` is set to True, the rows are shuffled before they are divided into portions. The model is then cross-validated over 5 iterations, each one using a different portion as the test set.

In [42]:
# Save accuracy score to file
# Marks "no third argument given", so that any real accuracy value - None
# included - can still be passed explicitly.
_ACCURACY_NOT_GIVEN = object()


def acc_oversampled(LLmark, featExt, accuracy=_ACCURACY_NOT_GIVEN):
    """Append one accuracy score, followed by a newline, to a score file.

    This definition replaces an earlier two-argument function of the same name.
    To keep that earlier call form working when cells are re-run out of order,
    both forms are accepted:

    * ``acc_oversampled(LLmark, featExt, accuracy)`` writes to
      'svm_acc_ov_lb<N>_<featExt>_kfold.txt';
    * ``acc_oversampled(LLmark, accuracy)`` writes to 'svm_acc_o_lb<N>.txt',
      as the earlier definition did.

    ``<N>`` is 1 or 2 when ``LLmark`` equals 1 or 2, and 0 for any other value.
    Files are opened in append mode in the current working directory.
    """
    if LLmark == 1:
        label_tag = 'lb1'
    elif LLmark == 2:
        label_tag = 'lb2'
    else:
        label_tag = 'lb0'

    if accuracy is _ACCURACY_NOT_GIVEN:
        # Two-argument call: the second positional argument is the score.
        accuracy = featExt
        output = 'svm_acc_o_' + label_tag + '.txt'
    else:
        output = 'svm_acc_ov_' + label_tag + '_' + str(featExt) + '_kfold.txt'

    with open(output, 'a') as f:
        f.write(str(accuracy) + '\n')

In [43]:
# Save precision, recall, f1-score to file
def cr_oversampled(LLmark, featExt, precision, recall, f1):
    """Append one tab-separated "precision, recall, f1" line to a result file.

    The file name is 'svm_cr_ov_lb<N>_<featExt>_kfold.txt', where ``<N>`` is
    1 or 2 when ``LLmark`` equals 1 or 2, and 0 for any other value. The file
    is opened in append mode in the current working directory.
    """
    if LLmark == 1:
        prefix = 'svm_cr_ov_lb1_'
    elif LLmark == 2:
        prefix = 'svm_cr_ov_lb2_'
    else:
        prefix = 'svm_cr_ov_lb0_'
    output = prefix + str(featExt) + '_kfold.txt'

    row = '\t'.join([str(precision), str(recall), str(f1)]) + '\n'
    with open(output, 'a') as f:
        f.write(row)

In [44]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import KFold

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Encode the actual labels (y_true) and the chosen lexicon labels (y) for the
# whole corpus.
# The 'label' column is passed as a 1-D Series: handing LabelEncoder a
# single-column DataFrame made it emit a warning. The encoder is also fitted
# once on both label sets, so the two arrays share one label -> integer
# mapping instead of each getting its own from a separate fit.
Encoder.fit(pd.concat([Corpus['label'], LL['label']]))
y_true = Encoder.transform(Corpus['label'])
y = Encoder.transform(LL['label'])

  return f(*args, **kwargs)


#### **Step 1:** Evaluate with `term presence` as the feature extraction method. Then, **oversample the training data** in each fold using `SVM SMOTE`

In [45]:
featExt = 'tp'

# Term-presence matrix of the whole corpus; each fold selects its rows from it.
X = vectorizerTP.fit_transform(Corpus['text_final'])
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Features and lexicon labels of this fold.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    # Actual labels of the same rows; predictions are scored against these.
    y_true_train, y_true_test = y_true[train_index], y_true[test_index]

    # Oversample the training part of the fold only.
    X_train_oversampled, y_train_oversampled = svmsmote.fit_resample(X_train, y_train.ravel())

    y_pred = clf.fit(X_train_oversampled, y_train_oversampled).predict(X_test)

    accuracy, precision, recall, f1 = (
        score(y_true_test, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # append this fold's scores to the k-fold result files
    acc_oversampled(LLmark, featExt, accuracy)
    cr_oversampled(LLmark, featExt, precision, recall, f1)

    print(f'# For fold {fold}:')
    print(f'accuracy: {accuracy}')
    print(f'f-score: {f1}')

# For fold 1:
accuracy: 0.5271084337349398
f-score: 0.3843137254901961
# For fold 2:
accuracy: 0.5692771084337349
f-score: 0.5017421602787456
# For fold 3:
accuracy: 0.5421686746987951
f-score: 0.48299319727891155
# For fold 4:
accuracy: 0.49096385542168675
f-score: 0.4565916398713827
# For fold 5:
accuracy: 0.46987951807228917
f-score: 0.38461538461538464


#### **Step 2:** Evaluate with `BoW` as the feature extraction method. Then, **oversample the training data** in each fold using `SVM SMOTE`

In [46]:
featExt = 'bow'

# BoW matrix of the whole corpus; each fold selects its rows from it.
X = vectorizer.fit_transform(Corpus['text_final'])
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Features and lexicon labels of this fold.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    # Actual labels of the same rows; predictions are scored against these.
    y_true_train, y_true_test = y_true[train_index], y_true[test_index]

    # Oversample the training part of the fold only.
    X_train_oversampled, y_train_oversampled = svmsmote.fit_resample(X_train, y_train.ravel())

    y_pred = clf.fit(X_train_oversampled, y_train_oversampled).predict(X_test)

    accuracy, precision, recall, f1 = (
        score(y_true_test, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # append this fold's scores to the k-fold result files
    acc_oversampled(LLmark, featExt, accuracy)
    cr_oversampled(LLmark, featExt, precision, recall, f1)

    print(f'# For fold {fold}:')
    print(f'accuracy: {accuracy}')
    print(f'f-score: {f1}')

# For fold 1:
accuracy: 0.5060240963855421
f-score: 0.3880597014925374
# For fold 2:
accuracy: 0.5602409638554217
f-score: 0.4859154929577465
# For fold 3:
accuracy: 0.5512048192771084
f-score: 0.4879725085910653
# For fold 4:
accuracy: 0.48493975903614456
f-score: 0.4429967426710098
# For fold 5:
accuracy: 0.46987951807228917
f-score: 0.393103448275862


#### **Step 3:** Evaluate with `TF-IDF` as the feature extraction method. Then, **oversample the training data** in each fold using `SVM SMOTE`

In [47]:
featExt = 'tfidf'

# TF-IDF matrix of the whole corpus; each fold selects its rows from it.
# NOTE(review): the vectorizer is fitted before the folds are formed, so the
# test rows of every fold take part in the fit - confirm this is intended.
X = Tfidf_vect.fit_transform(Corpus['text_final'])
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Features and lexicon labels of this fold.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    # Actual labels of the same rows; predictions are scored against these.
    y_true_train, y_true_test = y_true[train_index], y_true[test_index]

    # Oversample the training part of the fold only.
    X_train_oversampled, y_train_oversampled = svmsmote.fit_resample(X_train, y_train.ravel())

    y_pred = clf.fit(X_train_oversampled, y_train_oversampled).predict(X_test)

    accuracy, precision, recall, f1 = (
        score(y_true_test, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # append this fold's scores to the k-fold result files
    acc_oversampled(LLmark, featExt, accuracy)
    cr_oversampled(LLmark, featExt, precision, recall, f1)

    print(f'# For fold {fold}:')
    print(f'accuracy: {accuracy}')
    print(f'f-score: {f1}')

# For fold 1:
accuracy: 0.6295180722891566
f-score: 0.43317972350230416
# For fold 2:
accuracy: 0.6054216867469879
f-score: 0.43776824034334766
# For fold 3:
accuracy: 0.5873493975903614
f-score: 0.43621399176954734
# For fold 4:
accuracy: 0.5632530120481928
f-score: 0.3933054393305439
# For fold 5:
accuracy: 0.5753012048192772
f-score: 0.35023041474654376


<blockquote>Ref: <i>https://stackoverflow.com/questions/55591063/how-to-perform-smote-with-cross-validation-in-sklearn-in-python</i></blockquote>