In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
import re
import string
from collections import Counter
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the sentence-tokenizer data that `word_tokenize` depends on.
# Newer NLTK releases look up the 'punkt_tab' resource instead of the legacy
# pickled 'punkt' models, so fetch both to keep the notebook runnable on
# either version. 'punkt' stays last so the cell's displayed value is unchanged.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Locations of the three pre-split, tab-separated datasets (files have no header row).
file_path_train = '../data/train_preprocess.tsv'
file_path_val = '../data/valid_preprocess.tsv'
file_path_test = '../data/test_preprocess.tsv'

train_data, val_data, test_data = (
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in (file_path_train, file_path_val, file_path_test)
)

# Data Understanding
# Print the shape and the first rows of every split; the label distribution
# (column 1) is reported for the training split only.
for split_title, split_frame in (
    ('Data Train', train_data),
    ('\nData Validation', val_data),
    ('\nData Test', test_data),
):
    print(split_title)
    print('Data Size:', split_frame.shape)
    print('Sample Data:')
    display(split_frame.head())
    if split_frame is train_data:
        print('\nSentiment Distribution:', train_data[1].value_counts())

Data Train
Data Size: (11000, 2)
Sample Data:


Unnamed: 0,0,1
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative



Sentiment Distribution: 1
positive    6416
negative    3436
neutral     1148
Name: count, dtype: int64

Data Validation
Data Size: (1260, 2)
Sample Data:


Unnamed: 0,0,1
0,"meski masa kampanye sudah selesai , bukan bera...",neutral
1,tidak enak,negative
2,restoran ini menawarkan makanan sunda . kami m...,positive
3,lokasi di alun alun masakan padang ini cukup t...,positive
4,betapa bejad kader gerindra yang anggota dprd ...,negative



Data Test
Data Size: (500, 2)
Sample Data:


Unnamed: 0,0,1
0,kemarin gue datang ke tempat makan baru yang a...,negative
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative
2,"kalau dipikir-pikir , sebenarnya tidak ada yan...",negative
3,ini pertama kalinya gua ke bank buat ngurusin ...,negative
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative


In [3]:
# Label Splitting
# Column 0 of each frame is used as the model input, column 1 as the target.
x_train, y_train = train_data[0], train_data[1]
x_val, y_val = val_data[0], val_data[1]
x_test, y_test = test_data[0], test_data[1]

# Preview the first rows of every input/target Series, split by split.
for heading, column in (
    ('x_train:\n', x_train),
    ('y_train:\n', y_train),
    ('x_val:\n', x_val),
    ('y_val:\n', y_val),
    ('x_test:\n', x_test),
    ('y_test:\n', y_test),
):
    print(heading)
    display(column.head())

x_train:



0    warung ini dimiliki oleh pengusaha pabrik tahu...
1    mohon ulama lurus dan k212 mmbri hujjah partai...
2    lokasi strategis di jalan sumatera bandung . t...
3    betapa bahagia nya diri ini saat unboxing pake...
4    duh . jadi mahasiswa jangan sombong dong . kas...
Name: 0, dtype: object

y_train:



0    positive
1     neutral
2    positive
3    positive
4    negative
Name: 1, dtype: object

x_val:



0    meski masa kampanye sudah selesai , bukan bera...
1                                           tidak enak
2    restoran ini menawarkan makanan sunda . kami m...
3    lokasi di alun alun masakan padang ini cukup t...
4    betapa bejad kader gerindra yang anggota dprd ...
Name: 0, dtype: object

y_val:



0     neutral
1    negative
2    positive
3    positive
4    negative
Name: 1, dtype: object

x_test:



0    kemarin gue datang ke tempat makan baru yang a...
1    kayak nya sih gue tidak akan mau balik lagi ke...
2    kalau dipikir-pikir , sebenarnya tidak ada yan...
3    ini pertama kalinya gua ke bank buat ngurusin ...
4    waktu sampai dengan gue pernah disuruh ibu lat...
Name: 0, dtype: object

y_test:



0    negative
1    negative
2    negative
3    negative
4    negative
Name: 1, dtype: object

In [4]:
# Label Encoding
# Fit the encoder on the training labels only, then reuse the same mapping
# for the validation and test labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded, y_val_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_val, y_test)
)

for caption, encoded in (
    ('Encoded labels train:', y_train_encoded),
    ('Encoded labels validation:', y_val_encoded),
    ('Encoded labels test:', y_test_encoded),
    ('Classes:', label_encoder.classes_),
):
    print(caption, encoded)

Encoded labels train: [2 1 2 ... 1 0 2]
Encoded labels validation: [1 0 2 ... 0 0 2]
Encoded labels test: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 0 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1
 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 

In [5]:
# Preprocessing
def lowercase(data):
    """Return a new Series in which every string of `data` is lower-cased.

    Args:
        data: pandas Series of strings.

    Returns:
        pandas Series with the same index, produced by `Series.str.lower()`.
    """
    lowered = data.str.lower()
    return lowered

def entity_masking(data):
    """Replace e-mail addresses and phone numbers with placeholder tokens.

    E-mail addresses are masked first (as `_email_`), then phone numbers
    (as `_phone_`), so digits inside an address are never treated as a phone
    number.

    Args:
        data: pandas Series of strings.

    Returns:
        pandas Series with the same index and the entities masked.
    """
    # Email masking
    # Local part and domain may contain dot/hyphen separated labels
    # (e.g. "john.doe@mail.co.id"), and the top-level domain may have any
    # length >= 2. The previous pattern required exactly three letters after a
    # single dot, so two-letter TLDs were missed and longer ones were only
    # partially masked.
    email = re.compile(r'\w+(?:[.+-]\w+)*@\w+(?:[.-]\w+)*\.[a-z]{2,}')
    data_mask = data.str.replace(email, '_email_', regex=True)

    # Phone Number Masking
    # Optional country prefix, optional 3-digit area code, then 3 + 4 digits,
    # each part optionally separated by "-", "." or whitespace.
    phone = re.compile(r'(\+?\d{1,2}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}')
    data_mask = data_mask.str.replace(phone, '_phone_', regex=True)

    return data_mask

def tokenization(data):
    """Split every string of `data` into a list of tokens.

    Args:
        data: pandas Series of strings.

    Returns:
        pandas Series whose entries are the token lists returned by NLTK's
        `word_tokenize` for the corresponding input string.
    """
    return data.apply(word_tokenize)

def idn_spelling_correction(data):
    """Stem every token of every tokenised sentence with the Sastrawi stemmer.

    Note that, despite its name, this function performs no spelling
    correction: each token is simply passed through `stemmer.stem`.

    Args:
        data: iterable of token sequences (one sequence per sentence).

    Returns:
        list of lists holding the stemmed tokens, in the original order.
    """
    stemmer = StemmerFactory().create_stemmer()
    return [[stemmer.stem(token) for token in tokens] for tokens in data]

def remove_punctuation(data):
    """Drop tokens that consist solely of ASCII punctuation characters.

    A token is removed when every one of its characters is in
    `string.punctuation`; this also removes empty tokens. The check is done
    per character, so multi-character punctuation tokens such as "..." or
    "!!" are removed too (a plain `token in string.punctuation` test is a
    substring test and lets those through).

    Args:
        data: iterable of token sequences (one sequence per sentence).

    Returns:
        list of lists with the punctuation-only tokens filtered out.
    """
    punctuation_chars = set(string.punctuation)

    def _is_punctuation(token):
        # all() over an empty token is True, so empty strings are dropped as well.
        return all(char in punctuation_chars for char in token)

    return [
        [token for token in sentence if not _is_punctuation(token)]
        for sentence in data
    ]

def preprocess(data):
    """Run the full text-cleaning pipeline on a Series of raw strings.

    The stages are applied in this order: lower-casing, entity masking,
    tokenisation, stemming (`idn_spelling_correction`) and punctuation
    removal. Each stage receives the output of the previous one.

    Args:
        data: pandas Series of strings.

    Returns:
        The output of the final stage (`remove_punctuation`).
    """
    pipeline = (
        lowercase,
        entity_masking,
        tokenization,
        idn_spelling_correction,
        remove_punctuation,
    )
    for stage in pipeline:
        data = stage(data)
    return data

In [7]:
# Preprocess Data
# Apply the same cleaning pipeline to the train, validation and test inputs (in that order).
x_train_preprocessed, x_val_preprocessed, x_test_preprocessed = (
    preprocess(split) for split in (x_train, x_val, x_test)
)

In [14]:
# Feature Extraction: Bag of Words TF-IDF
def _join_tokens(docs):
    """Turn token lists into space-separated strings; leave joined strings untouched."""
    return [doc if isinstance(doc, str) else ' '.join(doc) for doc in docs]

# The `*_preprocessed` names are rebound from token lists to strings here.
# Joining must therefore be guarded: running `' '.join` on an already-joined
# string would split the document into single characters, silently corrupting
# the features whenever this cell is executed more than once.
x_train_preprocessed = _join_tokens(x_train_preprocessed)
x_val_preprocessed = _join_tokens(x_val_preprocessed)
x_test_preprocessed = _join_tokens(x_test_preprocessed)

# min_df=100 keeps only terms that occur in at least 100 training documents;
# the token pattern keeps purely alphabetic runs, so digits and underscores
# act as separators.
tfidf_vec = TfidfVectorizer(min_df=100, token_pattern=r'[a-zA-Z]+')
# Fit the vocabulary/IDF weights on the training split only, then reuse them.
x_train_bow = tfidf_vec.fit_transform(x_train_preprocessed)
x_val_bow = tfidf_vec.transform(x_val_preprocessed)
x_test_bow = tfidf_vec.transform(x_test_preprocessed)


In [17]:
# Evaluation Metrics: Accuracy, Precision, Recall, F1-Score
def model_predict(model, x):
    """Return the predictions of `model` for the samples in `x`.

    Args:
        model: any object exposing a `predict(x)` method.
        x: samples, forwarded unchanged to `model.predict`.

    Returns:
        Whatever `model.predict(x)` returns.
    """
    return model.predict(x)

def model_evaluation(y_pred, y_actual):
    """Score predictions against the true labels.

    Note the argument order: predictions first, ground truth second.

    Args:
        y_pred: predicted labels.
        y_actual: true labels.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`; precision, recall and F1
        are computed with `average='weighted'`.
    """
    weighted_scores = tuple(
        scorer(y_actual, y_pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    return (accuracy_score(y_actual, y_pred),) + weighted_scores

In [24]:
# Model Training
# Logistic Regression
# Baseline: default regularisation, trained on the TF-IDF training matrix.
print('Logistic Regression')
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train_bow, y_train_encoded)

# Predictions validation data
y_pred = model_predict(logreg, x_val_bow)
print('Predictions:', y_pred)
print('Actual:', y_val_encoded)
print()

evaluation_metrics = model_evaluation(y_pred, y_val_encoded)
for metric_label, metric_value in zip(
    ('Accuracy:', 'Precision:', 'Recall:', 'F1-Score:'), evaluation_metrics
):
    print(metric_label, metric_value)

# Hyperparameter Tuning
# 5-fold cross-validated grid search over regularisation strength and solver.
print("\nHyperparameter Tuning for Logistic Regression")
param_grid_logreg = {
    'C': [0.1, 1, 10],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}
grid_search_logreg = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid_logreg,
    cv=5,
)
grid_search_logreg.fit(x_train_bow, y_train_encoded)
print('Best Parameters (LogReg):', grid_search_logreg.best_params_)
print('Best Score (LogReg):', grid_search_logreg.best_score_)

# Predictions validation data (made through the fitted grid-search object)
y_pred = model_predict(grid_search_logreg, x_val_bow)
print('Predictions:', y_pred)
print('Actual:', y_val_encoded)
print()

evaluation_metrics = model_evaluation(y_pred, y_val_encoded)
for metric_label, metric_value in zip(
    ('Accuracy:', 'Precision:', 'Recall:', 'F1-Score:'), evaluation_metrics
):
    print(metric_label, metric_value)

Logistic Regression
Predictions: [0 2 2 ... 2 2 2]
Actual: [1 0 2 ... 0 0 2]

Accuracy: 0.6634920634920635
Precision: 0.637041247647381
Recall: 0.6634920634920635
F1-Score: 0.6298988267935423

Hyperparameter Tuning for Logistic Regression
Best Parameters (LogReg): {'C': 10, 'solver': 'newton-cg'}
Best Score (LogReg): 0.6822727272727273
Predictions: [0 2 2 ... 2 2 2]
Actual: [1 0 2 ... 0 0 2]

Accuracy: 0.6706349206349206
Precision: 0.6482914579973402
Recall: 0.6706349206349206
F1-Score: 0.6420954434456648


In [25]:
# Random Forest Classifier
# Random forests are stochastic (bootstrap samples, random feature subsets).
# A fixed seed makes the scores below reproducible across kernel restarts.
RF_RANDOM_STATE = 42

print('Random Forest')
rf = RandomForestClassifier(random_state=RF_RANDOM_STATE)
rf.fit(x_train_bow, y_train_encoded)

# Prediction on validation set
y_pred = model_predict(rf, x_val_bow)
print('Predictions:', y_pred)
print('Actual:', y_val_encoded)
print()

evaluation_metrics = model_evaluation(y_pred, y_val_encoded)
print('Accuracy:', evaluation_metrics[0])
print('Precision:', evaluation_metrics[1])
print('Recall:', evaluation_metrics[2])
print('F1-Score:', evaluation_metrics[3])

# Hyperparameter Tuning
# 5-fold cross-validated grid search over forest size and tree depth; the
# estimator is seeded too so that the selected parameters are reproducible.
print("\nHyperparameter Tuning for Random Forest")
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20]
}
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=RF_RANDOM_STATE), param_grid_rf, cv=5
)
grid_search_rf.fit(x_train_bow, y_train_encoded)
print('Best Parameters (RandomForest):', grid_search_rf.best_params_)
print('Best Score (RandomForest):', grid_search_rf.best_score_)

# Prediction on validation data (made through the fitted grid-search object)
y_pred = model_predict(grid_search_rf, x_val_bow)
print('Predictions:', y_pred)
print('Actual:', y_val_encoded)
print()

evaluation_metrics = model_evaluation(y_pred, y_val_encoded)
print('Accuracy:', evaluation_metrics[0])
print('Precision:', evaluation_metrics[1])
print('Recall:', evaluation_metrics[2])
print('F1-Score:', evaluation_metrics[3])

Random Forest
Predictions: [0 2 2 ... 2 0 2]
Actual: [1 0 2 ... 0 0 2]

Accuracy: 0.6984126984126984
Precision: 0.6917692435633565
Recall: 0.6984126984126984
F1-Score: 0.6866909583249067

Hyperparameter Tuning for Random Forest
Best Parameters (RandomForest): {'max_depth': 20, 'n_estimators': 200}
Best Score (RandomForest): 0.7158181818181819
Predictions: [0 0 2 ... 2 2 2]
Actual: [1 0 2 ... 0 0 2]

Accuracy: 0.6952380952380952
Precision: 0.6858872535275782
Recall: 0.6952380952380952
F1-Score: 0.6832383541046135


In [26]:
# Support Vector Machine
# The model header is printed first, matching the Logistic Regression and
# Random Forest cells, so the predictions below are attributed to the right model.
print('Support Vector Machine')
svm = SVC()
svm.fit(x_train_bow, y_train_encoded)

# Prediction on validation data
y_pred = model_predict(svm, x_val_bow)
print('Predictions:', y_pred)
print('Actual:', y_val_encoded)
print()

evaluation_metrics = model_evaluation(y_pred, y_val_encoded)
print('Accuracy:', evaluation_metrics[0])
print('Precision:', evaluation_metrics[1])
print('Recall:', evaluation_metrics[2])
print('F1-Score:', evaluation_metrics[3])

# Hyperparameter Tuning
# 5-fold cross-validated grid search over the penalty strength and kernel type.
print("\nHyperparameter Tuning for Support Vector Machine")
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=5)
grid_search_svm.fit(x_train_bow, y_train_encoded)
print('Best Parameters (SVM):', grid_search_svm.best_params_)
print('Best Score (SVM):', grid_search_svm.best_score_)

# Prediction on validation data (made through the fitted grid-search object)
y_pred = model_predict(grid_search_svm, x_val_bow)
print('Predictions:', y_pred)
print('Actual:', y_val_encoded)
print()

evaluation_metrics = model_evaluation(y_pred, y_val_encoded)
print('Accuracy:', evaluation_metrics[0])
print('Precision:', evaluation_metrics[1])
print('Recall:', evaluation_metrics[2])
print('F1-Score:', evaluation_metrics[3])

Predictions: [0 0 2 ... 2 2 2]
Actual: [1 0 2 ... 0 0 2]

Support Vector Machine
Accuracy: 0.6968253968253968
Precision: 0.6809768771056545
Recall: 0.6968253968253968
F1-Score: 0.6813644328472854

Hyperparameter Tuning for Support Vector Machine
Best Parameters (SVM): {'C': 10, 'kernel': 'rbf'}
Best Score (SVM): 0.7238181818181817
Predictions: [0 0 2 ... 2 2 2]
Actual: [1 0 2 ... 0 0 2]

Accuracy: 0.703968253968254
Precision: 0.6869554094359898
Recall: 0.703968253968254
F1-Score: 0.6900201337960555
