In [2]:
import nltk
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
train_data = pd.read_csv('train_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])
test_data = pd.read_csv('test_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])
validation_data = pd.read_csv('valid_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])

In [5]:
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']

X_test = test_data.drop(columns=['label'])
y_test = test_data['label']

X_val = validation_data.drop(columns=['label'])
y_val = validation_data['label']


In [6]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def preprocess(sent):
    # menghapus token yang bukan alfabet (non alfabet/non letter)
    remove_nonletters = re.sub("[^a-zA-Z]", " ", sent)
    # mengubah token menjadi lower case
    lower = nltk.word_tokenize(remove_nonletters.lower())
    # set list stopwords untuk bahasa indonesia
    stop_words = set(stopwords.words('indonesian'))
    # stemming token untuk token non stopword
    stopwords_removed = [stemmer.stem(token) for token in lower if not token in stop_words]
    return " ".join(stopwords_removed)

In [8]:
preprocess("memberikan pujian")

'puji'

In [9]:
train_data['cleaned'] = train_data['text'].apply(preprocess)
test_data['cleaned'] = test_data['text'].apply(preprocess)
validation_data['cleaned'] = validation_data['text'].apply(preprocess)

In [10]:
# Feature Extraction : return vektor angka

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_data['cleaned']).toarray()
test_features = vectorizer.transform(test_data['cleaned']).toarray()
val_features = vectorizer.transform(validation_data['cleaned']).toarray()

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model_name, y_true, y_pred):
    print(f"Evaluating {model_name} Model")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average='weighted'))
    print("Recall:", recall_score(y_true, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))
    print()

In [23]:
from sklearn.linear_model import LogisticRegression

print("Training Softmax Regression...")
softmax_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)
softmax_reg.fit(train_features, y_train)

Training Softmax Regression...




In [24]:
softmax_pred = softmax_reg.predict(test_features)
evaluate_model("Softmax Regression", y_test, softmax_pred)

Evaluating Softmax Regression Model
Accuracy: 0.736
Precision: 0.7345994473930293
Recall: 0.736
F1 Score: 0.728736079617032



In [40]:
from sklearn.neural_network import MLPClassifier

print("Training MLP (Neural Network)...")
mlp = MLPClassifier(hidden_layer_sizes=(125, 50), max_iter=200, alpha=0.005)
mlp.fit(train_features, train_data['label'])

Training MLP (Neural Network)...


In [41]:
mlp_pred = mlp.predict(test_features)
evaluate_model("MLP", y_test, mlp_pred)

Evaluating MLP Model
Accuracy: 0.72
Precision: 0.7159257158674991
Recall: 0.72
F1 Score: 0.7133432898483393



In [20]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_val_encoded = label_encoder.transform(y_val)


In [21]:
from xgboost import XGBClassifier

print("Training XGBoost...")
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(train_features, y_train_encoded)

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



In [22]:
xgb_pred = xgb.predict(test_features)

xgb_pred_labels = label_encoder.inverse_transform(xgb_pred)
evaluate_model("XGBoost", y_test, xgb_pred_labels)

Evaluating XGBoost Model
Accuracy: 0.688
Precision: 0.7285472880061116
Recall: 0.688
F1 Score: 0.6617940145502645



In [42]:
print(softmax_reg.classes_)

['negative' 'neutral' 'positive']


In [43]:
print("Prediksi probabilitas Softmax Regression:")
pred_softmax = softmax_reg.predict_proba(vectorizer.transform([preprocess('makanan di sini enak sekali. saya suka!')]))
print('negative score: ' + str(pred_softmax[0][0]) + "\nneutral score: " + str(pred_softmax[0][1]) + "\npositive score: " + str(pred_softmax[0][2]))

Prediksi probabilitas Softmax Regression:
negative score: 0.2697511581860198
neutral score: 0.003928818469705477
positive score: 0.7263200233442747


In [44]:
print(mlp.classes_)

['negative' 'neutral' 'positive']


In [45]:
print("Prediksi probabilitas MLP:")
pred_mlp = mlp.predict_proba(vectorizer.transform([preprocess('makanan di sini enak sekali. saya suka!')]))
print('negative score: ' + str(pred_mlp[0][0]) + "\nneutral score: " + str(pred_mlp[0][1]) + "\npositive score: " + str(pred_mlp[0][2]))

Prediksi probabilitas MLP:
negative score: 0.2517357789037664
neutral score: 0.007349268228020351
positive score: 0.7409149528682132


In [46]:
print(xgb.classes_)

[0 1 2]


In [47]:
print("Prediksi probabilitas XGBoost:")
pred_xgb = xgb.predict_proba(vectorizer.transform([preprocess('makanan di sini enak sekali. saya suka!')]))
print('negative score: ' + str(pred_xgb[0][0]) + "\nneutral score: " + str(pred_xgb[0][1]) + "\npositive score: " + str(pred_xgb[0][2]))

Prediksi probabilitas XGBoost:
negative score: 0.98493284
neutral score: 0.014920079
positive score: 0.00014713347
