## Sentiment Analysis Pada Ulasan Aplikasi Tiket.com di Playstore

## Proyek ini bertujuan untuk mengidentifikasi dan mengevaluasi ulasan yang terdapat pada kolom ulasan playstore aplikasi **Tiket.com**. Hasil analisis pada proyek ini bisa menjadi alat yang sangat berguna baik dalam memahami pandangan pelanggan, tanggapan publik terhadap layanan pada aplikasi.

# Import Library

In [None]:
# drive mount
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Gathering

In [None]:
# read csv reviews data
df =  pd.read_csv('/content/drive/MyDrive/DBS/nlpdataset/tiket_reviews.csv')
df.head()

Unnamed: 0,userName,score,review,date
0,Dedy Sanjaya,5,good,2025-05-01 02:34:56
1,dobrek 1140,1,cie viral,2025-05-01 00:03:40
2,Hadi risman,1,ko pas mau pesan tiket kereta gangguan terus ya?,2025-04-30 23:15:18
3,Risky Setiawan,1,"mengecewakan pelayanannya, order tidak sesuai ...",2025-04-30 08:16:15
4,Flansia Johanes,5,Good service,2025-04-30 08:03:07


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   userName  10000 non-null  object
 1   score     10000 non-null  int64 
 2   review    10000 non-null  object
 3   date      10000 non-null  object
dtypes: int64(1), object(3)
memory usage: 312.6+ KB


# Data Preprocessing

Labelling

In [None]:
def label_sentiment(score):
    if score <= 2:
        return "negatif"
    elif score == 3:
        return "netral"
    else:
        return "positif"

df['label'] = df['score'].apply(label_sentiment)
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
positif,5350
negatif,4550
netral,100


Cleaning

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # hapus URL
    text = re.sub(r'\@w+|\#','', text)  # hapus mention & hashtag
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)  # hapus tanda baca
    text = re.sub(r'\s+', ' ', text).strip()  # hapus spasi ganda
    text = ' '.join([word for word in text.split() if word not in stop_words])  # hapus stopword
    return text

df['clean_review'] = df['review'].astype(str).apply(clean_text)
df[['review', 'clean_review', 'label']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,clean_review,label
0,good,good,positif
1,cie viral,cie viral,negatif
2,ko pas mau pesan tiket kereta gangguan terus ya?,ko pas pesan tiket kereta gangguan ya,negatif
3,"mengecewakan pelayanannya, order tidak sesuai ...",mengecewakan pelayanannya order sesuai fasilit...,negatif
4,Good service,good service,positif


Feature Extraction

TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = df['clean_review']
y = df['label']

vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, stratify=y, random_state=42)

# Scheme 1 (SVM & TF-IDF)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

model_svm = LinearSVC()
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

Accuracy: 0.996
              precision    recall  f1-score   support

     negatif       1.00      0.99      1.00       910
      netral       1.00      1.00      1.00        20
     positif       0.99      1.00      1.00      1070

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



Model terlalu overfit. Hal ini kemungkinan akibat jumlah data imbalanced antar kelas, terutama pada kelas 'netral'. Langkah selanjutnya adalah menggunakan sampling untuk melakukan training model dan sisa data yang ada hanya akan digunakan untuk testing

In [None]:
# Cetak persentase akurasi
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_svm) * 100))

Accuracy: 99.60%


Retrain with Stratified Undersampling

In [None]:
from sklearn.utils import resample

# Buat data seimbang untuk training (100 sampel per kelas)
df_pos = df[df['label'] == 'positif']
df_neg = df[df['label'] == 'negatif']
df_net = df[df['label'] == 'netral']

df_sampled = pd.concat([
    resample(df_pos, replace=False, n_samples=100, random_state=42),
    resample(df_neg, replace=False, n_samples=100, random_state=42),
    df_net  # hanya 100 data, biarkan asli
])

In [None]:
# Shuffle dataset training
df_train_balanced = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Dataset testing dari dataset asli
df_test_rest = df.drop(df_train_balanced.index)

In [None]:
# TF-IDF vectorizer (fit hanya dari data training)
vectorizer_balanced = TfidfVectorizer(max_features=5000)
X_train_bal = vectorizer_balanced.fit_transform(df_train_balanced['clean_review'])
y_train_bal = df_train_balanced['label']

In [None]:
# Transform testing data
X_test_rest = vectorizer_balanced.transform(df_test_rest['clean_review'])
y_test_rest = df_test_rest['label']

In [None]:
# Train SVM dengan data seimbang (kecil)
svm_balanced = LinearSVC(random_state=42)
svm_balanced.fit(X_train_bal, y_train_bal)
y_pred_balanced = svm_balanced.predict(X_test_rest)

In [None]:
# Evaluasi terhadap data asli
accuracy_balanced = accuracy_score(y_test_rest, y_pred_balanced)
report_balanced = classification_report(y_test_rest, y_pred_balanced)

In [None]:
# Cetak akurasi dan report
print("Accuracy (balanced):", accuracy_balanced)
print(report_balanced)

Accuracy (balanced): 0.9202061855670103
              precision    recall  f1-score   support

     negatif       0.86      0.98      0.92      4425
      netral       1.00      1.00      1.00        97
     positif       0.98      0.87      0.92      5178

    accuracy                           0.92      9700
   macro avg       0.95      0.95      0.95      9700
weighted avg       0.93      0.92      0.92      9700



In [None]:
# Cetak persentase akurasi
print("Accuracy (balanced): {:.2f}%".format(accuracy_balanced * 100))

Accuracy (balanced): 92.02%


Akurasi sudah cukup bagus, namun turun drastis. Hal ini diakibatkan jumlah data yang sedikit yang digunakan untuk training. Maka perlu ditingkatkan kembali dengan cara menambahkan jumlah data sampling untuk training, membangkitkan data sintetis pada kelas dengan jumlah data yang sedikit yakni 'netral', serta melakukan pembobotan yang seimbang antar kelas

Class Rebalancing and Feature Enrichment with Cost-Sensitive SVM

In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
df_pos_200 = resample(df[df['label'] == 'positif'], replace=False, n_samples=200, random_state=42)
df_neg_200 = resample(df[df['label'] == 'negatif'], replace=False, n_samples=200, random_state=42)
df_net_100 = resample(df[df['label'] == 'netral'], replace=False, n_samples=100, random_state=42)

In [None]:
df_sampled_partial = pd.concat([df_pos_200, df_neg_200, df_net_100]).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
vectorizer_smote = TfidfVectorizer(max_features=7000, ngram_range=(1, 2))
X_partial = vectorizer_smote.fit_transform(df_sampled_partial['clean_review'])
y_partial = df_sampled_partial['label']

In [None]:
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_partial, y_partial)

In [None]:
used_idx = df_sampled_partial.index
df_test_smote = df.drop(used_idx)

X_test_smote = vectorizer_smote.transform(df_test_smote['clean_review'])
y_test_smote = df_test_smote['label']

In [None]:
svm_smote = LinearSVC(C=1.0, class_weight='balanced', max_iter=1000, random_state=42)
svm_smote.fit(X_resampled, y_resampled)
y_pred_smote = svm_smote.predict(X_test_smote)

In [None]:
accuracy_smote = accuracy_score(y_test_smote, y_pred_smote)
report_smote = classification_report(y_test_smote, y_pred_smote)

In [None]:
print("Accuracy (SMOTE):", accuracy_smote)
print(report_smote)

Accuracy (SMOTE): 0.9650526315789474
              precision    recall  f1-score   support

     negatif       0.93      1.00      0.96      4334
      netral       1.00      1.00      1.00        95
     positif       1.00      0.93      0.97      5071

    accuracy                           0.97      9500
   macro avg       0.98      0.98      0.98      9500
weighted avg       0.97      0.97      0.97      9500



In [None]:
print("Accuracy (balanced): {:.2f}%".format(accuracy_smote * 100))

Accuracy (balanced): 96.51%


# Scheme 2 (Logistik Regression & TF-IDF)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
df_sampled_lr = pd.concat([
    resample(df_pos, replace=False, n_samples=100, random_state=42),
    resample(df_neg, replace=False, n_samples=100, random_state=42),
    df_net  # 100 data netral
])


df_sampled_lr = df_sampled_lr.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# TF-IDF dari data training (300 data)
vectorizer_lr = TfidfVectorizer(max_features=5000)
X_train_lr = vectorizer_lr.fit_transform(df_sampled_lr['clean_review'])
y_train_lr = df_sampled_lr['label']

In [None]:
# Gunakan sisa data sebagai testing
df_test_lr = df.drop(df_sampled_lr.index)
X_test_lr = vectorizer_lr.transform(df_test_lr['clean_review'])
y_test_lr = df_test_lr['label']

In [None]:
# Latih Logistic Regression
lr_model = LogisticRegression(max_iter=200, random_state=42)
lr_model.fit(X_train_lr, y_train_lr)
y_pred_lr = lr_model.predict(X_test_lr)

In [None]:
# Evaluasi
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)
report_lr = classification_report(y_test_lr, y_pred_lr)

In [None]:
print("Accuracy (Logistic Regression):", accuracy_lr)
print(report_lr)

Accuracy (Logistic Regression): 0.9152577319587629
              precision    recall  f1-score   support

     negatif       0.86      0.98      0.91      4425
      netral       1.00      1.00      1.00        97
     positif       0.98      0.86      0.92      5178

    accuracy                           0.92      9700
   macro avg       0.95      0.95      0.94      9700
weighted avg       0.92      0.92      0.92      9700



In [None]:
print("Accuracy (balanced): {:.2f}%".format(accuracy_lr * 100))

Accuracy (balanced): 91.53%


# Scheme 3 (BiLSTM)

In [None]:
!pip install tensorflow



In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

In [None]:
df_sampled_dl = df_sampled_lr.copy()

In [None]:
df_test_dl = df.drop(df_sampled_dl.index)

In [None]:
# Tokenisasi dan padding
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df_sampled_dl['clean_review'])

In [None]:
X_train_seq = tokenizer.texts_to_sequences(df_sampled_dl['clean_review'])
X_train_pad = pad_sequences(X_train_seq, maxlen=100)

In [None]:
X_test_seq = tokenizer.texts_to_sequences(df_test_dl['clean_review'])
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

In [None]:
le_dl = LabelEncoder()
y_train_enc = le_dl.fit_transform(df_sampled_dl['label'])
y_test_enc = le_dl.transform(df_test_dl['label'])

In [None]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
history = model.fit(X_train_pad, y_train_enc, epochs=10, batch_size=16, validation_split=0.2, verbose=0)

In [None]:
loss, acc = model.evaluate(X_test_pad, y_test_enc, verbose=0)
acc

0.864845335483551

In [None]:
print("Accuracy: {:.2f}%".format(acc * 100))

Accuracy: 86.48%


# Scheme 4 (Random Forest & TF-IDF)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
df_sampled_rf = df_sampled_lr.copy()  # 300 data seimbang
df_test_rf = df.drop(df_sampled_rf.index)

In [None]:
vectorizer_rf = TfidfVectorizer(max_features=5000)
X_train_rf = vectorizer_rf.fit_transform(df_sampled_rf['clean_review'])
y_train_rf = df_sampled_rf['label']

In [None]:
X_test_rf = vectorizer_rf.transform(df_test_rf['clean_review'])
y_test_rf = df_test_rf['label']

In [None]:
rf_sampled_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_sampled_model.fit(X_train_rf, y_train_rf)
y_pred_rf_sampled = rf_sampled_model.predict(X_test_rf)

In [None]:
accuracy_rf_sampled = accuracy_score(y_test_rf, y_pred_rf_sampled)
report_rf_sampled = classification_report(y_test_rf, y_pred_rf_sampled)

In [None]:
print("Accuracy (Random Forest):", accuracy_rf_sampled)
print(report_rf_sampled)

Accuracy (Random Forest): 0.8951546391752577
              precision    recall  f1-score   support

     negatif       0.94      0.82      0.88      4425
      netral       1.00      1.00      1.00        97
     positif       0.86      0.95      0.91      5178

    accuracy                           0.90      9700
   macro avg       0.93      0.93      0.93      9700
weighted avg       0.90      0.90      0.89      9700



In [None]:
print("Accuracy (balanced): {:.2f}%".format(accuracy_rf_sampled * 100))

Accuracy (balanced): 89.52%


# Inference

In [None]:
stop_words_id = set([
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk', 'dengan', 'pada',
    'adalah', 'karena', 'jika', 'maka', 'tetapi', 'sebagai', 'dalam', 'juga',
    'tidak', 'bukan', 'sudah', 'belum', 'sangat', 'akan', 'lebih', 'kurang'
])

def predict_sentiment_svm(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([word for word in text.split() if word not in stop_words_id])
    vector = vectorizer_smote.transform([text])
    prediction = svm_smote.predict(vector)
    return prediction[0]

In [None]:
predict_sentiment_svm("Saya sangat kecewa dengan layanan customer service-nya yang lambat.")

'negatif'

In [None]:
predict_sentiment_svm("saya suka aplikasi ini, tidak ada error saat pemesanan")

'positif'