In [4]:
import time
import re
import csv
import ast
import string
import swifter
import requests
import datetime
from tqdm import tqdm
from io import StringIO
from google_play_scraper import Sort, reviews
import pickle

import numpy as np
import pandas as pd
import plotly.graph_objects as go

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tsaqif\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tsaqif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Prediksi Langsung

In [14]:
def classify_text():
    # Mapping angka ke label teks
    replace_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    
    # Ambil input teks dari user
    input_text = input("Masukkan teks yang ingin diklasifikasi: ")
    input_list = [input_text]

    # ==============================================================================
    # 1. BiLSTM Model
    # ==============================================================================
    with open('../models/tokenizer_bilstm_model_embedding.pkl', 'rb') as f:
        tokenizer = pickle.load(f)

    bilstm_model = load_model('../models/bilstm_model_embedding.keras')

    sequences = tokenizer.texts_to_sequences(input_list)
    padded_sequences = pad_sequences(sequences, maxlen=100)

    bilstm_preds = bilstm_model.predict(padded_sequences)
    bilstm_label = np.argmax(bilstm_preds, axis=1)[0]
    bilstm_label_str = replace_map[bilstm_label]

    # ==============================================================================
    # 2. Logistic Regression (CountVectorizer)
    # ==============================================================================
    with open('../models/best_pipeline_logreg_countvectorizer.pkl', 'rb') as f:
        count_model = pickle.load(f)

    count_pred = count_model.predict(input_list)[0]
    count_label_str = replace_map[count_pred]

    # ==============================================================================
    # 3. Logistic Regression (TF-IDF)
    # ==============================================================================
    with open('../models/best_pipeline_logreg_tfidf.pkl', 'rb') as f:
        tfidf_model = pickle.load(f)

    tfidf_pred = tfidf_model.predict(input_list)[0]
    tfidf_label_str = replace_map[tfidf_pred]

    # ==============================================================================
    # 4. Soft Voting Ensemble (TF-IDF)
    # ==============================================================================
    with open('../models/tfidf_vectorizer_ensamble.pkl', 'rb') as f:
        ensemble_vectorizer = pickle.load(f)

    with open('../models/soft_voting_ensamble_model_tfidf.pkl', 'rb') as f:
        ensemble_model = pickle.load(f)

    X_ensemble = ensemble_vectorizer.transform(input_list)
    ensemble_pred = ensemble_model.predict(X_ensemble)[0]
    ensemble_label_str = replace_map[ensemble_pred]

    # Cetak hasil prediksi dari semua model
    print("\nHasil Klasifikasi:")
    print(f"1. BiLSTM Model: {bilstm_label} ({bilstm_label_str})")
    print(f"2. Logistic Regression (CountVectorizer): {count_pred} ({count_label_str})")
    print(f"3. Logistic Regression (TF-IDF): {tfidf_pred} ({tfidf_label_str})")
    print(f"4. Soft Voting Ensemble: {ensemble_pred} ({ensemble_label_str})")

In [13]:
classify_text()

Masukkan teks yang ingin diklasifikasi:  Jujur saya kecewa dengan aplikasi ini, sistemnya lambat


[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 621ms/step

Hasil Klasifikasi:
1. BiLSTM Model: 1 (neutral)
2. Logistic Regression (CountVectorizer): 0 (negative)
3. Logistic Regression (TF-IDF): 0 (negative)
4. Soft Voting Ensemble: 0 (negative)


# Scraping

In [70]:
app_ids = ['com.tokopedia.tkpd']
review_data_dict = {}

for app_id in tqdm(app_ids, desc="Memproses Aplikasi"):
    try:
        review_data = []
        continuation_token = None
        matching_review_count = 0
        non_matching_review_count = 0

        # Rentang tanggal yang diinginkan (1-31 Desember 2024)
        end_date = datetime.datetime(2024, 12, 31, 23, 59, 59)
        start_date = datetime.datetime(2024, 12, 1, 0, 0, 0)

        review_bar = tqdm(desc=f'Ulasan untuk {app_id}: ', total=500)
        while True:
            review_result, continuation_token = reviews(
                app_id,
                lang='id',              # Bahasa Indonesia
                country='id',           # Negara Indonesia
                sort=Sort.NEWEST,       # Urutkan dari yang terbaru
                count=100,              # Ambil 100 ulasan per batch
                continuation_token=continuation_token
            )

            filtered_reviews = [review for review in review_result if start_date <= review['at'] <= end_date]
            review_data.extend(filtered_reviews)

            for review in review_result:
                if review['at'] < start_date:
                    non_matching_review_count += 1
                elif start_date <= review['at'] <= end_date:
                    matching_review_count += 1
                    non_matching_review_count = 0
                else:
                    non_matching_review_count = 0

            review_bar.update(1)
            review_bar.set_postfix({
                'Cocok': matching_review_count,
                'Tidak Cocok': non_matching_review_count
            })

            if not continuation_token or non_matching_review_count >= 500:
                break

        review_bar.close()

        kolom_data = ['reviewId', 'userName', 'content', 'score', 'thumbsUpCount', 'at', 'reviewCreatedVersion', 'appVersion']
        review_data_dict[app_id] = pd.DataFrame(review_data, columns=kolom_data)

        time.sleep(5)

    except Exception as e:
        print(f'Error saat mengambil ulasan untuk aplikasi {app_id}: {e}')

Memproses Aplikasi:   0%|          | 0/1 [00:00<?, ?it/s]
Ulasan untuk com.tokopedia.tkpd:   0%|          | 0/500 [00:00<?, ?it/s][A
Ulasan untuk com.tokopedia.tkpd:   0%|          | 1/500 [00:00<01:12,  6.88it/s][A
Ulasan untuk com.tokopedia.tkpd:   0%|          | 1/500 [00:00<01:12,  6.88it/s, Cocok=0, Tidak Cocok=0][A
Ulasan untuk com.tokopedia.tkpd:   0%|          | 2/500 [00:00<01:23,  5.94it/s, Cocok=0, Tidak Cocok=0][A
Ulasan untuk com.tokopedia.tkpd:   0%|          | 2/500 [00:00<01:23,  5.94it/s, Cocok=0, Tidak Cocok=0][A
Ulasan untuk com.tokopedia.tkpd:   1%|          | 3/500 [00:00<01:16,  6.50it/s, Cocok=0, Tidak Cocok=0][A
Ulasan untuk com.tokopedia.tkpd:   1%|          | 3/500 [00:00<01:16,  6.50it/s, Cocok=0, Tidak Cocok=0][A
Ulasan untuk com.tokopedia.tkpd:   1%|          | 4/500 [00:00<01:12,  6.82it/s, Cocok=0, Tidak Cocok=0][A
Ulasan untuk com.tokopedia.tkpd:   1%|          | 4/500 [00:00<01:12,  6.82it/s, Cocok=0, Tidak Cocok=0][A
Ulasan untuk com.tokopedia

In [71]:
df = review_data_dict['com.tokopedia.tkpd'].head()
print(df.shape)
df.head()

(5, 8)


Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,at,reviewCreatedVersion,appVersion
0,3423f5d8-b98e-4cd3-b6dd-c8f1d6f87383,Reza Bolang,Gara aplikasi ini WhatsApp gw gk bsa di buka üò°...,1,0,2024-12-31 23:52:31,3.293.1,3.293.1
1,e01983cf-38c5-4dda-86ef-7cfe75959977,bram bey,Terimakasih.....,5,0,2024-12-31 23:51:20,3.294.1,3.294.1
2,b45c0a9a-3fb2-439b-92d2-bf64e03fa7ce,I‚ÄîYOUNG LEX,sangat cocok,5,0,2024-12-31 23:42:18,3.289.0,3.289.0
3,26529007-ea7e-4e33-ab5e-18e75504465b,Windi Maura,"jujur saya kecewa dg apk ini,bagian alamat tdk...",2,0,2024-12-31 23:17:47,,
4,4c1bef94-b634-48c1-a86b-eb25ce5dbf6a,Atik Sumarti,Sangat kecewa dengan sistem Tokopedia order ba...,2,3,2024-12-31 23:14:02,3.294.1,3.294.1


# Predict

In [73]:
df_selected = df[['content']]
df_selected.rename(columns={'content': 'reprocessed_content'}, inplace=True)
df_selected

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.rename(columns={'content': 'reprocessed_content'}, inplace=True)


Unnamed: 0,reprocessed_content
0,Gara aplikasi ini WhatsApp gw gk bsa di buka üò°...
1,Terimakasih.....
2,sangat cocok
3,"jujur saya kecewa dg apk ini,bagian alamat tdk..."
4,Sangat kecewa dengan sistem Tokopedia order ba...


In [82]:
# ==============================================================================
# 1. BiLSTM Model
# ==============================================================================
with open('../models/tokenizer_bilstm_model_embedding.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

bilstm_model = load_model('../models/bilstm_model_embedding.keras')

sequences = tokenizer.texts_to_sequences(df_selected['reprocessed_content'])
padded_sequences = pad_sequences(sequences, maxlen=100)

bilstm_preds = bilstm_model.predict(padded_sequences)
bilstm_labels = np.argmax(bilstm_preds, axis=1)

# ==============================================================================
# 2. Logistic Regression (CountVectorizer)
# ==============================================================================
with open('../models/best_pipeline_logreg_countvectorizer.pkl', 'rb') as f:
    count_model = pickle.load(f)

count_preds = count_model.predict(df_selected['reprocessed_content'])

# ==============================================================================
# 3. Logistic Regression (TF-IDF)
with open('../models/best_pipeline_logreg_tfidf.pkl', 'rb') as f:
    tfidf_model = pickle.load(f)

tfidf_preds = tfidf_model.predict(df_selected['reprocessed_content'])

# ==============================================================================
# 4. Soft Voting Ensemble (TF-IDF)
# ==============================================================================
with open('../models/tfidf_vectorizer_ensamble.pkl', 'rb') as f:
    ensemble_vectorizer = pickle.load(f)

with open('../models/soft_voting_ensamble_model_tfidf.pkl', 'rb') as f:
    ensemble_model = pickle.load(f)

X_ensemble = ensemble_vectorizer.transform(df_selected['reprocessed_content'])
ensemble_preds = ensemble_model.predict(X_ensemble)

# ==============================================================================
# Tampilkan hasil prediksi
# ==============================================================================
result = df_selected.copy()
result['bilstm_pred'] = bilstm_labels
result['count_logreg_pred'] = count_preds
result['tfidf_logreg_pred'] = tfidf_preds
result['ensemble_pred'] = ensemble_preds

# Mapping nilai
replace_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
cols_to_replace = ['bilstm_pred', 'count_logreg_pred', 'tfidf_logreg_pred', 'ensemble_pred']
result[cols_to_replace] = result[cols_to_replace].replace(replace_map).astype('category')

result

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 383ms/step


Unnamed: 0,reprocessed_content,bilstm_pred,count_logreg_pred,tfidf_logreg_pred,ensemble_pred
0,Gara aplikasi ini WhatsApp gw gk bsa di buka üò°...,neutral,negative,negative,negative
1,Terimakasih.....,neutral,positive,positive,positive
2,sangat cocok,neutral,negative,negative,negative
3,"jujur saya kecewa dg apk ini,bagian alamat tdk...",neutral,negative,negative,negative
4,Sangat kecewa dengan sistem Tokopedia order ba...,neutral,negative,negative,negative


# Interpretasi Hasil Prediksi Sentimen

## Analisis Baris per Baris

### Baris 0  
**Teks:** "Gara aplikasi ini WhatsApp gw gk bsa di buka üò°..."  
- BILSTM memprediksi: **neutral**  
- Model lain memprediksi: **negative**  
- **Interpretasi:** Kalimat ini jelas berisi keluhan/emosi negatif (ada emoji marah üò°), sehingga prediksi **negative** lebih tepat. Model BILSTM kurang sensitif terhadap nada negatif di kalimat ini.

### Baris 1  
**Teks:** "Terimakasih....."  
- BILSTM: **neutral**  
- Model lain: **positive**  
- **Interpretasi:** Kalimat mengandung ucapan terima kasih, yang bernada positif. Prediksi model count_logreg, tfidf_logreg, dan ensemble sudah benar. Prediksi BILSTM cenderung netral.

### Baris 2  
**Teks:** "sangat cocok"  
- Semua model selain BILSTM memprediksi: **negative**  
- BILSTM: **neutral**  
- **Interpretasi:** Kalimat ini jelas bernada positif (frasa ‚Äúsangat cocok‚Äù). Semua model gagal mengenali ini, bahkan memprediksi negatif atau netral. Ini indikasi model perlu perbaikan khusus pada data positif pendek.

### Baris 3  
**Teks:** "jujur saya kecewa dg apk ini,bagian alamat tdk..."  
- Semua model selain BILSTM memprediksi: **negative**  
- BILSTM: **neutral**  
- **Interpretasi:** Kalimat berisi kata ‚Äúkecewa‚Äù, jelas negatif.

### Baris 4  
**Teks:** "Sangat kecewa dengan sistem Tokopedia order ba..."  
- Semua model selain BILSTM memprediksi: **negative**  
- BILSTM: **neutral**  
- **Interpretasi:** Sama seperti baris 3, kalimat jelas negatif.

---

## Kesimpulan

- Model BILSTM cenderung memprediksi label **neutral** untuk semua kalimat yang ada, kurang sensitif terhadap sentimen positif maupun negatif di contoh ini.
- Model `count_logreg`, `tfidf_logreg`, dan `ensemble` lebih sering memprediksi dengan benar, kecuali kalimat sangat pendek dan positif (‚Äúsangat cocok‚Äù) yang mereka salah prediksi sebagai negatif.
- Ada kecenderungan model klasik dan ensemble salah mendeteksi kalimat positif pendek sebagai negatif, menandakan perlunya data pelatihan tambahan atau tuning.

---

## Rekomendasi Perbaikan

- Tambahkan contoh kalimat pendek dan bernada positif ke dataset pelatihan.
- Evaluasi ulang model BILSTM agar tidak terlalu bias ke kelas netral.
- Gunakan teknik augmentasi data untuk memperbaiki kesimbangan kelas.
- Gunakan metrik evaluasi lengkap (precision, recall, F1-score per kelas) untuk analisis performa yang lebih mendalam.

---