In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'pandas'

In [2]:
# Download the NLTK packages 'punkt', 'punkt_tab' and 'stopwords';
# word_tokenize and stopwords.words are used further down in this notebook.
for resource_name in ('punkt', 'punkt_tab'):
    nltk.download(resource_name)
# Kept as the final expression of the cell so its return value is still shown.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# 1. Load the review dataset from a CSV file given by a relative path.
# Later cells read the 'rating' and 'text' columns of this frame.
df = pd.read_csv('playstore_reviews.csv')

In [18]:
# Work on a reproducible random sample of at most 10,000 reviews to keep the
# later steps fast. The requested size is clamped to the number of available
# rows because DataFrame.sample raises ValueError when n is larger than the
# frame (sampling is without replacement by default).
sample_size = min(10000, len(df))
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
print(f"Jumlah data setelah sampling: {len(df)}")

Jumlah data setelah sampling: 10000


In [19]:
# 2. Label the data from the star rating
def label_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Args:
        rating: A number that supports comparison with integers.

    Returns:
        'positive' when rating >= 4, 'neutral' when rating == 3, and
        'negative' for every other value. Note that "every other value"
        includes ratings below 3, values strictly between 3 and 4, and NaN
        (both comparisons are False for NaN).
    """
    if rating >= 4:
        return 'positive'
    if rating == 3:
        return 'neutral'
    return 'negative'

# Derive the sentiment label of every row from its rating
sentiment_labels = df['rating'].apply(label_sentiment)
df['sentiment'] = sentiment_labels

# Check the sentiment distribution (number of rows per label)
sentiment_counts = df['sentiment'].value_counts()
print("\nDistribusi Sentimen:")
print(sentiment_counts)


Distribusi Sentimen:
sentiment
positive    7978
negative    1672
neutral      350
Name: count, dtype: int64


In [20]:
# 3. Preprocessing and feature extraction
# NLTK's Indonesian stop-word list, wrapped in a set so the membership test
# performed for every token in preprocess_text is a set lookup.
stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    """Normalise one review into a space-separated string of kept tokens.

    The text is lower-cased, tokenised with NLTK's word_tokenize, and reduced
    to tokens that are purely alphabetic and not present in stop_words.

    Args:
        text: The raw review. Non-string values are converted with str().

    Returns:
        The kept tokens joined by single spaces. An empty string is returned
        when the value is missing (None / NaN / pd.NA) or when no token
        survives the filter.
    """
    # A missing review would otherwise be stringified by str() (NaN -> 'nan',
    # None -> 'none'); such a token is alphabetic, so it can survive the
    # filter below and end up in the vocabulary as if it were a real word.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    tokens = word_tokenize(str(text).lower())
    return ' '.join(t for t in tokens if t.isalpha() and t not in stop_words)

# Run the cleaning function over every review. The result goes into a new
# column, so the original 'text' column is left untouched.
df['processed_text'] = df['text'].apply(preprocess_text)

In [21]:
# Feature extraction with TF-IDF, with the vocabulary capped at 5000 terms.
# fit_transform returns a SciPy sparse matrix and it is kept sparse on
# purpose: expanding it with .toarray() would allocate one dense float per
# (review, term) pair, almost all of them zero. train_test_split, SVC and
# RandomForestClassifier accept sparse input, and an SVC fitted on sparse
# data still accepts dense arrays at predict time.
# NOTE(review): the vectorizer is fitted on all rows before any train/test
# split happens, so the IDF weights are computed with the test reviews
# included - consider fitting it on the training portion only.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['processed_text'])
y = df['sentiment']

In [22]:
# 4. Three training schemes. Each entry holds a display name, an unfitted
# estimator, and the fraction of the data used for training.
experiments = [
    dict(
        name='SVM + TF-IDF + 80/20',
        model=SVC(kernel='linear'),
        train_size=0.8,
    ),
    dict(
        name='RandomForest + TF-IDF + 80/20',
        model=RandomForestClassifier(n_estimators=100, random_state=42),
        train_size=0.8,
    ),
    dict(
        name='SVM + TF-IDF + 70/30',
        model=SVC(kernel='linear'),
        train_size=0.7,
    ),
]

In [24]:
# 5. Main experiment: split, fit and score every configured scheme
results = []
for exp in experiments:
    print(f"\nEksperimen: {exp['name']}")

    # Stratified split on y with a fixed seed, sized by the experiment entry
    split_parts = train_test_split(
        X, y, train_size=exp['train_size'], random_state=42, stratify=y
    )
    X_train, X_test, y_train, y_test = split_parts

    # Train the estimator stored in the experiment entry (fitted in place)
    model = exp['model']
    model.fit(X_train, y_train)

    # Evaluate on both the training and the held-out portion
    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_report = classification_report(y_test, y_pred_test)

    print(f"Akurasi Training: {train_accuracy:.4f}")
    print(f"Akurasi Testing: {test_accuracy:.4f}")
    print("Laporan Klasifikasi (Testing):")
    print(test_report)

    # Keep one summary row per experiment for the final comparison table
    summary_row = {
        'name': exp['name'],
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy
    }
    results.append(summary_row)


Eksperimen: SVM + TF-IDF + 80/20
Akurasi Training: 0.9244
Akurasi Testing: 0.8790
Laporan Klasifikasi (Testing):
              precision    recall  f1-score   support

    negative       0.72      0.64      0.68       334
     neutral       0.00      0.00      0.00        70
    positive       0.91      0.97      0.94      1596

    accuracy                           0.88      2000
   macro avg       0.54      0.54      0.54      2000
weighted avg       0.84      0.88      0.86      2000


Eksperimen: RandomForest + TF-IDF + 80/20
Akurasi Training: 0.9869
Akurasi Testing: 0.8680
Laporan Klasifikasi (Testing):
              precision    recall  f1-score   support

    negative       0.70      0.62      0.66       334
     neutral       0.00      0.00      0.00        70
    positive       0.90      0.96      0.93      1596

    accuracy                           0.87      2000
   macro avg       0.53      0.53      0.53      2000
weighted avg       0.83      0.87      0.85      2000




In [25]:
# 6. Inference: hand-written example reviews to classify
sample_texts = [
    "Aplikasi ini sangat membantu, pengiriman cepat!",
    "Fitur lumayan, tapi kadang error.",
    "Sangat buruk, aplikasi sering crash.",
]
print("\nContoh Inference:")


Contoh Inference:


In [26]:
# Clean the sample reviews with the same preprocessing function, then project
# them through the already-fitted vectorizer (transform only, no refit).
sample_processed = list(map(preprocess_text, sample_texts))
sample_tfidf = tfidf.transform(sample_processed).toarray()

In [27]:
# Predict with the best-scoring model. The winner is looked up from the
# recorded test accuracies instead of being hard-coded to the first entry, so
# the choice stays right if the scores change. `results` is appended to in the
# same order as `experiments`, so positions line up; max() keeps the earliest
# entry on ties, and index 0 is used when no results were recorded.
best_index = max(
    range(len(results)),
    key=lambda i: results[i]['test_accuracy'],
    default=0,
)
best_model = experiments[best_index]['model']
best_model.fit(X, y)  # Retrain on the full dataset for inference
predictions = best_model.predict(sample_tfidf)

In [28]:
# Show each sample review next to its predicted label
for pair in zip(sample_texts, predictions):
    text, pred = pair
    print(f"Teks: {text}")
    print(f"Sentimen: {pred}\n")

Teks: Aplikasi ini sangat membantu, pengiriman cepat!
Sentimen: positive

Teks: Fitur lumayan, tapi kadang error.
Sentimen: positive

Teks: Sangat buruk, aplikasi sering crash.
Sentimen: negative



In [29]:
# 7. Collect the per-experiment scores into a table and print it
print("\nRingkasan Hasil Eksperimen:")
results_df = pd.DataFrame(results)
print(results_df)


Ringkasan Hasil Eksperimen:
                            name  train_accuracy  test_accuracy
0           SVM + TF-IDF + 80/20        0.924375       0.879000
1  RandomForest + TF-IDF + 80/20        0.986875       0.868000
2           SVM + TF-IDF + 70/30        0.924571       0.867333


In [24]:
# 8. Write the processed dataset to disk (optional)
export_columns = ['text', 'rating', 'sentiment', 'processed_text']
df[export_columns].to_csv('processed_reviews.csv', index=False)