1. Install dan Import Library

In [1]:
!pip install nltk Sastrawi pandas scikit-learn

import pandas as pd
import numpy as np
import re, string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip




2. Load Dataset Hasil Scraping

In [2]:
df = pd.read_csv("whatsapp_reviews.csv")
print("Jumlah data:", len(df))
df = df[['content']] 

Jumlah data: 10000


In [3]:
df.head()

Unnamed: 0,content
0,Good
1,sekarang fitur foto selfi WA jadi mirror atau ...
2,"Aplikasinya lumayan bagus,namun untuk status d..."
3,aplikasi ini bagus
4,intinya bagus


In [4]:
df.columns

Index(['content'], dtype='object')

3. Preprocessing Teks

In [5]:
def clean_text(text):
    # Hapus emoji
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Lowercase
    text = text.lower()
    # Hapus URL
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Hapus angka
    text = re.sub(r'\d+', '', text)
    # Hapus tanda baca
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned'] = df['content'].astype(str).apply(clean_text)

In [6]:
df[['content', 'cleaned']].head()

Unnamed: 0,content,cleaned
0,Good,good
1,sekarang fitur foto selfi WA jadi mirror atau ...,sekarang fitur foto selfi wa jadi mirror atau ...
2,"Aplikasinya lumayan bagus,namun untuk status d...",aplikasinya lumayan bagusnamun untuk status da...
3,aplikasi ini bagus,aplikasi ini bagus
4,intinya bagus,intinya bagus


4. Labeling Manual Berdasarkan Keyword

In [7]:
positive_keywords = ['bagus', 'baik', 'mantap', 'suka', 'puas', 'keren', 'cepat', 'mudah', 'memuaskan']
negative_keywords = ['jelek', 'buruk', 'parah', 'tidak suka', 'kecewa', 'lemot', 'error', 'macet', 'lambat']

def label_sentiment(text):
    for word in positive_keywords:
        if word in text:
            return 'positif'
    for word in negative_keywords:
        if word in text:
            return 'negatif'
    return 'netral'

df['sentiment'] = df['cleaned'].apply(label_sentiment)

Menyimpan hasil preprocessing

In [8]:
df.to_csv("whatsapp_preprocessed_labeled.csv", index=False)

5. Persiapan Dataset untuk Model

In [9]:
df = df.dropna(subset=['cleaned', 'sentiment']).reset_index(drop=True)
X_text = df['cleaned'].astype(str)
y = df['sentiment']

TF-IDF Vectorizer

In [10]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_text)

6. Split Data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

7. Training dan Evaluasi Model

In [12]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

print("=== SVM + TF-IDF ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

=== SVM + TF-IDF ===
Accuracy: 0.945
Classification Report:
               precision    recall  f1-score   support

     negatif       0.96      0.42      0.58        55
      netral       0.91      1.00      0.96      1157
     positif       1.00      0.90      0.95       788

    accuracy                           0.94      2000
   macro avg       0.96      0.77      0.83      2000
weighted avg       0.95      0.94      0.94      2000

Confusion Matrix:
 [[  23   32    0]
 [   0 1157    0]
 [   1   77  710]]
