In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load data hasil preprocessing & pelabelan
df = pd.read_csv("whatsapp_preprocessed_labeled.csv")

# Hapus baris yang memiliki nilai NaN di kolom cleaned atau sentiment
df = df.dropna(subset=['cleaned', 'sentiment'])

# Reset index setelah drop
df = df.reset_index(drop=True)

# Pisahkan X dan y
X_text = df['cleaned'].astype(str)  # pastikan bentuknya string
y = df['sentiment']

# TF-IDF Vectorizer
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_text)

# CountVectorizer (jika dibutuhkan)
count = CountVectorizer()
X_count = count.fit_transform(X_text)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [4]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

print("SVM + TF-IDF")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))

SVM + TF-IDF
Accuracy: 0.9339095068632435
              precision    recall  f1-score   support

     negatif       1.00      0.27      0.43        70
      netral       0.90      1.00      0.95      1200
     positif       1.00      0.89      0.94       697

    accuracy                           0.93      1967
   macro avg       0.97      0.72      0.77      1967
weighted avg       0.94      0.93      0.93      1967

[[  19   51    0]
 [   0 1200    0]
 [   0   79  618]]
