In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Baseline emotion classifier: TF-IDF features fed into a linear-kernel SVM,
# trained on the train split and scored on the test split.

# 1. Load the latest (V4) cleaned splits and drop rows whose cleaned caption
#    is missing, since the vectorizer cannot consume NaN values.
train_ready = pd.read_csv('dataset_final/train_final.csv')
train_ready = train_ready.dropna(subset=['caption_cleaned'])
test_ready = pd.read_csv('dataset_final/test_final.csv')
test_ready = test_ready.dropna(subset=['caption_cleaned'])

# Target labels, taken after the NaN filter so they stay aligned with the
# caption rows that are vectorized below.
y_train = train_ready['emotion']
y_test = test_ready['emotion']

# 2. TF-IDF vectorization.
#    Unigrams and bigrams are used so that two-word phrases such as
#    "terima kasih" are captured as single features.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
# The vocabulary/IDF statistics are learned from the training captions only;
# the test captions are merely projected onto that fitted vocabulary.
X_train = tfidf.fit_transform(train_ready['caption_cleaned'])
X_test = tfidf.transform(test_ready['caption_cleaned'])
feature_names = tfidf.get_feature_names_out()

# 3. Baseline model: SVM.
#    A linear kernel is a stable choice for TF-IDF text representations;
#    class_weight='balanced' reweights classes by inverse frequency.
svm_model = SVC(kernel='linear', class_weight='balanced').fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# 4. Evaluate: print per-class precision / recall / F1 on the test split.
print("\n--- HASIL EVALUASI BASELINE TF-IDF (V4) ---")
print(classification_report(y_true=y_test, y_pred=y_pred))


--- HASIL EVALUASI BASELINE TF-IDF (V4) ---
              precision    recall  f1-score   support

       Anger       0.41      0.42      0.41        53
Anticipation       0.60      0.76      0.67       244
     Disgust       0.31      0.16      0.21        32
        Fear       0.49      0.39      0.44        92
         Joy       0.64      0.57      0.60        69
     Sadness       0.44      0.39      0.42        61
    Surprise       0.50      0.39      0.44        61
       Trust       0.48      0.47      0.47       128

    accuracy                           0.53       740
   macro avg       0.48      0.44      0.46       740
weighted avg       0.52      0.53      0.52       740

