In [26]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Ganti path ke lokasi file Anda
# Gunakan file dan kolom yang sesuai dengan komentar_vo3 _labeling.csv
file_path = 'komentar_vo3 _labeling.csv'
df = pd.read_csv(file_path)

# Pastikan kolom yang diperlukan ada
print(df.columns)
# Gunakan kolom 'comment', 'likes', 'sentimen'
df = df[['comment', 'likes', 'sentimen']]

Index(['username', 'comment', 'time', 'likes', 'sentimen', 'Unnamed: 5'], dtype='object')


In [28]:
# Preprocessing
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

def preprocessing(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+|\#", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    text = ' '.join(tokens)
    return stemmer.stem(text)

df['clean_text'] = df['comment'].apply(preprocessing)

# Save preprocessed data
df.to_csv('preprocessed_comments.csv', index=False)
print("Preprocessed data saved to preprocessed_comments.csv")

Preprocessed data saved to preprocessed_comments.csv


In [29]:
# TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(df['clean_text'])

# Save TF-IDF matrix
tfidf_df = pd.DataFrame(X_text.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df.to_csv('tfidf_matrix.csv', index=False)
print("TF-IDF matrix saved to tfidf_matrix.csv")

# Gabungkan dengan fitur numerik (likes)
X_extra = df[['likes']].fillna(0).values
from scipy.sparse import hstack
X = hstack([X_text, X_extra])

# Save combined features (TF-IDF + likes)
combined_df = tfidf_df.copy()
combined_df['likes'] = X_extra
combined_df.to_csv('tfidf_plus_likes.csv', index=False)
print("TF-IDF + likes features saved to tfidf_plus_likes.csv")

# Label: gunakan sentimen (1=positif, 0=netral, -1=negatif)
y = df['sentimen']

TF-IDF matrix saved to tfidf_matrix.csv
TF-IDF + likes features saved to tfidf_plus_likes.csv
TF-IDF + likes features saved to tfidf_plus_likes.csv


In [30]:
# Model SVM OVO
from sklearn.utils import check_X_y

# Pastikan tidak ada NaN di X dan y
if hasattr(X, "toarray"):
    X_dense = X.toarray()
else:
    X_dense = X
X_dense = np.nan_to_num(X_dense)
y = pd.to_numeric(y, errors='coerce').fillna(0).astype(int)

# Pastikan jumlah sample sama
X_dense, y = check_X_y(X_dense, y)

svm = SVC(decision_function_shape='ovo')

# Grid search (tuning)
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf']
}

# Jika data sangat sedikit, gunakan cv=2
cv_val = min(4, len(y)) if len(y) >= 2 else 2

grid = GridSearchCV(svm, param_grid, cv=cv_val, scoring='accuracy', n_jobs=-1)
grid.fit(X_dense, y)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

Best params: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Best score: 0.49722222222222223


In [31]:
# Evaluasi dengan cross_val_predict
best_model = grid.best_estimator_
y_pred = cross_val_predict(best_model, X, y, cv=4)

# Save predictions
results_df = df.copy()
results_df['predicted_sentimen'] = y_pred
results_df.to_csv('sentiment_predictions.csv', index=False)
print("Predictions saved to sentiment_predictions.csv")

print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))

Predictions saved to sentiment_predictions.csv
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       132
           0       0.50      0.99      0.66       356
           1       0.60      0.01      0.03       230

    accuracy                           0.50       718
   macro avg       0.37      0.34      0.23       718
weighted avg       0.44      0.50      0.34       718

Confusion Matrix:
[[  0 132   0]
 [  0 354   2]
 [  0 227   3]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
