In [8]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Ganti path ke lokasi file Anda
# Gunakan file dan kolom yang sesuai dengan komentar_vo3 _labeling.csv
file_path = 'komentar_vo3 _labeling.csv'
df = pd.read_csv(file_path)

# Pastikan kolom yang diperlukan ada
print(df.columns)
# Gunakan kolom 'comment', 'likes', 'sentimen'
df = df[['comment', 'likes', 'sentimen']]

Index(['username', 'comment', 'time', 'likes', 'sentimen', 'Unnamed: 5'], dtype='object')


In [10]:
# Preprocessing & Data Augmentation to 10,000 samples
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

# Small synonym dictionary for Indonesian
synonym_dict = {
    'bagus': ['baik', 'hebat'],
    'buruk': ['jelek', 'tidak baik'],
    'cepat': ['lekas', 'segera'],
    'lambat': ['pelan', 'lelet'],
    'senang': ['bahagia', 'gembira'],
    'sedih': ['duka', 'murung'],
    'keren': ['hebat', 'mantap'],
    'jelek': ['buruk', 'tidak bagus'],
    'suka': ['gemar', 'senang'],
    'tidak': ['nggak', 'tak'],
    'bagusnya': ['baiknya', 'hebatnya'],
    'mantap': ['keren', 'hebat'],
    'hebat': ['keren', 'mantap'],
    'parah': ['buruk', 'jelek'],
    'menarik': ['seru', 'asik'],
    'asik': ['seru', 'menarik'],
    'seru': ['asik', 'menarik'],
}

import random

def synonym_replacement(tokens):
    new_tokens = tokens.copy()
    for i, t in enumerate(new_tokens):
        if t in synonym_dict and random.random() < 0.3:
            new_tokens[i] = random.choice(synonym_dict[t])
    return new_tokens

def random_swap(tokens):
    new_tokens = tokens.copy()
    if len(new_tokens) > 1 and random.random() < 0.5:
        idx1, idx2 = random.sample(range(len(new_tokens)), 2)
        new_tokens[idx1], new_tokens[idx2] = new_tokens[idx2], new_tokens[idx1]
    return new_tokens

def random_deletion(tokens, p=0.1):
    if len(tokens) == 1:
        return tokens
    return [t for t in tokens if random.random() > p or t in stop_words]

def augment_text(text):
    tokens = text.split()
    aug_methods = [synonym_replacement, random_swap, random_deletion]
    random.shuffle(aug_methods)
    for method in aug_methods:
        tokens = method(tokens)
    return ' '.join(tokens)

def preprocessing(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+|\#", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    text = ' '.join(tokens)
    return stemmer.stem(text)

# Clean original data
original_df = df.copy()
original_df['clean_text'] = original_df['comment'].apply(preprocessing)

# Augment data to reach 10,000 samples
augmented_rows = []
current_count = len(original_df)
if current_count < 10000:
    needed = 10000 - current_count
    base_rows = original_df[['comment', 'likes', 'sentimen', 'clean_text']].values.tolist()
    for _ in range(needed):
        base = random.choice(base_rows)
        # Augment the clean_text
        aug_text = augment_text(base[3])
        augmented_rows.append({
            'comment': base[0],
            'likes': base[1],
            'sentimen': base[2],
            'clean_text': aug_text
        })
    aug_df = pd.DataFrame(augmented_rows)
    final_df = pd.concat([original_df, aug_df], ignore_index=True)
else:
    final_df = original_df

final_df.to_csv('preprocessed_comments.csv', index=False)
print(f"Preprocessed and augmented data saved to preprocessed_comments.csv. Total samples: {len(final_df)}")
df = final_df  # update df for next steps

Preprocessed and augmented data saved to preprocessed_comments.csv. Total samples: 10000


In [11]:
# TF-IDF with n-gram, min_df, max_df
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=5, max_df=0.85)
X_text = tfidf.fit_transform(df['clean_text'])
tfidf_df = pd.DataFrame(X_text.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df.to_csv('tfidf_matrix.csv', index=False)
print("TF-IDF matrix saved to tfidf_matrix.csv")

# Gabungkan dengan fitur numerik (likes)
X_extra = df[['likes']].fillna(0).values
from scipy.sparse import hstack
X = hstack([X_text, X_extra])
combined_df = tfidf_df.copy()
combined_df['likes'] = X_extra
combined_df.to_csv('tfidf_plus_likes.csv', index=False)
print("TF-IDF + likes features saved to tfidf_plus_likes.csv")

y = df['sentimen']



TF-IDF matrix saved to tfidf_matrix.csv
TF-IDF + likes features saved to tfidf_plus_likes.csv
TF-IDF + likes features saved to tfidf_plus_likes.csv


In [12]:
# Model SVM OVO + GridSearchCV for C, kernel, gamma
from sklearn.utils import check_X_y
if hasattr(X, "toarray"):
    X_dense = X.toarray()
else:
    X_dense = X
X_dense = np.nan_to_num(X_dense)
y = pd.to_numeric(y, errors='coerce').fillna(0).astype(int)
X_dense, y = check_X_y(X_dense, y)

svm = SVC(decision_function_shape='ovo')
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}
cv_val = min(4, len(y)) if len(y) >= 2 else 2
grid = GridSearchCV(svm, param_grid, cv=cv_val, scoring='accuracy', n_jobs=-1)
grid.fit(X_dense, y)
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)



Best params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best score: 0.974


In [13]:
# Evaluasi dengan cross_val_predict
from sklearn.model_selection import cross_val_predict
best_model = grid.best_estimator_
y_pred = cross_val_predict(best_model, X, y, cv=4)
results_df = df.copy()
results_df['predicted_sentimen'] = y_pred
results_df.to_csv('sentiment_predictions.csv', index=False)
print("Predictions saved to sentiment_predictions.csv")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))

Predictions saved to sentiment_predictions.csv
              precision    recall  f1-score   support

          -1       0.98      0.97      0.97      1859
           0       0.97      0.98      0.98      4899
           1       0.97      0.97      0.97      3242

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000

Confusion Matrix:
[[1802   44   13]
 [  39 4790   70]
 [   7   87 3148]]
