In [11]:
# Read the review dataset into a DataFrame
import pandas as pd

# The file is not UTF-8 encoded, so the encoding has to be stated explicitly
df = pd.read_csv(
    'sentimen_270_terpisah_new.csv',
    encoding='latin-1',
)

# Preview the first rows
df.head()

Unnamed: 0,review_tokens_stemmed,Sentimen,sentimen,classified_text,tweet_text,sentence_score,max_positive,max_negative,kelas
0,kecewa banget pesan free pouch kirim cotton pa...,Negatif,{'classified_text': 'kecewa [-4] banget pesan ...,kecewa [-4] banget pesan free pouch kirim cott...,kecewa banget pesan free pouch kirim cotton p...,['kecewa [-4] banget pesan free pouch kirim co...,5,-4,positif
1,kali emas tidak safe seperti tutup botol tidak...,Negatif,{'classified_text': 'kali emas [5] tidak safe ...,kali emas [5] tidak safe seperti [1] tutup bot...,kali emas tidak safe seperti tutup botol tida...,['kali emas [5] tidak safe seperti [1] tutup b...,5,-3,positif
2,paket datang barang pecah return seller ikut k...,Negatif,{'classified_text': 'paket datang barang pecah...,paket datang barang pecah [-2] return seller i...,paket datang barang pecah return seller ikut ...,['paket datang barang pecah [-2] return seller...,1,-1,netral
3,enggak sabun nya enggak sesuai deskripsi,Negatif,{'classified_text': 'enggak sabun nya enggak s...,enggak sabun nya enggak sesuai [-3] deskripsi,enggak sabun nya enggak sesuai deskripsi,['enggak sabun nya enggak sesuai [-3] deskripsi'],1,-3,negatif
4,enggak sesuai harap,Negatif,{'classified_text': 'enggak sesuai [-3] harap'...,enggak sesuai [-3] harap,enggak sesuai harap,['enggak sesuai [-3] harap'],1,-3,negatif


In [12]:
# Keep only the two columns used by the rest of the notebook: the stemmed
# review text and the `kelas` label.
# Selecting by name replaces the former positional drop of columns 1..7:
# it does not depend on the CSV column order, and re-running the cell can no
# longer silently drop the label column (a positional slice applied to the
# already-reduced frame would remove `kelas`).
# .copy() gives an independent frame so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
df = df.loc[:, ['review_tokens_stemmed', 'kelas']].copy()

# Check the first rows
df.head()

Unnamed: 0,review_tokens_stemmed,kelas
0,kecewa banget pesan free pouch kirim cotton pa...,positif
1,kali emas tidak safe seperti tutup botol tidak...,positif
2,paket datang barang pecah return seller ikut k...,netral
3,enggak sabun nya enggak sesuai deskripsi,negatif
4,enggak sesuai harap,negatif


In [13]:
# Old column name -> new column name
new_cols = dict(review_tokens_stemmed='Review', kelas='Sentimen')

# Apply the new names along the column axis
df = df.rename(new_cols, axis='columns')

# Check the first rows
df.head()

Unnamed: 0,Review,Sentimen
0,kecewa banget pesan free pouch kirim cotton pa...,positif
1,kali emas tidak safe seperti tutup botol tidak...,positif
2,paket datang barang pecah return seller ikut k...,netral
3,enggak sabun nya enggak sesuai deskripsi,negatif
4,enggak sesuai harap,negatif


In [14]:
# Number of rows per class
print(df['Sentimen'].value_counts())
print('\n')

# Completeness check (non-null counts and dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()
print('\n')

# Descriptive statistics
print(df.describe())

positif    141
netral      63
negatif     43
Name: Sentimen, dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Review    247 non-null    object
 1   Sentimen  247 non-null    object
dtypes: object(2)
memory usage: 4.0+ KB
None


              Review Sentimen
count            247      247
unique           238        3
top     terima kasih  positif
freq               3      141


In [15]:
# Integer code assigned to each sentiment class
new_labels = {
    'negatif' : 0,
    'netral' : 1,
    'positif' : 2
}

# Encode the labels (negatif / netral / positif -> 0 / 1 / 2).
# Series.map() turns every value that is missing from the mapping into NaN,
# so two silent failures are guarded against here:
#   * re-running the cell on labels that are already encoded (skipped), and
#   * class names the mapping does not cover (raises ValueError).
if not df['Sentimen'].isin(list(new_labels.values())).all():
    unknown_labels = set(df['Sentimen'].unique()) - set(new_labels)
    if unknown_labels:
        raise ValueError(
            f'Unexpected labels in column Sentimen: {sorted(unknown_labels, key=str)}'
        )
    df['Sentimen'] = df['Sentimen'].map(new_labels)

# Check the first rows
df.head()

Unnamed: 0,Review,Sentimen
0,kecewa banget pesan free pouch kirim cotton pa...,2
1,kali emas tidak safe seperti tutup botol tidak...,2
2,paket datang barang pecah return seller ikut k...,1
3,enggak sabun nya enggak sesuai deskripsi,0
4,enggak sesuai harap,0


In [16]:
# Separate the features (review text) from the labels, both as NumPy arrays
X, y = (df[column].to_numpy() for column in ('Review', 'Sentimen'))

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

# Repeated stratified k-fold: 5 folds, repeated 3 times, fixed seed for
# reproducible splits
rsk = RepeatedStratifiedKFold(n_splits=5,
                              n_repeats=3,
                              random_state=42)

# TF-IDF vectorizer capped at 500 features
bow = TfidfVectorizer(max_features=500)

# Multinomial Naive Bayes classifier with smoothing parameter alpha=0.1
mnb = MultinomialNB(alpha=0.1)

# Predictions and true labels collected over every fold, for both the
# held-out part (testing) and the part the model was fitted on (training)
all_predictions_testing = []
all_true_labels_testing = []
all_predictions_training = []
all_true_labels_training = []

# Loop over every fold of the cross-validation
for train_index, test_index in rsk.split(X, y):
    X_train, X_test = X[train_index], X[test_index]  # training / test texts
    y_train, y_test = y[train_index], y[test_index]  # training / test labels

    # Fit the vectorizer on the training texts only, then apply the same
    # vocabulary and IDF weights to the test texts (no leakage from the test fold)
    X_train_transform = bow.fit_transform(X_train)
    X_test_transform = bow.transform(X_test)

    # Train the classifier on the training fold
    mnb.fit(X_train_transform, y_train)

    # Predict the held-out fold and store predictions with their true labels
    predictions_testing = mnb.predict(X_test_transform)
    all_predictions_testing.extend(predictions_testing)
    all_true_labels_testing.extend(y_test)

    # Predict the training fold as well, to compare training vs. testing scores
    predictions_training = mnb.predict(X_train_transform)
    all_predictions_training.extend(predictions_training)
    all_true_labels_training.extend(y_train)

# Cross-validation is used for evaluation only. Without this refit, `bow` and
# `mnb` would keep the state of whichever fold happened to run last (a model
# trained on 4/5 of the data); the vectorizer and classifier that are used
# afterwards for new text are therefore fitted on the complete dataset.
X_all_transform = bow.fit_transform(X)
mnb.fit(X_all_transform, y)

# --- Disabled hyper-parameter search (kept for reference) -------------------
# NOTE(review): `pipeline` and `params` are not defined anywhere in the visible
# notebook, and the final classifier below would be fitted on raw text instead
# of TF-IDF features; both must be fixed before this is re-enabled.
#
# Run a grid search to find the best alpha
# grid_search = GridSearchCV(estimator=pipeline, cv=5, param_grid=params, scoring='accuracy')
# grid_search.fit(X_train, y_train)

# # Take the best parameters from the grid search result
# best_alpha = grid_search.best_params_['clf__alpha']
# best_maxf = grid_search.best_params_['tfidf__max_features']

# print(f'Best alpha: {best_alpha}')
# print(f'Best max_features: {best_maxf}')

# Use the best alpha to train the model
# best_naive_bayes_classifier = MultinomialNB(alpha=best_alpha)
# best_naive_bayes_classifier.fit(X_train, y_train)


In [18]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluation metrics over the pooled training-fold predictions
accuracy_training = accuracy_score(all_true_labels_training, all_predictions_training)
precision_training, recall_training, f1_training = [
    metric(all_true_labels_training, all_predictions_training, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
]

# Print the results
print("------- AKURASI TRAINING -------")
for caption, score in [
    ("Overall accuracy:", accuracy_training),
    ("Overall precision:", precision_training),
    ("Overall recall:", recall_training),
    ("Overall F1-score:", f1_training),
]:
    print(caption, score)

# Confusion matrix followed by the per-class report
print(f'Confusion Matrix : \n {confusion_matrix(all_true_labels_training, all_predictions_training)}')
print('-----------------------------------------------------\n')
report_training = classification_report(
    all_true_labels_training,
    all_predictions_training,
    zero_division=0,
)
print(report_training)


------- AKURASI TRAINING -------
Overall accuracy: 0.9446693657219973
Overall precision: 0.9480419075725269
Overall recall: 0.9446693657219973
Overall F1-score: 0.9430555148090884
Confusion Matrix : 
 [[ 498    0   18]
 [  37  612  107]
 [   2    0 1690]]
-----------------------------------------------------

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       516
           1       1.00      0.81      0.89       756
           2       0.93      1.00      0.96      1692

    accuracy                           0.94      2964
   macro avg       0.95      0.92      0.93      2964
weighted avg       0.95      0.94      0.94      2964



In [19]:
# Evaluation metrics over the pooled held-out-fold predictions
accuracy_testing = accuracy_score(all_true_labels_testing, all_predictions_testing)
precision_testing, recall_testing, f1_testing = [
    metric(all_true_labels_testing, all_predictions_testing, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
]

# Print the results
print("------- AKURASI TESTING -------")
for caption, score in [
    ("Overall accuracy:", accuracy_testing),
    ("Overall precision:", precision_testing),
    ("Overall recall:", recall_testing),
    ("Overall F1-score:", f1_testing),
]:
    print(caption, score)

# Confusion matrix followed by the per-class report
print(f'Confusion Matrix : \n {confusion_matrix(all_true_labels_testing, all_predictions_testing)}')
print('-----------------------------------------------------\n')
report_testing = classification_report(
    all_true_labels_testing,
    all_predictions_testing,
    zero_division=0,
)
print(report_testing)

------- AKURASI TESTING -------
Overall accuracy: 0.6491228070175439
Overall precision: 0.6008356500852742
Overall recall: 0.6491228070175439
Overall F1-score: 0.591349286816918
Confusion Matrix : 
 [[ 60  15  54]
 [ 27  24 138]
 [  6  20 397]]
-----------------------------------------------------

              precision    recall  f1-score   support

           0       0.65      0.47      0.54       129
           1       0.41      0.13      0.19       189
           2       0.67      0.94      0.78       423

    accuracy                           0.65       741
   macro avg       0.58      0.51      0.51       741
weighted avg       0.60      0.65      0.59       741



In [20]:
# Class id -> class name (inverse of the encoding 0/1/2 used for the labels)
label_names = {0: "negatif", 1: "netral", 2: "positif"}

# Ask for a new text and vectorise it with the already fitted TF-IDF vectorizer
new_text = input("\nMasukkan teks baru: ")
new_text_vec = bow.transform([new_text])

# A vector without any non-zero entry means the text is empty or none of its
# words are in the fitted vocabulary; Multinomial Naive Bayes then decides from
# the class priors alone, so the user is told the result is not reliable.
if new_text_vec.nnz == 0:
    print("Peringatan: teks tidak mengandung kata yang dikenal model, hasil prediksi tidak dapat diandalkan.")

predicted_sentimen = mnb.predict(new_text_vec)

# Dictionary lookup instead of an if/elif chain without fallback: an
# unexpected class id raises KeyError instead of leaving `sentiment_label`
# undefined or stale from a previous run of this cell.
sentiment_label = label_names[int(predicted_sentimen[0])]

print("Hasil Analisis Sentimen untuk Teks Baru : ", sentiment_label)

Hasil Analisis Sentimen untuk Teks Baru :  positif
