In [1]:
# Load data ke dalam data frame 
import pandas as pd

# spesifikasi encoding diperlukan karena data tidak menggunakan UTF-8
df = pd.read_csv('allreview_clean.csv', encoding='latin-1') 

df.head()

Unnamed: 0,review_tokens_stemmed,sentimen,classified_text,tweet_text,sentence_score,max_positive,max_negative,kelas
0,kulit kerut,"{'classified_text': 'kulit kerut [-2]', 'tweet...",kulit kerut [-2],kulit kerut,['kulit kerut [-2]'],1,-2,negatif
1,tipe kulit kering,"{'classified_text': 'tipe kulit kering', 'twee...",tipe kulit kering,tipe kulit kering,['tipe kulit kering'],1,-1,netral
2,bagus banget lip tint nya warna bagus kemas ge...,{'classified_text': 'bagus [5] banget lip tint...,bagus [5] banget lip tint nya warna bagus [4] ...,bagus banget lip tint nya warna bagus kemas g...,['bagus [5] banget lip tint nya warna bagus [4...,5,-1,positif
3,expire kirim cepat cocok guna sesuai instruksi,{'classified_text': 'expire kirim cepat [4] co...,expire kirim cepat [4] cocok [4] guna sesuai [...,expire kirim cepat cocok guna sesuai instruksi,['expire kirim cepat [4] cocok [4] guna sesuai...,4,-1,positif
4,no review found,"{'classified_text': 'no review found', 'tweet_...",no review found,no review found,['no review found'],1,-1,netral


In [2]:
# Drop 3 kolom terakhir yang tidak digunakan dengan fungsi iloc
df = df.drop(df.iloc[:,1:7], axis=1)

# Cek data teratas
df.head()

Unnamed: 0,review_tokens_stemmed,kelas
0,kulit kerut,negatif
1,tipe kulit kering,netral
2,bagus banget lip tint nya warna bagus kemas ge...,positif
3,expire kirim cepat cocok guna sesuai instruksi,positif
4,no review found,netral


In [3]:
# Data untuk rename kolom v1 dan v2
new_cols = {
    'review_tokens_stemmed' : 'Review',
    'kelas' : 'Sentiment'
}

# Rename nama kolom v1 dan v2
df = df.rename(columns=new_cols)

# Cek data teratas
df.head()

Unnamed: 0,Review,Sentiment
0,kulit kerut,negatif
1,tipe kulit kering,netral
2,bagus banget lip tint nya warna bagus kemas ge...,positif
3,expire kirim cepat cocok guna sesuai instruksi,positif
4,no review found,netral


In [4]:
# Cek jumlah data per kelas
print(df['Sentiment'].value_counts())
print('\n')

# Cek kelengkapan data
print(df.info())
print('\n')

# Cek statistik deskriptif
print(df.describe())

positif    1129
netral      358
negatif     133
Name: Sentiment, dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1620 entries, 0 to 1619
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     1620 non-null   object
 1   Sentiment  1620 non-null   object
dtypes: object(2)
memory usage: 25.4+ KB
None


                 Review Sentiment
count              1620      1620
unique             1499         3
top     coba moga cocok   positif
freq                 16      1129


In [5]:
# Data untuk label
new_labels = {
    'negatif' : 0,
    'netral' : 1,
    'positif' : 2
}

# Encode label (mengubah label dari spam dan ham menjadi 1 dan 0)
df['Sentiment'] = df['Sentiment'].map(new_labels)

# Cek data teratas
df.head()

Unnamed: 0,Review,Sentiment
0,kulit kerut,0
1,tipe kulit kering,1
2,bagus banget lip tint nya warna bagus kemas ge...,2
3,expire kirim cepat cocok guna sesuai instruksi,2
4,no review found,1


In [6]:
# Memisahkan fitur dengan label
X = df['Review'].values
y = df['Sentiment'].values

In [7]:
# Ekstraksi fitur
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split data training dan data testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold

cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=42)

# Inisiasi pipeline dengan TfidfVectorizer dan MultinomialNB
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Tentukan rentang nilai parameter yang akan diuji
params = {
    'clf__alpha': [0.1, 0.01, 0.001, 0.0001],
    'tfidf__max_features': [100, 500, 1000, 2000]
}

# Lakukan grid search untuk mencari nilai alpha terbaik
grid_search = GridSearchCV(estimator=pipeline, cv=cv_method, param_grid=params, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Ambil nilai alpha terbaik dari hasil grid search
best_alpha = grid_search.best_params_['clf__alpha']
best_maxf = grid_search.best_params_['tfidf__max_features']

print(f'Best alpha: {best_alpha}')
print(f'Best max_features: {best_maxf}')

# Gunakan nilai alpha terbaik untuk training model
# best_naive_bayes_classifier = MultinomialNB(alpha=best_alpha)
# best_naive_bayes_classifier.fit(X_train, y_train)


Best alpha: 0.1
Best max_features: 1000


In [9]:
# Inisiasi TfidfVectorizer
bow = TfidfVectorizer(max_features=best_maxf)

# Fitting dan transform X_train dengan CountVectorizer
X_train = bow.fit_transform(X_train)

# Transform X_test
# X_test hanya transform (bukan fit_transform) karena agar model tidak mengetahui parameter yang 
# digunakan oleh CountVectorizer untuk fitting data X_test
# sehingga data testing tetap menjadi data yang asing bagi model
X_test = bow.transform(X_test)

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Implementasi Naive Bayes dengan Laplace smoothing
# alpha = 1.0  # Parameter smoothing (Laplace smoothing)
mnb = MultinomialNB(alpha=best_alpha)

# Fit Model ke MultinomialNB
mnb = mnb.fit(X_train, y_train)

# Prediksi dengan data testing
predicted = mnb.predict(X_test)

# Print hasil evaluasi
print('Accuracy : ', accuracy_score(y_test, predicted))
print('Precision : ', precision_score(y_test, predicted, average='weighted'))
print('Recall : ', recall_score(y_test, predicted, average='weighted'))
print('F1 Score : ', f1_score(y_test, predicted, average='weighted'))

print(f'Confusion Matrix : \n {confusion_matrix(y_test, predicted)}')
print('-----------------------------------------------------\n')
print(classification_report(y_test, predicted, zero_division=0))

Accuracy :  0.7469135802469136
Precision :  0.6992291175877641
Recall :  0.7469135802469136
F1 Score :  0.695440286702526
Confusion Matrix : 
 [[ 17   9  13]
 [ 18  17  75]
 [  0   8 329]]
-----------------------------------------------------

              precision    recall  f1-score   support

           0       0.49      0.44      0.46        39
           1       0.50      0.15      0.24       110
           2       0.79      0.98      0.87       337

    accuracy                           0.75       486
   macro avg       0.59      0.52      0.52       486
weighted avg       0.70      0.75      0.70       486

