In [1]:
# Load data ke dalam data frame 
import pandas as pd

# spesifikasi encoding diperlukan karena data tidak menggunakan UTF-8
df = pd.read_csv('allreview_clean.csv', encoding='latin-1') 

df.head()

Unnamed: 0,review_tokens_stemmed,sentimen,classified_text,tweet_text,sentence_score,max_positive,max_negative,kelas
0,kulit kerut,"{'classified_text': 'kulit kerut [-2]', 'tweet...",kulit kerut [-2],kulit kerut,['kulit kerut [-2]'],1,-2,negatif
1,tipe kulit kering,"{'classified_text': 'tipe kulit kering', 'twee...",tipe kulit kering,tipe kulit kering,['tipe kulit kering'],1,-1,netral
2,bagus banget lip tint nya warna bagus kemas ge...,{'classified_text': 'bagus [5] banget lip tint...,bagus [5] banget lip tint nya warna bagus [4] ...,bagus banget lip tint nya warna bagus kemas g...,['bagus [5] banget lip tint nya warna bagus [4...,5,-1,positif
3,expire kirim cepat cocok guna sesuai instruksi,{'classified_text': 'expire kirim cepat [4] co...,expire kirim cepat [4] cocok [4] guna sesuai [...,expire kirim cepat cocok guna sesuai instruksi,['expire kirim cepat [4] cocok [4] guna sesuai...,4,-1,positif
4,no review found,"{'classified_text': 'no review found', 'tweet_...",no review found,no review found,['no review found'],1,-1,netral


In [2]:
# Drop 3 kolom terakhir yang tidak digunakan dengan fungsi iloc
df = df.drop(df.iloc[:,1:7], axis=1)

# Cek data teratas
df.head()

Unnamed: 0,review_tokens_stemmed,kelas
0,kulit kerut,negatif
1,tipe kulit kering,netral
2,bagus banget lip tint nya warna bagus kemas ge...,positif
3,expire kirim cepat cocok guna sesuai instruksi,positif
4,no review found,netral


In [3]:
# Data untuk rename kolom v1 dan v2
new_cols = {
    'review_tokens_stemmed' : 'Review',
    'kelas' : 'Sentiment'
}

# Rename nama kolom v1 dan v2
df = df.rename(columns=new_cols)

# Cek data teratas
df.head()

Unnamed: 0,Review,Sentiment
0,kulit kerut,negatif
1,tipe kulit kering,netral
2,bagus banget lip tint nya warna bagus kemas ge...,positif
3,expire kirim cepat cocok guna sesuai instruksi,positif
4,no review found,netral


In [4]:
# Cek jumlah data per kelas
print(df['Sentiment'].value_counts())
print('\n')

# Cek kelengkapan data
print(df.info())
print('\n')

# Cek statistik deskriptif
print(df.describe())

positif    1129
netral      358
negatif     133
Name: Sentiment, dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1620 entries, 0 to 1619
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     1620 non-null   object
 1   Sentiment  1620 non-null   object
dtypes: object(2)
memory usage: 25.4+ KB
None


                 Review Sentiment
count              1620      1620
unique             1499         3
top     coba moga cocok   positif
freq                 16      1129


In [5]:
# Data untuk label
new_labels = {
    'negatif' : 0,
    'netral' : 1,
    'positif' : 2
}

# Encode label (mengubah label dari spam dan ham menjadi 1 dan 0)
df['Sentiment'] = df['Sentiment'].map(new_labels)

# Cek data teratas
df.head()

Unnamed: 0,Review,Sentiment
0,kulit kerut,0
1,tipe kulit kering,1
2,bagus banget lip tint nya warna bagus kemas ge...,2
3,expire kirim cepat cocok guna sesuai instruksi,2
4,no review found,1


In [6]:
# Memisahkan fitur dengan label
X = df['Review'].values
y = df['Sentiment'].values

In [7]:
# Ekstraksi fitur
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split data training dan data testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inisiasi TfidfVectorizer
bow = TfidfVectorizer()

# Fitting dan transform X_train dengan CountVectorizer
X_train = bow.fit_transform(X_train)

# Transform X_test
# X_test hanya transform (bukan fit_transform) karena agar model tidak mengetahui parameter yang 
# digunakan oleh CountVectorizer untuk fitting data X_test
# sehingga data testing tetap menjadi data yang asing bagi model
X_test = bow.transform(X_test)

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# Melakukan SMOTE terlebih dahulu untuk menambah instans kelas minoritas
smote = SMOTE(random_state=5, sampling_strategy='not majority')
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Kemudian menerapkan Tomek Links untuk membersihkan noise
tomek = TomekLinks(sampling_strategy='majority')
X_resampled, y_resampled = tomek.fit_resample(X_resampled, y_resampled)

# Inisialisasi model Naive Bayes
mnb = MultinomialNB()

mnb.fit(X_resampled, y_resampled)

# Prediksi
y_pred = mnb.predict(X_test)

# Print hasil evaluasi
print('Accuracy : ', accuracy_score(y_test, y_pred))
print('Precision : ', precision_score(y_test, y_pred, average='weighted'))
print('Recall : ', recall_score(y_test, y_pred, average='weighted'))
print('F1 Score : ', f1_score(y_test, y_pred, average='weighted'))

print(f'Confusion Matrix : \n {confusion_matrix(y_test, y_pred)}')
print('-----------------------------------------------------\n')
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy :  0.75
Precision :  0.7637921583304618
Recall :  0.75
F1 Score :  0.7438910260233395
Confusion Matrix : 
 [[ 18   5   4]
 [ 22  22  26]
 [ 13  11 203]]
-----------------------------------------------------

              precision    recall  f1-score   support

           0       0.34      0.67      0.45        27
           1       0.58      0.31      0.41        70
           2       0.87      0.89      0.88       227

    accuracy                           0.75       324
   macro avg       0.60      0.63      0.58       324
weighted avg       0.76      0.75      0.74       324

