In [7]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter
import re
import string

In [10]:

df = pd.read_csv('reviews_spotify.csv')

print("Jumlah baris dan kolom:")
print(df.shape)

print("\nInformasi kolom:")
print(df.info())

print("\n5 baris pertama dataset:")
print(df.head())




Jumlah baris dan kolom:
(11000, 11)

Informasi kolom:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000 entries, 0 to 10999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   reviewId              11000 non-null  object 
 1   userName              11000 non-null  object 
 2   userImage             11000 non-null  object 
 3   content               11000 non-null  object 
 4   score                 11000 non-null  int64  
 5   thumbsUpCount         11000 non-null  int64  
 6   reviewCreatedVersion  7576 non-null   object 
 7   at                    11000 non-null  object 
 8   replyContent          0 non-null      float64
 9   repliedAt             0 non-null      float64
 10  appVersion            7576 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 945.4+ KB
None

5 baris pertama dataset:
                               reviewId         userName  \
0  0ad428e1-ee55-47

In [12]:
# Menangani data yang hilang pada kolom 'username'
df_cleaned = df.dropna(subset=['userName'])
print(df_cleaned.isnull().sum())

reviewId                    0
userName                    0
userImage                   0
content                     0
score                       0
thumbsUpCount               0
reviewCreatedVersion     3424
at                          0
replyContent            11000
repliedAt               11000
appVersion               3424
dtype: int64


In [14]:
# Fungsi untuk membersihkan teks
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['cleaned_content'] = df['content'].apply(clean_text)

In [16]:
def get_sentiment(text):
    from textblob import TextBlob
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positif'
    elif polarity < 0:
        return 'negatif'
    else:
        return 'netral'

df['label'] = df['cleaned_content'].apply(get_sentiment)

In [18]:
# Encode label
y = LabelEncoder().fit_transform(df['label'])
X = df['cleaned_content']

# Pisahkan data menjadi train dan test
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [19]:
# Preprocessing untuk model berbasis TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_raw)
X_test_tfidf = vectorizer.transform(X_test_raw)

smote = SMOTE(random_state=42)
X_train_tfidf_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

print("Distribusi label setelah SMOTE:", Counter(y_train_smote))

Distribusi label setelah SMOTE: Counter({1: 7702, 2: 7702, 0: 7702})


In [20]:
# 1. Model SVM
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train_tfidf_smote, y_train_smote)
y_pred_svm = svm.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("\nEvaluasi Model SVM:")
print(f"Accuracy: {accuracy_svm}")
print(classification_report(y_test, y_pred_svm, target_names=['negatif', 'netral', 'positif']))


Evaluasi Model SVM:
Accuracy: 0.9804545454545455
              precision    recall  f1-score   support

     negatif       0.78      0.66      0.71        32
      netral       0.98      1.00      0.99      1926
     positif       0.98      0.88      0.93       242

    accuracy                           0.98      2200
   macro avg       0.91      0.84      0.88      2200
weighted avg       0.98      0.98      0.98      2200



In [21]:
# 2. Model Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_tfidf_smote, y_train_smote)
y_pred_rf = rf.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("\nEvaluasi Model Random Forest:")
print(f"Accuracy: {accuracy_rf}")
print(classification_report(y_test, y_pred_rf, target_names=['negatif', 'netral', 'positif']))


Evaluasi Model Random Forest:
Accuracy: 0.9572727272727273
              precision    recall  f1-score   support

     negatif       0.75      0.28      0.41        32
      netral       0.96      1.00      0.98      1926
     positif       0.95      0.73      0.82       242

    accuracy                           0.96      2200
   macro avg       0.89      0.67      0.74      2200
weighted avg       0.96      0.96      0.95      2200



In [22]:

# Preprocessing untuk model LSTM
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train_raw)

X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_test_seq = tokenizer.texts_to_sequences(X_test_raw)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Konversi label ke one-hot encoding
y_train_cat = pd.get_dummies(y_train).values
y_test_cat = pd.get_dummies(y_test).values

In [23]:
# 3. Model LSTM
model_lstm = Sequential([
    Embedding(input_dim=max_words, output_dim=100, input_length=max_len),
    LSTM(128, return_sequences=True, kernel_regularizer='l2'),
    Dropout(0.5),
    LSTM(64, kernel_regularizer='l2'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
]

history = model_lstm.fit(
    X_train_pad, y_train_cat,
    epochs=15,
    batch_size=64,
    validation_data=(X_test_pad, y_test_cat),
    callbacks=callbacks
)

oss, accuracy_lstm = model_lstm.evaluate(X_test_pad, y_test_cat)
print("\nEvaluasi Model LSTM:")
print(f"Accuracy: {accuracy_lstm}")




Epoch 1/15
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.8419 - loss: 2.2784 - val_accuracy: 0.8755 - val_loss: 0.5261 - learning_rate: 0.0010
Epoch 2/15
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8776 - loss: 0.4916 - val_accuracy: 0.8755 - val_loss: 0.4365 - learning_rate: 0.0010
Epoch 3/15
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8730 - loss: 0.4498 - val_accuracy: 0.8755 - val_loss: 0.4254 - learning_rate: 0.0010
Epoch 4/15
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8744 - loss: 0.4419 - val_accuracy: 0.8755 - val_loss: 0.4323 - learning_rate: 0.0010
Epoch 5/15
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.8780 - loss: 0.4283 - val_accuracy: 0.8755 - val_loss: 0.4212 - learning_rate: 0.0010
Epoch 6/15
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [24]:
!pip freeze > requirements.txt
from google.colab import files
files.download('requirements.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>