## Sentimen Analysis of Gojek App Reviews Using Machine Learning Algorithms

In [1]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('gojek_reviews.csv')

In [3]:
print(df.head())

                                              Review  Rating  Thumbs Up Count  \
0                                    Sangat membantu       5                0   
1               Aplikasi yang sangat membantu sekali       5                0   
2  Mari bersama2 pindah menggunakan grab sebab in...       1                0   
3  beli paket pakai gopay berhasil tp paket inter...       1                0   
4                                           Mantappp       5                0   

           Review Date  
0  2024-10-21 12:28:25  
1  2024-10-21 12:27:39  
2  2024-10-21 12:25:32  
3  2024-10-21 12:00:31  
4  2024-10-21 12:00:13  


In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('indonesian'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Preprocessing teks
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [6]:
# Menerapkan preprocessing pada kolom ulasan
df['cleaned_review'] = df['Review'].apply(preprocess_text)

In [7]:
# Menampilkan beberapa hasil ulasan setelah preprocessing
print(df[['Review', 'cleaned_review']].head())

                                              Review  \
0                                    Sangat membantu   
1               Aplikasi yang sangat membantu sekali   
2  Mari bersama2 pindah menggunakan grab sebab in...   
3  beli paket pakai gopay berhasil tp paket inter...   
4                                           Mantappp   

                                      cleaned_review  
0                                           membantu  
1                                  aplikasi membantu  
2                  mari pindah grab aplikasi berguna  
3  beli paket pakai gopay berhasil tp paket inter...  
4                                           mantappp  


In [8]:
def label_sentiment(rating):
    if rating >= 4:
        return 'positif'
    elif rating == 3:
        return 'netral'
    else:
        return 'negatif'

In [9]:
# menerapkan pelabelan sentimen
df['sentiment'] = df['Rating'].apply(label_sentiment)

In [10]:
# Menampilkan beberapa data dengan sentimen
print(df[['Review', 'Rating', 'sentiment']].head())

                                              Review  Rating sentiment
0                                    Sangat membantu       5   positif
1               Aplikasi yang sangat membantu sekali       5   positif
2  Mari bersama2 pindah menggunakan grab sebab in...       1   negatif
3  beli paket pakai gopay berhasil tp paket inter...       1   negatif
4                                           Mantappp       5   positif


In [11]:
# Inisialisasi TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [12]:
# Ekstraksi fitur dari teks ulasan
X = tfidf.fit_transform(df['cleaned_review']).toarray()

In [13]:
# Target label (sentimen)
y = df['sentiment']

In [14]:
# menampilkan bentuk fitur dan label
print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

Shape of X: (10000, 5000), Shape of y: (10000,)


In [15]:
# Membagi data menjadi training set dan testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Jumlah data latih: {len(X_train)}, Jumlah data uji: {len(X_test)}")

Jumlah data latih: 8000, Jumlah data uji: 2000


In [16]:
# Support Vector Machine
model = SVC()

In [17]:
# Melatih model pada data latih
model.fit(X_train, y_train)

In [18]:
# Prediksi pada data training
y_train_pred = model.predict(X_train)

# Prediksi pada data testing
y_pred = model.predict(X_test)

In [19]:
# Menghitung akurasi untuk data training
train_accuracy = accuracy_score(y_train, y_train_pred)

# Menghitung akurasi untuk data testing
test_accuracy = accuracy_score(y_test, y_pred)

In [20]:
# Menampilkan hasil akurasi
print("SVM Training Accuracy:", train_accuracy)
print("SVM Testing Accuracy:", test_accuracy)

SVM Training Accuracy: 0.947875
SVM Testing Accuracy: 0.886


In [21]:
# Evaluasi model
print(f"Akurasi: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Akurasi: 0.886


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

     negatif       0.81      0.83      0.82       545
      netral       0.00      0.00      0.00        81
     positif       0.91      0.96      0.94      1374

    accuracy                           0.89      2000
   macro avg       0.58      0.60      0.59      2000
weighted avg       0.85      0.89      0.87      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [24]:
# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [25]:
# Prediksi pada data training
y_train_pred_rf = rf_model.predict(X_train)

# Prediksi pada data testing
y_pred_rf = rf_model.predict(X_test)

In [26]:
# Menghitung akurasi untuk data training
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)

# Menghitung akurasi untuk data testing
test_accuracy_rf = accuracy_score(y_test, y_pred_rf)

In [27]:
# Menampilkan hasil akurasi
print("Random Forest Training Accuracy:", train_accuracy_rf)
print("Random Forest Testing Accuracy:", test_accuracy_rf)

Random Forest Training Accuracy: 0.989125
Random Forest Testing Accuracy: 0.8715


In [28]:
# Menampilkan classification report untuk data testing
print(f"Akurasi: {accuracy_score(y_test, y_pred_rf)}")
print(classification_report(y_test, y_pred_rf))

Akurasi: 0.8715
              precision    recall  f1-score   support

     negatif       0.77      0.82      0.80       545
      netral       0.00      0.00      0.00        81
     positif       0.91      0.94      0.93      1374

    accuracy                           0.87      2000
   macro avg       0.56      0.59      0.57      2000
weighted avg       0.84      0.87      0.85      2000



In [29]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [30]:
# Prediksi pada data training
y_train_pred_lr = lr_model.predict(X_train)

# Prediksi pada data testing
y_pred_lr = lr_model.predict(X_test)

In [31]:
# Menghitung akurasi untuk data training
train_accuracy_lr = accuracy_score(y_train, y_train_pred_lr)

# Menghitung akurasi untuk data testing
test_accuracy_lr = accuracy_score(y_test, y_pred_lr)

In [32]:
# Menampilkan hasil akurasi
print("Logistic Regression Training Accuracy:", train_accuracy_lr)
print("Logistic Regression Testing Accuracy:", test_accuracy_lr)

Logistic Regression Training Accuracy: 0.90975
Logistic Regression Testing Accuracy: 0.8775


In [33]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8775
              precision    recall  f1-score   support

     negatif       0.82      0.78      0.80       545
      netral       0.00      0.00      0.00        81
     positif       0.90      0.97      0.93      1374

    accuracy                           0.88      2000
   macro avg       0.57      0.58      0.58      2000
weighted avg       0.84      0.88      0.86      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
!pip install pipreqs



In [39]:
!pipreqs /content --force

INFO: Not scanning for jupyter notebooks.
INFO: Successfully saved requirements file in /content/requirements.txt


In [42]:
!cat /content/requirements.txt




In [41]:
from google.colab import files
files.download('/content/requirements.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [43]:
!pip freeze > requirements.txt