In [1]:
import pandas as pd
import re
import string
import numpy as np
import nltk
import joblib

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
from transformers import pipeline
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load dataset
app_id = "com.instagram.android"
df = pd.read_csv(f"{app_id}_reviews.csv")

# Preprocessing function
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

In [3]:
# Function for text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_text'] = df['content'].astype(str).apply(clean_text)

# Save cleaned data
df.to_csv('cleaned_reviews.csv', index=False)
print('Data preprocessing completed.')

Data preprocessing completed.


In [4]:
# Load Pre-trained BERT Sentiment Model
sentiment_pipeline = pipeline("text-classification", model="indobenchmark/indobert-base-p1")

def predict_sentiment(text):
    result = sentiment_pipeline(text)[0]['label']
    return {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}.get(result, 'neutral')

# Apply sentiment prediction
df['sentiment'] = df['cleaned_text'].apply(predict_sentiment)
df.to_csv('labeled_reviews.csv', index=False)
print("Sentiment labeling completed.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Sentiment labeling completed.


In [5]:
# Feature Extraction (TF-IDF + Word2Vec)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])
pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out()).to_csv("tfidf_features.csv", index=False)

# Word2Vec Feature Extraction
tokenized_text = df['cleaned_text'].apply(lambda x: x.split())
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=2, workers=4)
word2vec_model.save("word2vec.model")

print("Feature extraction completed.")


Feature extraction completed.


In [None]:

# Convert text to vector
def vectorize_text(text, model):
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(100)

X_w2v = np.array([vectorize_text(text, word2vec_model) for text in df['cleaned_text']])

# Label Encoding
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['sentiment'].map(label_mapping)

# Split Data
# Split Data Sekali untuk Konsistensi
X_train, X_test, y_train, y_test = train_test_split(df.index, df['label'], test_size=0.2, random_state=42)

# Ambil fitur berdasarkan indeks yang sama
X_train_tfidf = X_tfidf[X_train]
X_test_tfidf = X_tfidf[X_test]

X_train_w2v = X_w2v[X_train]
X_test_w2v = X_w2v[X_test]

# Handling Class Imbalance (SMOTE)
smote = SMOTE(random_state=42)
X_train_tfidf, y_train_tfidf = smote.fit_resample(X_train_tfidf, y_train)
X_train_w2v, y_train_w2v = smote.fit_resample(X_train_w2v, y_train)


In [11]:

# Train and Evaluate SVM Model
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_tfidf, y_train_tfidf)
y_pred_svm = svm_model.predict(X_test_tfidf)
print("SVM Model Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# Train and Evaluate Random Forest Model
rf_model = RandomForestClassifier(n_estimators=500, max_depth=30, random_state=42)
rf_model.fit(X_train_tfidf, y_train_tfidf)
y_pred_rf = rf_model.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# Train and Evaluate XGBoost Model
xgb_model = XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train_tfidf, y_train_tfidf)
y_pred_xgb = xgb_model.predict(X_test_tfidf)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


SVM Model Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.07      0.06      0.07        16
           1       0.34      0.65      0.44       161
           2       0.98      0.93      0.95      3023

    accuracy                           0.91      3200
   macro avg       0.46      0.55      0.49      3200
weighted avg       0.94      0.91      0.92      3200

Random Forest Accuracy: 0.87375
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.26      0.80      0.40       161
           2       0.98      0.88      0.93      3023

    accuracy                           0.87      3200
   macro avg       0.42      0.56      0.44      3200
weighted avg       0.94      0.87      0.90      3200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBoost Accuracy: 0.92375
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.39      0.70      0.50       161
           2       0.98      0.94      0.96      3023

    accuracy                           0.92      3200
   macro avg       0.46      0.55      0.49      3200
weighted avg       0.94      0.92      0.93      3200



In [12]:
# Save Best Model
joblib.dump(xgb_model, 'best_model.pkl')
print("Best model saved.")

Best model saved.
