# **Skema 1–TF-IDF + SVM**

## **Import Library**

In [74]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
text = "This app is really helpful and easy to use!"
print(sid.polarity_scores(text))


{'neg': 0.0, 'neu': 0.517, 'pos': 0.483, 'compound': 0.7614}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Caca\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [75]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

## **Load Data**

In [76]:
# Load dataset lokal
df = pd.read_csv('uber_reviews_labeled.csv')

# Tampilkan beberapa baris awal
print(df.head())

                               reviewId         userName  \
0  5a4bb1dc-c7a2-4ede-a1fd-31784ceba605        Jay Hurst   
1  e5b7ea01-6103-4ad2-a75e-06a59eb2f5af      Madhu Maddy   
2  9163c787-2220-4178-8a87-f532f5dfaf0f     Andrew Radak   
3  d6f25e14-5ba6-44f7-8fcf-67f303cc2050  Syed Arif Iqbal   
4  b153c7c1-32a3-494f-93db-7e259c6377f7       Haidar Ali   

                                           userImage  \
0  https://play-lh.googleusercontent.com/a/ACg8oc...   
1  https://play-lh.googleusercontent.com/a/ACg8oc...   
2  https://play-lh.googleusercontent.com/a/ACg8oc...   
3  https://play-lh.googleusercontent.com/a-/ALV-U...   
4  https://play-lh.googleusercontent.com/a-/ALV-U...   

                                             content  score  thumbsUpCount  \
0                                            love it      5              0   
1                                               good      5              0   
2  this is a monopoly praying on people needing m...      1         

In [77]:
print(df.columns)

Index(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'appVersion', 'clean_text', 'sentiment'],
      dtype='object')


## **Preprocessing**

In [78]:
# Ubah ke lowercase
df['clean_text'] = df['clean_text'].astype(str).str.lower()
df['sentiment'] = df['sentiment'].astype(str).str.lower()

In [79]:
# Pisahkan fitur dan label
X = df['clean_text']
y = df['sentiment']

# Bagi data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Pelatihan Model**

In [80]:
# TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # bisa disesuaikan
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# SVM model
model = LinearSVC()
model.fit(X_train_tfidf, y_train)

# Evaluasi
y_pred = model.predict(X_test_tfidf)
print("Akurasi:", accuracy_score(y_test, y_pred))
print("Laporan klasifikasi:\n", classification_report(y_test, y_pred))

Akurasi: 0.909
Laporan klasifikasi:
               precision    recall  f1-score   support

    negative       0.74      0.81      0.77       313
     neutral       0.82      0.85      0.84       274
    positive       0.97      0.94      0.95      1413

    accuracy                           0.91      2000
   macro avg       0.85      0.87      0.86      2000
weighted avg       0.91      0.91      0.91      2000



 Evaluasi:
 - Positive class mendominasi (jumlahnya 1413 dari 2000), jadi skor weighted avg tinggi. Tapi macro avg (yang memperlakukan semua kelas sama rata) juga 86%, yang berarti model cukup seimbang.
 - Negative class masih agak lemah (precision 0.74, recall 0.81), mungkin bisa diperbaiki dengan augmentasi data atau tuning hyperparameter.

# **Skema 2 – Word2Vec + Random Forest**

## **Import Library**

In [51]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Caca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Caca\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
# Data handling
import pandas as pd
import numpy as np
import struct
from tqdm import tqdm

In [53]:
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## **Pre-trained Word2Vec model**

Note: Saya menggunakan pre-trained word2vec model karena tidak bisa melakukan isntalasi Gensim ataupun Spacy

Saya menggunakan pre-trained word embeddings dari Google News Word2Vec

In [None]:
# Fungsi Pembaca Word2Vec untuk file .bin
def load_word2vec_bin(file_path, limit=None):
    word_vectors = {}
    with open(file_path, 'rb') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        if limit:
            vocab_size = min(limit, vocab_size)
        for _ in tqdm(range(vocab_size), desc="Loading Word2Vec"):
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    break
                if ch != b'\n':
                    word.append(ch)
            word = b''.join(word).decode('utf-8', errors='ignore')
            vector = struct.unpack('f' * vector_size, f.read(4 * vector_size))
            word_vectors[word] = np.array(vector, dtype=np.float32)
    return word_vectors

In [None]:
# Load model Word2Vec
word2vec_path = r'C:\Users\Caca\Downloads\GoogleNews-vectors-negative300.bin'
word_vectors = load_word2vec_bin(word2vec_path, limit=500000)

Loading Word2Vec: 100%|██████████| 500000/500000 [00:11<00:00, 41901.36it/s]


## **Load Data**

In [57]:
df = pd.read_csv('uber_reviews_labeled.csv')
df['clean_text'] = df['clean_text'].astype(str).str.lower()

## **Preprocessing**

In [58]:
# Vektor rata-rata tiap kalimat
def get_average_vector(text, word_vectors, vector_size=300):
    words = text.split()
    vectors = [word_vectors[word] for word in words if word in word_vectors]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)


In [59]:
# ubah teks menjadi vektor
X = np.array([get_average_vector(text, word_vectors) for text in tqdm(df['clean_text'], desc="Vectorizing Text")])
y = df['sentiment']

Vectorizing Text: 100%|██████████| 10000/10000 [00:00<00:00, 39151.21it/s]


## **Pelatihan Model**

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [61]:
y_pred = model.predict(X_test)
print("Akurasi:", accuracy_score(y_test, y_pred))
print("Laporan klasifikasi:\n", classification_report(y_test, y_pred))

Akurasi: 0.8645
Laporan klasifikasi:
               precision    recall  f1-score   support

    negative       0.70      0.72      0.71       313
     neutral       0.88      0.60      0.71       274
    positive       0.90      0.95      0.92      1413

    accuracy                           0.86      2000
   macro avg       0.83      0.76      0.78      2000
weighted avg       0.86      0.86      0.86      2000



 Evaluasi:
- Akurasi: Sudah memenuhi syarat.
- Keseimbangan kelas:
    1. Kelas positive mendominasi dan hasilnya sangat bagus (F1 = 0.92).
    2. Kelas neutral agak rendah recall-nya (0.60), tapi masih cukup baik.
    3. Kelas negative cukup seimbang precision-recall-nya.
- Macro avg F1-score: 0.78 → mengindikasikan model cukup mampu menangani ketidakseimbangan kelas.

# **Skema 3 – TF-IDF + Logistic Regression**

# **Import Library**

In [62]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

## **Load Data**

In [63]:
# Load dataset
df = pd.read_csv("uber_reviews_labeled.csv")

## **Preprocessing**

In [None]:
# Mengubah ke Lowercase
df['clean_text'] = df['clean_text'].astype(str).str.lower()

In [66]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])

# Label
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## **Pelatihan Model**

In [67]:
# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [68]:
# Predict & evaluate
y_pred = lr_model.predict(X_test)
print("Akurasi:", accuracy_score(y_test, y_pred))
print("Laporan klasifikasi:")
print(classification_report(y_test, y_pred))

Akurasi: 0.9015
Laporan klasifikasi:
              precision    recall  f1-score   support

    negative       0.76      0.80      0.78       313
     neutral       0.85      0.77      0.81       274
    positive       0.94      0.95      0.95      1413

    accuracy                           0.90      2000
   macro avg       0.85      0.84      0.85      2000
weighted avg       0.90      0.90      0.90      2000



 Evaluasi:
 - Model Logistic Regression dengan TF-IDF menunjukkan performa sangat baik dengan akurasi 90%
 - Keseimbangan klasifikasi pada tiga kelas juga cukup stabil, walaupun kelas neutral sedikit lebih rendah pada recall.