## Install semua library eksternal dan konfigurasi

In [5]:
# --- Install semua library eksternal ---
!pip install gensim --quiet
!pip install scikit-learn --quiet
!pip install nltk --quiet
!pip install matplotlib seaborn --quiet
!pip install tensorflow --quiet


In [None]:
!pip install numpy --upgrade --force-reinstall --quiet
import os
os.kill(os.getpid(), 9)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.4 which is incompatible.[0m[31m
[0m

In [None]:
# Fix masalah kompatibilitas numpy
!pip install numpy==1.23.5 --quiet
!pip install gensim --force-reinstall --no-cache-dir --quiet
import os
os.kill(os.getpid(), 9)


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blosc2 3.2.1 requires numpy>=1.26, but you have numpy 1.23.5 which is incompatible.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
chex 0.1.89 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
pymc 5.21.2 requires numpy>=1.25.0, but you have numpy 1.23.5 which is incompatible.
bigframes 1.42.0 requires numpy>=1.24.0, but you have numpy 1.23.5 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.23.5 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.23.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>

## Import library

In [3]:
import pandas as pd
import re
import string
import nltk
import numpy as np
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

## Langkah 1: Load CSV & Labeling Otomatis

In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load data hasil scraping
df = pd.read_csv('tiktok_reviews.csv')

# Bersihkan data kosong
df.dropna(subset=['content', 'score'], inplace=True)

# Preprocessing dasar: menghapus tanda baca, angka, dan stopwords
def clean_text(text):
    text = text.lower()                                # huruf kecil
    text = re.sub(r'\d+', '', text)                    # hapus angka
    text = text.translate(str.maketrans('', '', string.punctuation))  # hapus tanda baca
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['clean_content'] = df['content'].apply(clean_text)

# Labeling berdasarkan analisis polarity (lexicon-based)
def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return 'positive'
    elif polarity < -0.1:
        return 'negative'
    else:
        return 'neutral'

df['label'] = df['clean_content'].apply(get_sentiment)

# Cek distribusi kelas
print(df['label'].value_counts())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


label
positive    56000
neutral     38500
negative     5500
Name: count, dtype: int64


## Langkah 2: Resampling data negative

In [5]:
from sklearn.utils import resample

# Pisahkan berdasarkan label
df_pos = df[df['label'] == 'positive']
df_neu = df[df['label'] == 'neutral']
df_neg = df[df['label'] == 'negative']

# Cari jumlah maksimum dari semua kelas
max_count = max(len(df_pos), len(df_neu), len(df_neg))

# Oversample masing-masing kelas ke jumlah maksimum
df_pos_upsampled = resample(df_pos, replace=True, n_samples=max_count, random_state=42)
df_neu_upsampled = resample(df_neu, replace=True, n_samples=max_count, random_state=42)
df_neg_upsampled = resample(df_neg, replace=True, n_samples=max_count, random_state=42)

# Gabungkan semua kelas
df_balanced = pd.concat([df_pos_upsampled, df_neu_upsampled, df_neg_upsampled])
df_balanced = df_balanced.sample(frac=1, random_state=42)  # acak ulang

# Cek ulang distribusi
print(df_balanced['label'].value_counts())


label
neutral     56000
positive    56000
negative    56000
Name: count, dtype: int64


## Langkah 3: Encode Label & Split Data

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode label hasil dari lexicon-based analysis pada df_balanced
le = LabelEncoder()
df_balanced['label_encoded'] = le.fit_transform(df_balanced['label'])  # Contoh: positive=2, neutral=1, negative=0

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['clean_content'], df_balanced['label_encoded'],
    test_size=0.2,
    stratify=df_balanced['label_encoded'],  # agar distribusi kelas tetap proporsional
    random_state=42
)

print("Training data:", len(X_train))
print("Testing data :", len(X_test))


Training data: 134400
Testing data : 33600


## Langkah 4 : Tokenizing & Padding

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 100

# Tokenisasi teks hasil preprocessing (clean_content)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)

# Ubah teks menjadi urutan angka (sequences)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding agar panjang input seragam
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode label target: 3 kelas (negatif, netral, positif)
y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)



## Langkah 5 : Word2Vec Embedding Matrix

In [8]:
from gensim.models import Word2Vec
import numpy as np

# Pastikan setiap teks sudah dalam bentuk token list
sentences = [text.split() for text in X_train]  # X_train berisi clean_content

# Latih model Word2Vec berdasarkan data training
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1  # skip-gram, bisa juga cbow (sg=0) tergantung eksperimen
)

# Ambil indeks kata dari tokenizer
word_index = tokenizer.word_index

# Siapkan embedding matrix (untuk layer embedding di model deep learning)
embedding_matrix = np.zeros((MAX_NUM_WORDS, 100))
for word, i in word_index.items():
    if i < MAX_NUM_WORDS:
        try:
            embedding_vector = w2v_model.wv[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            pass  # jika kata tidak ada di model w2v, biarkan tetap 0


## Langkah 6 : Pelatihan

### Skema Pelatihan 1: LSTM + Word2Vec + Split 80/20

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model1 = Sequential()
model1.add(Embedding(MAX_NUM_WORDS, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model1.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model1.add(Dense(64, activation='relu'))
model1.add(Dropout(0.3))
model1.add(Dense(3, activation='softmax'))

model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.summary()

history1 = model1.fit(X_train_pad, y_train_cat, epochs=5, batch_size=512, validation_data=(X_test_pad, y_test_cat))

Epoch 1/5
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 1s/step - accuracy: 0.8640 - loss: 0.3374 - val_accuracy: 0.9854 - val_loss: 0.0406
Epoch 2/5
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 1s/step - accuracy: 0.9845 - loss: 0.0443 - val_accuracy: 0.9854 - val_loss: 0.0385
Epoch 3/5
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 1s/step - accuracy: 0.9849 - loss: 0.0417 - val_accuracy: 0.9854 - val_loss: 0.0382
Epoch 4/5
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 1s/step - accuracy: 0.9850 - loss: 0.0407 - val_accuracy: 0.9854 - val_loss: 0.0378
Epoch 5/5
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 1s/step - accuracy: 0.9847 - loss: 0.0402 - val_accuracy: 0.9854 - val_loss: 0.0376


#### Evaluasi Skema Pelatihan 1

In [11]:
model1.summary()

y_pred1 = model1.predict(X_test_pad)
y_pred1_classes = np.argmax(y_pred1, axis=1)

print("Accuracy Skema 1:", accuracy_score(y_test, y_pred1_classes))
print(classification_report(y_test, y_pred1_classes, target_names=le.classes_))

[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 44ms/step
Accuracy Skema 1: 0.9854464285714286
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     11200
     neutral       0.96      1.00      0.98     11200
    positive       1.00      0.96      0.98     11200

    accuracy                           0.99     33600
   macro avg       0.99      0.99      0.99     33600
weighted avg       0.99      0.99      0.99     33600



### Skema Pelatihan 2: LSTM + TF-IDF + Split 80/20

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# LSTM but we reshape to 3D
X_train_tfidf_3d = np.expand_dims(X_train_tfidf, axis=2)
X_test_tfidf_3d = np.expand_dims(X_test_tfidf, axis=2)

model2 = Sequential()
model2.add(LSTM(128, input_shape=(X_train_tfidf_3d.shape[1], 1)))
model2.add(Dense(64, activation='relu'))
model2.add(Dropout(0.3))
model2.add(Dense(3, activation='softmax'))

model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history2 = model2.fit(X_train_tfidf_3d, y_train_cat, epochs=5, batch_size=512, validation_data=(X_test_tfidf_3d, y_test_cat))


  super().__init__(**kwargs)


Epoch 1/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1015s[0m 6s/step - accuracy: 0.6615 - loss: 0.8690 - val_accuracy: 0.6750 - val_loss: 0.7920
Epoch 2/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1020s[0m 6s/step - accuracy: 0.6754 - loss: 0.7966 - val_accuracy: 0.6750 - val_loss: 0.7895
Epoch 3/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1047s[0m 6s/step - accuracy: 0.6742 - loss: 0.7973 - val_accuracy: 0.6750 - val_loss: 0.7877
Epoch 4/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m994s[0m 6s/step - accuracy: 0.6756 - loss: 0.7900 - val_accuracy: 0.6797 - val_loss: 0.7753
Epoch 5/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1095s[0m 7s/step - accuracy: 0.6814 - loss: 0.7830 - val_accuracy: 0.6945 - val_loss: 0.7645


#### Evaluasi Skema Pelatihan 2

In [None]:
y_pred2 = model2.predict(X_test_tfidf_3d)
y_pred2_classes = np.argmax(y_pred2, axis=1)

print("Accuracy Skema 2:", accuracy_score(y_test, y_pred2_classes))
print(classification_report(y_test, y_pred2_classes, target_names=le.classes_))

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 335ms/step
Accuracy Skema 2: 0.69455
              precision    recall  f1-score   support

    negative       0.75      0.11      0.20      5200
     neutral       0.00      0.00      0.00      1300
    positive       0.69      0.99      0.81     13500

    accuracy                           0.69     20000
   macro avg       0.48      0.37      0.34     20000
weighted avg       0.66      0.69      0.60     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Skema Pelatihan 3: CNN + Word2Vec + Split 70/30

In [12]:
# Konstanta
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 100

# Split ulang menggunakan data hasil oversampling (df_balanced)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df_balanced['clean_content'], df_balanced['label_encoded'],
    test_size=0.3, stratify=df_balanced['label_encoded'], random_state=42
)

# Tokenisasi ulang menggunakan tokenizer sebelumnya
X_train2_seq = tokenizer.texts_to_sequences(X_train2)
X_test2_seq = tokenizer.texts_to_sequences(X_test2)

# Padding
X_train2_pad = pad_sequences(X_train2_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test2_pad = pad_sequences(X_test2_seq, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encoding untuk label
y_train2_cat = to_categorical(y_train2, num_classes=3)
y_test2_cat = to_categorical(y_test2, num_classes=3)

# CNN Model
model3 = Sequential()
model3.add(Embedding(MAX_NUM_WORDS, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model3.add(Conv1D(128, 5, activation='relu'))
model3.add(GlobalMaxPooling1D())
model3.add(Dense(64, activation='relu'))
model3.add(Dropout(0.3))
model3.add(Dense(3, activation='softmax'))

model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
history3 = model3.fit(
    X_train2_pad, y_train2_cat,
    epochs=5,
    batch_size=512,
    validation_data=(X_test2_pad, y_test2_cat)
)




Epoch 1/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 421ms/step - accuracy: 0.9049 - loss: 0.2944 - val_accuracy: 0.9855 - val_loss: 0.0393
Epoch 2/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 410ms/step - accuracy: 0.9859 - loss: 0.0395 - val_accuracy: 0.9855 - val_loss: 0.0378
Epoch 3/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 384ms/step - accuracy: 0.9849 - loss: 0.0397 - val_accuracy: 0.9855 - val_loss: 0.0372
Epoch 4/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 417ms/step - accuracy: 0.9849 - loss: 0.0387 - val_accuracy: 0.9855 - val_loss: 0.0359
Epoch 5/5
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 420ms/step - accuracy: 0.9859 - loss: 0.0350 - val_accuracy: 0.9855 - val_loss: 0.0325


#### Evaluasi Skema Pelatihan 3

In [13]:
# Tampilkan arsitektur model CNN
model3.summary()

# Prediksi terhadap data uji
y_pred3 = model3.predict(X_test2_pad)
y_pred3_classes = np.argmax(y_pred3, axis=1)

# True label dari one-hot encoding ke integer
y_test3 = np.argmax(y_test2_cat, axis=1)

# Akurasi dan Laporan Klasifikasi
print("Accuracy Skema 3:", accuracy_score(y_test3, y_pred3_classes))
print(classification_report(y_test3, y_pred3_classes, target_names=le.classes_))

[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 14ms/step
Accuracy Skema 3: 0.9854761904761905
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     16800
     neutral       0.96      1.00      0.98     16800
    positive       1.00      0.96      0.98     16800

    accuracy                           0.99     50400
   macro avg       0.99      0.99      0.99     50400
weighted avg       0.99      0.99      0.99     50400



## Langkah 7 Inference Manual (Testing)


### Testing Model 1

In [14]:
# Contoh teks baru
new_texts = ["This app is amazing!", "I hate how slow it is", "It’s okay, nothing special"]

# 1. Cleaning (gunakan fungsi clean_text sebelumnya)
new_texts_cleaned = [clean_text(text) for text in new_texts]

# 2. Tokenisasi dan padding
new_sequences = tokenizer.texts_to_sequences(new_texts_cleaned)
new_padded = pad_sequences(new_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# 3. Prediksi
pred_probs = model1.predict(new_padded)
pred_classes = pred_probs.argmax(axis=1)

# 4. Konversi ke label (gunakan label encoder yang sama)
pred_labels = le.inverse_transform(pred_classes)

# Tampilkan hasil
for text, label in zip(new_texts, pred_labels):
    print(f"Teks: '{text}' → Sentimen: {label}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 509ms/step
Teks: 'This app is amazing!' → Sentimen: positive
Teks: 'I hate how slow it is' → Sentimen: negative
Teks: 'It’s okay, nothing special' → Sentimen: neutral


### Testing Model 3

In [23]:
# Contoh teks baru
new_texts = ["I absolutely love this!", "I hate this app cause very slow.", "It's just average, nothing impressive."]


# 1. Cleaning (gunakan fungsi clean_text sebelumnya)
new_texts_cleaned = [clean_text(text) for text in new_texts]

# 2. Tokenisasi dan padding
new_sequences = tokenizer.texts_to_sequences(new_texts_cleaned)
new_padded = pad_sequences(new_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# 3. Prediksi
pred_probs = model3.predict(new_padded)
pred_classes = pred_probs.argmax(axis=1)

# 4. Konversi ke label (gunakan label encoder yang sama)
pred_labels = le.inverse_transform(pred_classes)

# Tampilkan hasil
for text, label in zip(new_texts, pred_labels):
    print(f"Teks: '{text}' → Sentimen: {label}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
Teks: 'I absolutely love this!' → Sentimen: positive
Teks: 'I hate this app cause very slow.' → Sentimen: negative
Teks: 'It's just average, nothing impressive.' → Sentimen: neutral
