In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import pickle
import torch
import re

In [2]:
def load_imdb_data(path, limit=None):
    """
    Load IMDb reviews and their sentiment labels from a dataset split folder.

    Reads the files under ``path/pos`` and then ``path/neg``, replaces every
    HTML-style tag (any ``<...>`` span that sits on a single line) with a
    space, and labels reviews from ``pos`` as 1 and reviews from ``neg`` as 0.

    Args:
        path: Directory that contains the ``pos`` and ``neg`` sub-folders.
        limit: Maximum number of files to read from *each* sub-folder.
            ``None`` (the default) reads every file; ``0`` reads none.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: the cleaned review
        strings and their integer labels (1 = positive, 0 = negative).
    """
    texts = []
    labels = []
    print(f"Membaca data dari: {path}")
    for label in ['pos', 'neg']:
        label_path = os.path.join(path, label)
        # os.listdir returns entries in arbitrary order; sort them so the
        # subset kept by `limit` is the same on every run and every machine.
        files = sorted(os.listdir(label_path))
        # Compare against None so that limit=0 means "no files", not "all".
        if limit is not None:
            files = files[:limit]

        label_id = 1 if label == 'pos' else 0
        for file in tqdm(files, desc=f'Memuat ulasan {label}'):
            with open(os.path.join(label_path, file), encoding='utf-8') as f:
                raw_text = f.read()
            # Strip HTML tags (e.g. "<br />") once the file has been closed.
            texts.append(re.sub(r'<.*?>', ' ', raw_text))
            labels.append(label_id)
    return texts, labels

In [None]:
# Root folder of the extracted aclImdb dataset; the split folders are derived from it.
base_path = r"/path/folder/aclImdb"
train_path, test_path = (
    os.path.join(base_path, split_name) for split_name in ("train", "test")
)

In [4]:
# Load the training data

In [None]:
# Read a capped sample of the training split (the cap is applied per class folder)
# and report how many positive and negative reviews were loaded.
train_texts, train_labels = load_imdb_data(path=train_path, limit=1000)
print(f"Total data training: {len(train_texts)}")
print(f"Positif: {train_labels.count(1)} | Negatif: {train_labels.count(0)}\n")

In [6]:
# Load the test data

In [None]:
# Read a capped sample of the test split (the cap is applied per class folder)
# and report how many positive and negative reviews were loaded.
test_texts, test_labels = load_imdb_data(path=test_path, limit=400)
print(f"Total data testing: {len(test_texts)}")
print(f"Positif: {test_labels.count(1)} | Negatif: {test_labels.count(0)}")

In [None]:
# Set up the BERT tokenizer and model
print("\nMemuat pre-trained BERT tokenizer dan model...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# .eval() switches the model to evaluation mode and returns the model itself,
# so it can be chained directly onto the load call.
bert = BertModel.from_pretrained("bert-base-uncased").eval()
print("BERT siap digunakan.")

In [9]:
# Turn a text into an embedding
def get_bert_embedding(text):
    """Encode one text as its BERT [CLS] embedding.

    The text is tokenized (truncated to at most 128 tokens), run through the
    module-level ``bert`` model without gradient tracking, and the final-layer
    hidden state at sequence position 0 is returned.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the embedding; for a single text with
        ``bert-base-uncased`` this is a 768-element vector.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Put the inputs on the same device as the model so the call keeps working
    # if `bert` is moved to a GPU; on a CPU-only setup this is a no-op.
    inputs = inputs.to(bert.device)
    with torch.no_grad():
        outputs = bert(**inputs)
    # Position 0 of the sequence is the [CLS] token; squeeze() drops the
    # size-1 batch dimension.
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    # .numpy() only works on CPU tensors, so bring the result back first.
    return cls_embedding.cpu().numpy()

In [None]:
# Embed the training texts and pair them with their labels
print("\nMembuat embedding untuk data training...")
X_train = np.array([get_bert_embedding(review) for review in tqdm(train_texts)])
y_train = np.array(train_labels)

# Embed the test texts and pair them with their labels
print("\nMembuat embedding untuk data testing...")
X_test = np.array([get_bert_embedding(review) for review in tqdm(test_texts)])
y_test = np.array(test_labels)

# Print the array shapes as a sanity check
print(f"\nBentuk X_train: {X_train.shape}")
print(f"Bentuk y_train: {y_train.shape}")
print(f"Bentuk X_test: {X_test.shape}")
print(f"Bentuk y_test: {y_test.shape}")


In [None]:
# Initialize the SVM model.
# probability=True makes SVC fit an internal cross-validated calibration that
# shuffles the data; fixing random_state makes that step, and therefore the
# saved model's probability estimates, reproducible across runs.
clf = SVC(kernel='linear', probability=True, random_state=42)

# Train the model ONLY on the training data
print("\nMelatih model SVM pada seluruh data training...")
clf.fit(X_train, y_train)
print("Model selesai dilatih.")

# Evaluate the model on the test data
print("\nMengevaluasi performa model pada data testing...")
y_pred = clf.predict(X_test)

# Show the classification report (label 0 = 'Negatif', label 1 = 'Positif')
print("\nLaporan Klasifikasi:")
print(classification_report(y_test, y_pred, target_names=['Negatif', 'Positif']))


In [None]:
# Persist the trained classifier to disk with pickle
model_filename = "svm_imdb_model.pkl"
with open(model_filename, mode="wb") as model_file:
    pickle.dump(clf, model_file)
print(f"\nModel telah disimpan sebagai '{model_filename}'")