In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import pickle
import torch

In [2]:
def load_imdb_data(path, limit=None):
    """Read review files from the ``pos`` and ``neg`` subfolders of ``path``.

    Args:
        path: Directory containing a ``pos`` and a ``neg`` subdirectory, each
            holding one UTF-8 encoded review per file.
        limit: Maximum number of files to read from *each* subdirectory.
            A falsy value (``None`` or ``0``) reads every file.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``labels[i]`` is
        ``1`` when ``texts[i]`` was read from ``pos`` and ``0`` when it was
        read from ``neg``. All ``pos`` entries precede the ``neg`` entries.

    Raises:
        OSError: If a subdirectory cannot be listed or a file cannot be opened.
    """
    texts = []
    labels = []

    for label in ['pos', 'neg']:
        label_path = os.path.join(path, label)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # subset selected by `limit` identical on every run and machine.
        files = sorted(os.listdir(label_path))
        if limit:
            files = files[:limit]
        label_id = 1 if label == 'pos' else 0
        for file in files:
            with open(os.path.join(label_path, file), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label_id)

    return texts, labels

# Load data from the train folder (at most 1000 reviews per class).
# The dataset location can be overridden with the IMDB_TRAIN_DIR environment
# variable; the original machine-specific path remains the default.
train_path = os.environ.get(
    "IMDB_TRAIN_DIR",
    r"C:\Users\mahru\Downloads\aclImdb_v1\aclImdb\train",
)
texts, labels = load_imdb_data(train_path, limit=1000)
print(f"Total data: {len(texts)}")
print(f"Positif: {labels.count(1)} | Negatif: {labels.count(0)}")


Total data: 2000
Positif: 1000 | Negatif: 1000


In [3]:
# Show the entries found directly inside the train folder
train_dir_entries = os.listdir(train_path)
print(train_dir_entries)


['labeledBow.feat', 'neg', 'pos', 'unsup', 'unsupBow.feat', 'urls_neg.txt', 'urls_pos.txt', 'urls_unsup.txt']


In [4]:
# Tokenizer and encoder are both loaded from the same pretrained checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = BertModel.from_pretrained(BERT_CHECKPOINT)
bert.eval()  # put the model in eval mode

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
def get_bert_embedding(text, max_length=128):
    """Embed ``text`` with the module-level BERT model.

    The text is tokenized (truncated to ``max_length`` tokens), passed through
    ``bert`` without gradient tracking, and the final-layer hidden state at
    sequence position 0 (the ``[CLS]`` position with the tokenizer's default
    special tokens) is returned.

    Args:
        text: The input passed straight to ``tokenizer``.
        max_length: Maximum number of tokens kept after truncation.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        A NumPy array holding the position-0 hidden state with all size-1
        dimensions squeezed out (a 1-D vector for a single input string).
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Run on whatever device the model currently lives on, so the function
    # keeps working if `bert` is moved off the CPU.
    device = bert.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = bert(**inputs)
    first_token_state = outputs.last_hidden_state[:, 0, :]
    # .cpu() is a no-op for CPU tensors and makes .numpy() valid otherwise.
    return first_token_state.squeeze().cpu().numpy()


In [6]:
# Embed every review. Labels are collected alongside the embeddings so that a
# review whose embedding fails is dropped together with its own label;
# slicing `labels[:len(X)]` afterwards would shift every label that follows
# a failure onto the wrong review.
X = []
y = []
for text, label in tqdm(zip(texts, labels), total=len(texts)):
    try:
        emb = get_bert_embedding(text)
    except Exception as e:
        # Best effort: report the failure and skip this review.
        print("Error:", e)
        continue
    X.append(emb)
    y.append(label)

X = np.array(X)
y = np.array(y)

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:47<00:00, 11.91it/s]


In [7]:
# Dimensions of the stacked embedding array
embedding_shape = X.shape
print(embedding_shape)


(2000, 768)


In [8]:
# Linear-kernel SVM fitted on every embedding (no hold-out at this point).
# A later cell rebinds `clf` to a model fitted on the training split only.
svm_options = dict(kernel='linear', probability=True)
clf = SVC(**svm_options)
clf.fit(X, y)

In [9]:
# Hold out 20% of the embeddings for evaluation (fixed seed for the split)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Refit the classifier on the training portion only
clf = SVC(kernel='linear', probability=True)
clf.fit(X_train, y_train)

# Evaluate on the held-out portion
y_pred = clf.predict(X_test)
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Negatif', 'Positif'],
)
print(report_text)

              precision    recall  f1-score   support

     Negatif       0.74      0.78      0.76       201
     Positif       0.76      0.72      0.74       199

    accuracy                           0.75       400
   macro avg       0.75      0.75      0.75       400
weighted avg       0.75      0.75      0.75       400



In [10]:
# Persist the fitted classifier to disk
with open("svm_imdb_model.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(clf))

In [11]:
def predict_sentiment(text):
    """Classify one review with the module-level ``clf``.

    The review is embedded with ``get_bert_embedding``, reshaped into a single
    row, and passed to ``clf.predict``.

    Args:
        text: The review text to classify.

    Returns:
        ``"Positif"`` when the predicted class equals 1, otherwise ``"Negatif"``.
    """
    features = get_bert_embedding(text).reshape(1, -1)
    predicted_class = clf.predict(features)[0]
    if predicted_class == 1:
        return "Positif"
    return "Negatif"

# Smoke-test the classifier on two short hand-written reviews
for sample_review in ("I hated this movie.", "I loved the story."):
    print(predict_sentiment(sample_review))

Negatif
Positif


In [12]:
# Write the current classifier to the same file again (repeats the earlier save)
with open("svm_imdb_model.pkl", "wb") as pickle_out:
    pickle.dump(clf, pickle_out)

In [15]:
# Report the versions of the libraries this notebook relies on.
import os
import numpy as np
import pandas as pd
# Import the tqdm *package* under an alias. A plain `import tqdm` would rebind
# the name `tqdm`, which the first cell bound to the progress-bar callable
# (`from tqdm import tqdm`), and re-running the embedding loop in the same
# kernel would then fail because a module is not callable.
import tqdm as tqdm_pkg
import transformers
import sklearn
import torch
import streamlit

print("os         :", os.__version__ if hasattr(os, '__version__') else "built-in")
print("numpy      :", np.__version__)
print("pandas     :", pd.__version__)
print("tqdm       :", tqdm_pkg.__version__)
print("transformers:", transformers.__version__)
print("scikit-learn:", sklearn.__version__)
print("torch      :", torch.__version__)
print("streamlit      :", streamlit.__version__)


os         : built-in
numpy      : 2.0.2
pandas     : 2.2.3
tqdm       : 4.67.1
transformers: 4.53.0
scikit-learn: 1.5.2
torch      : 2.6.0+cpu
streamlit      : 1.46.1
