## Library

In [None]:
!pip install transformers datasets accelerate



In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import pandas as pd
import numpy as np
import evaluate
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    Pipeline,
    DataCollatorWithPadding
)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Train Test Split

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Dataset Review Produk/clean_reviews.csv")
df.head()

Unnamed: 0,review_text,rating,sentiment,char_len,word_len,clean_text
0,Udah sering belanja trs tapi setiap pengajuan ...,1,negative,96,15,udah sering belanja trs tapi setiap pengajuan ...
1,Semenjak di upgrade.. SHOPEE JADI LEMOT,1,negative,39,6,semenjak di upgrade shopee jadi lemot
2,Penyelesaian masalah sangat buruk,1,negative,33,4,penyelesaian masalah sangat buruk
3,Apk enggaüòá jls,1,negative,14,3,apk engga jls
4,Lelet stress. Udah update terbaru tetap aja lemot,1,negative,49,8,lelet stress udah update terbaru tetap aja lemot


In [None]:
df_model = df[['clean_text', 'sentiment']]
df_model.head()

Unnamed: 0,clean_text,sentiment
0,udah sering belanja trs tapi setiap pengajuan ...,negative
1,semenjak di upgrade shopee jadi lemot,negative
2,penyelesaian masalah sangat buruk,negative
3,apk engga jls,negative
4,lelet stress udah update terbaru tetap aja lemot,negative


In [None]:
train_df, test_df = train_test_split(
    df_model,
    test_size=0.2,
    stratify=df["sentiment"],
    random_state=42
)

train_df = train_df.sample(500, random_state=42)
test_df = test_df.sample(500, random_state=42)

print("Train size:", train_df.shape)
print("Test size:", test_df.shape)

Train size: (500, 2)
Test size: (500, 2)


In [None]:
print(train_df["sentiment"].value_counts(normalize=True))
print(test_df["sentiment"].value_counts(normalize=True))

sentiment
positive    0.744
negative    0.218
neutral     0.038
Name: proportion, dtype: float64
sentiment
positive    0.742
negative    0.218
neutral     0.040
Name: proportion, dtype: float64


In [None]:
train_df.to_csv("/content/drive/MyDrive/Dataset Review Produk/train_reviews_500.csv", index=False)
test_df.to_csv("/content/drive/MyDrive/Dataset Review Produk/test_reviews_500.csv", index=False)

## Tokenzation

In [None]:
train_df = pd.read_csv("/content/drive/MyDrive/Dataset Review Produk/train_reviews_500.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Dataset Review Produk/test_reviews_500.csv")

In [None]:
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}

train_df["label"] = train_df["sentiment"].map(label2id)
test_df["label"] = test_df["sentiment"].map(label2id)

In [None]:
train_dataset = Dataset.from_pandas(train_df[['clean_text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['clean_text', 'label']])

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [None]:
def tokenize_function(example):
  return tokenizer(
      example["clean_text"],
      truncation=True,
      padding="max_length",
      max_length=64
  )

In [None]:
tokenize_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["clean_text"]
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
tokenize_dataset["train"][0]

{'label': 1,
 'input_ids': [1154,
  5320,
  2623,
  1032,
  485,
  4016,
  5775,
  1032,
  1306,
  1627,
  283,
  3168,
  672,
  1627,
  887,
  672,
  4588,
  2446,
  9861,
  16002,
  1032,
  4588,
  30064,
  35924,
  16002,
  1032,
  11441,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Modeling

In [None]:
import evaluate

accuray_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)

  acc = accuray_metric.compute(predictions=predictions, references=labels)
  f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

  return {
      "accuracy": acc["accuracy"],
      "f1": f1["f1"]
  }

In [None]:
import transformers
print(transformers.__version__)
# seharusnya >= 4.5.0 (lebih baik >= 4.20+)


4.57.1


In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Dataset Review Produk/model_output",

    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,

    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


In [None]:
tokenize_dataset["train"].column_names

['label', 'input_ids', 'attention_mask']

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_dataset["train"],
    eval_dataset=tokenize_dataset["test"],
    processing_class=tokenizer, # Changed from tokenizer=tokenizer
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Train Model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.411177,0.846,0.82728
2,No log,0.397548,0.858,0.840355
3,No log,0.406117,0.854,0.839445




TrainOutput(global_step=96, training_loss=0.627836267153422, metrics={'train_runtime': 2012.2503, 'train_samples_per_second': 0.745, 'train_steps_per_second': 0.048, 'total_flos': 98667531648000.0, 'train_loss': 0.627836267153422, 'epoch': 3.0})

In [None]:
# Evaluate
results = trainer.evaluate()
results



{'eval_loss': 0.3975478410720825,
 'eval_accuracy': 0.858,
 'eval_f1': 0.8403554071870049,
 'eval_runtime': 140.5306,
 'eval_samples_per_second': 3.558,
 'eval_steps_per_second': 0.228,
 'epoch': 3.0}

In [None]:
trainer.save_model("/content/drive/MyDrive/Dataset Review Produk/sentiment_model")
tokenizer.save_pretrained("/content/drive/MyDrive/Dataset Review Produk/sentiment_model")

('/content/drive/MyDrive/Dataset Review Produk/sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/Dataset Review Produk/sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/Dataset Review Produk/sentiment_model/vocab.json',
 '/content/drive/MyDrive/Dataset Review Produk/sentiment_model/merges.txt',
 '/content/drive/MyDrive/Dataset Review Produk/sentiment_model/added_tokens.json',
 '/content/drive/MyDrive/Dataset Review Produk/sentiment_model/tokenizer.json')

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
text = "Pengiriman lambat, barangnya lumayan!"
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

pred = torch.argmax(logits).item()
id2label[pred]

'positive'

## Test

In [None]:
def predict_sentiment(text, model, tokenizer, id2label):
    # Tokenisasi input
    inputs = tokenizer(text, return_tensors="pt")

    # Non-training mode
    with torch.no_grad():
        logits = model(**inputs).logits

    # Ambil label dengan skor tertinggi
    pred_id = torch.argmax(logits, dim=1).item()
    return id2label[pred_id]


while True:
    user_input = input("Masukkan review (atau ketik 'exit' untuk keluar): ")

    if user_input.lower() == "exit":
        print("Program selesai.")
        break

    sentiment = predict_sentiment(user_input, model, tokenizer, id2label)
    print(f"Prediksi Sentiment: {sentiment}\n")


Masukkan review (atau ketik 'exit' untuk keluar): barang bagus, tapi pengiriman lama banget
Prediksi Sentiment: positive

Masukkan review (atau ketik 'exit' untuk keluar): Barang tidak sesuai, tapi pengirimannya cepat
Prediksi Sentiment: positive

Masukkan review (atau ketik 'exit' untuk keluar): Barang tidak sesuai
Prediksi Sentiment: negative

Masukkan review (atau ketik 'exit' untuk keluar): Ok
Prediksi Sentiment: positive

Masukkan review (atau ketik 'exit' untuk keluar): Aneh
Prediksi Sentiment: negative

Masukkan review (atau ketik 'exit' untuk keluar): hm
Prediksi Sentiment: positive

Masukkan review (atau ketik 'exit' untuk keluar): wxit
Prediksi Sentiment: negative

Masukkan review (atau ketik 'exit' untuk keluar): exit
Program selesai.
