## Library

In [4]:
!pip install transformers datasets accelerate



In [16]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [17]:
import pandas as pd
import numpy as np
import evaluate
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    Pipeline,
    DataCollatorWithPadding
)

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Train Test Split

In [60]:
df = pd.read_csv("/content/drive/MyDrive/Dataset Review Produk/clean_reviews.csv")
df.head()

Unnamed: 0,review_text,rating,sentiment,char_len,word_len,clean_text
0,Udah sering belanja trs tapi setiap pengajuan ...,1,negative,96,15,udah sering belanja trs tapi setiap pengajuan ...
1,Semenjak di upgrade.. SHOPEE JADI LEMOT,1,negative,39,6,semenjak di upgrade shopee jadi lemot
2,Penyelesaian masalah sangat buruk,1,negative,33,4,penyelesaian masalah sangat buruk
3,Apk enggaüòá jls,1,negative,14,3,apk engga jls
4,Lelet stress. Udah update terbaru tetap aja lemot,1,negative,49,8,lelet stress udah update terbaru tetap aja lemot


In [61]:
df_model = df[['clean_text', 'sentiment']]
df_model.head()

Unnamed: 0,clean_text,sentiment
0,udah sering belanja trs tapi setiap pengajuan ...,negative
1,semenjak di upgrade shopee jadi lemot,negative
2,penyelesaian masalah sangat buruk,negative
3,apk engga jls,negative
4,lelet stress udah update terbaru tetap aja lemot,negative


In [62]:
train_df, test_df = train_test_split(
    df_model,
    test_size=0.2,
    stratify=df["sentiment"],
    random_state=42
)

train_df = train_df.sample(1000, random_state=42)

print("Train size:", train_df.shape)
print("Test size:", test_df.shape)

Train size: (1000, 2)
Test size: (138136, 2)


In [63]:
print(train_df["sentiment"].value_counts(normalize=True))
print(test_df["sentiment"].value_counts(normalize=True))

sentiment
positive    0.758
negative    0.201
neutral     0.041
Name: proportion, dtype: float64
sentiment
positive    0.743550
negative    0.212081
neutral     0.044369
Name: proportion, dtype: float64


In [64]:
train_df.to_csv("/content/drive/MyDrive/Dataset Review Produk/train_reviews_1k.csv", index=False)
#test_df.to_csv("/content/drive/MyDrive/Dataset Review Produk/test_reviews.csv", index=False)

## Tokenzation

In [65]:
train_df = pd.read_csv("/content/drive/MyDrive/Dataset Review Produk/train_reviews_1k.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Dataset Review Produk/test_reviews.csv")

In [66]:
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [67]:
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}

train_df["label"] = train_df["sentiment"].map(label2id)
test_df["label"] = test_df["sentiment"].map(label2id)

In [68]:
train_dataset = Dataset.from_pandas(train_df[['clean_text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['clean_text', 'label']])

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [69]:
def tokenize_function(example):
  return tokenizer(
      example["clean_text"],
      truncation=True,
      padding="max_length",
      max_length=128
  )

In [70]:
tokenize_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["clean_text"]
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/138136 [00:00<?, ? examples/s]

In [71]:
tokenize_dataset["train"][0]

{'label': 1,
 'input_ids': [1154,
  5320,
  2623,
  1032,
  485,
  4016,
  5775,
  1032,
  1306,
  1627,
  283,
  3168,
  672,
  1627,
  887,
  672,
  4588,
  2446,
  9861,
  16002,
  1032,
  4588,
  30064,
  35924,
  16002,
  1032,
  11441,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [72]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Modeling

In [73]:
import evaluate

accuray_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)

  acc = accuray_metric.compute(predictions=predictions, references=labels)
  f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

  return {
      "accuracy": acc["accuracy"],
      "f1": f1["f1"]
  }

In [74]:
import transformers
print(transformers.__version__)
# seharusnya >= 4.5.0 (lebih baik >= 4.20+)


4.57.1


In [75]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Dataset Review Produk/model_output",

    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,

    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


In [77]:
tokenize_dataset["train"].column_names

['label', 'input_ids', 'attention_mask']

In [76]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_dataset["train"],
    eval_dataset=tokenize_dataset["test"],
    processing_class=tokenizer, # Changed from tokenizer=tokenizer
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Train Model
trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate
results = trainer.evaluate()
results

In [None]:
trainer.save_model("/content/drive/MyDrive/Dataset Review Produk/sentiment_model")
tokenizer.save_pretrained("/content/drive/MyDrive/Dataset Review Produk/sentiment_model")

In [None]:
text = "Pengiriman cepat, barangnya bagus!"
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

pred = torch.argmax(logits).item()
id2label[pred]