In [None]:
# Install library yang dibutuhkan (Hanya di Google Colab)
# !pip install transformers datasets evaluate accelerate

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import numpy as np
import evaluate

# 1. Load data (use the already-cleaned files).
# Rows without a caption or without an emotion are dropped: a missing emotion
# cannot be mapped to a class id and would otherwise end up as a NaN label.
train_df = pd.read_csv('dataset_final/train_final.csv').dropna(subset=['caption_cleaned', 'emotion'])
val_df = pd.read_csv('dataset_final/val_final.csv').dropna(subset=['caption_cleaned', 'emotion'])

# Map emotion labels to integer ids.
# The classes are sorted so the mapping is deterministic and does not depend
# on the order in which emotions first appear in the training CSV.
emo_to_id = {val: i for i, val in enumerate(sorted(train_df['emotion'].unique()))}
id_to_emo = {i: val for val, i in emo_to_id.items()}

train_df['label'] = train_df['emotion'].map(emo_to_id)
val_df['label'] = val_df['emotion'].map(emo_to_id)

# An emotion present only in the validation set maps to NaN, which would
# silently turn the label column into floats. Fail early with a clear message.
unseen_mask = val_df['label'].isna()
if unseen_mask.any():
    unseen_emotions = sorted(val_df.loc[unseen_mask, 'emotion'].unique())
    raise ValueError(
        f"Validation set contains emotions not present in the training set: {unseen_emotions}"
    )
val_df['label'] = val_df['label'].astype(int)

# 2. Tokenization using IndoBERT
# The same checkpoint name is reused below when loading the classification model.
model_name = "indobenchmark/indobert-base-p1" # Standard IndoBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``caption_cleaned`` field of ``examples``.

    Every caption is truncated and padded to a fixed length of 128 tokens.
    Uses the module-level ``tokenizer``; returns whatever that tokenizer call
    returns.
    """
    captions = examples["caption_cleaned"]
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(captions, **encode_options)

# Convert to HuggingFace Dataset format.
# preserve_index=False: rows removed by dropna() leave gaps in the pandas
# index, and from_pandas would otherwise carry that index along as an extra
# `__index_level_0__` column in the dataset.
train_ds = Dataset.from_pandas(train_df[['caption_cleaned', 'label']], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[['caption_cleaned', 'label']], preserve_index=False)

# Tokenize in batches; the tokenizer output is added as new columns.
tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_val = val_ds.map(tokenize_function, batched=True)

# 3. Load model
# The emotion <-> id mapping is stored in the model config so that saved
# checkpoints report emotion names instead of generic LABEL_<i> placeholders.
# Names are cast to str so the config stays JSON-serializable.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emo_to_id),
    id2label={int(idx): str(emo) for idx, emo in id_to_emo.items()},
    label2id={str(emo): int(idx) for emo, idx in emo_to_id.items()},
)

# 4. Evaluation metric (F1-score, as advised by the lecturer)
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    ``eval_pred`` is unpacked into model outputs and reference labels. The
    predicted class is the argmax over axis 1 of the outputs (presumably
    per-class logits of shape (n_examples, n_classes) -- confirm against the
    model). Returns the result of ``f1_metric.compute`` with macro averaging.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return f1_metric.compute(
        predictions=predicted_ids,
        references=references,
        average="macro",
    )

# 5. Training arguments (addresses overfitting, per point 1 from the lecturer)
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,          # Small LR so the weights do not change drastically
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,          # Few epochs at first, to check for overfitting
    weight_decay=0.01,
    evaluation_strategy="epoch", # Evaluate at the end of every epoch
    save_strategy="epoch",       # Must match the evaluation strategy for best-model loading
    load_best_model_at_end=True, # Keep the best model (not the last one)
    # Without these two settings the "best" checkpoint is chosen by eval loss.
    # Select on the macro-F1 reported by compute_metrics instead.
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Wire the model, training arguments, tokenized datasets and metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,     # Validation split used for evaluation
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

2026-01-15 00:31:14.576718: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:02<?, ?B/s]

Map:   0%|          | 0/7853 [00:00<?, ? examples/s]

Map:   0%|          | 0/749 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]