In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('mobil_listrik.csv')
df.head()

Unnamed: 0,id_komentar,nama_akun,tanggal,text_cleaning,sentimen
0,Ugzbll5eyrIy3-gdUUJ4AaABAg,Sqn Ldr,2023-08-06 12:54:49+00:00,saran sih bikin harga ionic sama kayak brio ...,positif
1,UgzEDUiV3OTrV943p8p4AaABAg,lushen ace,2023-08-04 12:16:23+00:00,problem subsidi kualitas diturunin harga dinai...,negatif
2,UgwqJqu6JMF4EH2CsVV4AaABAg,Fatih Al-Ayyubi,2023-08-04 10:17:57+00:00,baik kualitas kembang dulu baik kualitas motor...,positif
3,UgyYicCMR1rKwuOj2Y14AaABAg,yp office,2023-08-04 08:29:54+00:00,model jelek kwalitas buruk harga mahal croot,negatif
4,UgxKAcLuAwZOQK6es-x4AaABAg,Lembur Kuring,2023-08-04 07:55:37+00:00,syarat ngaco woy anak muda blom punya ruma...,negatif


In [3]:
# drop nan values
df = df.dropna()

In [4]:
# Mapping sentiment categories to integers
sentiment_mapping = {
    'positif': 2,
    'netral': 1,
    'negatif': 0
}

# Apply the mapping
df['sentimen'] = df['sentimen'].map(sentiment_mapping)


In [5]:
# # Ensure your columns are named correctly
# texts = df['text_cleaning'].tolist()
# labels = df['sentimen'].tolist()

# IndoBERT

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Ensure your columns are named correctly and labels are now integers
texts = df['text_cleaning'].tolist()
labels = df['sentimen'].tolist()

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Tokenize the texts
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Create a Dataset object
dataset = Dataset.from_dict({
    'input_ids': encodings['input_ids'],
    'attention_mask': encodings['attention_mask'],
    'labels': labels  # Now labels are numerical
})

# Split the dataset into train and validation sets
train_size = int(0.8 * len(dataset))
train_dataset = dataset.shuffle(seed=42).select(range(train_size))
val_dataset = dataset.shuffle(seed=42).select(range(train_size, len(dataset)))

# Load the model
model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1", num_labels=3)  # 3 labels for 'positif', 'netral', 'negatif'

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Start training
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/228 [00:00<?, ?it/s]

{'loss': 1.0219, 'grad_norm': 5.48943567276001, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.13}
{'loss': 0.9591, 'grad_norm': 6.98516845703125, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.26}
{'loss': 0.8889, 'grad_norm': 4.382805347442627, 'learning_rate': 3e-06, 'epoch': 0.39}
{'loss': 0.8751, 'grad_norm': 9.804364204406738, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.53}
{'loss': 0.8432, 'grad_norm': 2.9799387454986572, 'learning_rate': 5e-06, 'epoch': 0.66}
{'loss': 0.9644, 'grad_norm': 3.4710395336151123, 'learning_rate': 6e-06, 'epoch': 0.79}
{'loss': 0.8801, 'grad_norm': 6.464222431182861, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.92}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.9041135907173157, 'eval_runtime': 67.2223, 'eval_samples_per_second': 4.507, 'eval_steps_per_second': 0.283, 'epoch': 1.0}
{'loss': 0.8284, 'grad_norm': 4.8144989013671875, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.05}
{'loss': 0.827, 'grad_norm': 8.977713584899902, 'learning_rate': 9e-06, 'epoch': 1.18}
{'loss': 0.8744, 'grad_norm': 5.212505340576172, 'learning_rate': 1e-05, 'epoch': 1.32}
{'loss': 0.8236, 'grad_norm': 4.318921089172363, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.45}
{'loss': 0.6382, 'grad_norm': 7.956592082977295, 'learning_rate': 1.2e-05, 'epoch': 1.58}
{'loss': 0.7039, 'grad_norm': 9.653879165649414, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.71}
{'loss': 0.8129, 'grad_norm': 7.754458427429199, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.84}
{'loss': 0.7942, 'grad_norm': 6.280426502227783, 'learning_rate': 1.5e-05, 'epoch': 1.97}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7754224538803101, 'eval_runtime': 65.0634, 'eval_samples_per_second': 4.657, 'eval_steps_per_second': 0.292, 'epoch': 2.0}
{'loss': 0.6463, 'grad_norm': 4.637740135192871, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.11}
{'loss': 0.6662, 'grad_norm': 14.700541496276855, 'learning_rate': 1.7000000000000003e-05, 'epoch': 2.24}
{'loss': 0.5484, 'grad_norm': 16.64507484436035, 'learning_rate': 1.8e-05, 'epoch': 2.37}
{'loss': 0.6141, 'grad_norm': 9.949370384216309, 'learning_rate': 1.9e-05, 'epoch': 2.5}
{'loss': 0.6023, 'grad_norm': 14.241364479064941, 'learning_rate': 2e-05, 'epoch': 2.63}
{'loss': 0.6051, 'grad_norm': 8.520508766174316, 'learning_rate': 2.1e-05, 'epoch': 2.76}
{'loss': 0.6632, 'grad_norm': 10.883142471313477, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.89}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7529634833335876, 'eval_runtime': 63.2951, 'eval_samples_per_second': 4.787, 'eval_steps_per_second': 0.3, 'epoch': 3.0}
{'train_runtime': 2811.8281, 'train_samples_per_second': 1.292, 'train_steps_per_second': 0.081, 'train_loss': 0.7662730729370787, 'epoch': 3.0}


TrainOutput(global_step=228, training_loss=0.7662730729370787, metrics={'train_runtime': 2811.8281, 'train_samples_per_second': 1.292, 'train_steps_per_second': 0.081, 'total_flos': 238972761651456.0, 'train_loss': 0.7662730729370787, 'epoch': 3.0})

In [7]:
eval_results = trainer.evaluate()
print(eval_results)

  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7529634833335876, 'eval_runtime': 66.4974, 'eval_samples_per_second': 4.557, 'eval_steps_per_second': 0.286, 'epoch': 3.0}


In [8]:
predictions = trainer.predict(val_dataset)
print(predictions)

  0%|          | 0/19 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[ 3.93093616e-01, -7.81248629e-01, -4.52652454e-01],
       [ 1.37337935e+00, -1.35612583e+00, -5.83563209e-01],
       [-1.05191016e+00, -1.77734363e+00,  2.41918063e+00],
       [ 1.05449408e-02, -1.58415353e+00,  8.54390264e-01],
       [ 1.22209835e+00, -1.79091978e+00, -7.82971680e-02],
       [ 1.40386784e+00, -2.05255222e+00,  2.85247356e-01],
       [ 1.75563347e+00, -1.15146518e+00, -1.13833189e+00],
       [ 3.03200275e-01, -5.79952359e-01, -1.72174141e-01],
       [-1.37964994e-01, -1.81685138e+00,  1.81773973e+00],
       [-5.02553463e-01, -1.71022213e+00,  1.96695662e+00],
       [ 2.10803986e+00, -1.58192945e+00, -7.70516992e-01],
       [ 9.25940812e-01, -6.45798087e-01, -8.18677187e-01],
       [ 1.00782192e+00, -1.81139934e+00,  5.64472556e-01],
       [ 1.25003946e+00, -1.60887837e+00,  1.92110285e-01],
       [ 2.11034083e+00, -1.53435576e+00, -9.21720266e-01],
       [ 2.03928208e+00, -1.61743355e+00, -7.88500428e-01],
       [ 2.

In [9]:
model.save_pretrained('./saved_model_initial')
tokenizer.save_pretrained('./saved_model_initial')


('./saved_model_initial\\tokenizer_config.json',
 './saved_model_initial\\special_tokens_map.json',
 './saved_model_initial\\vocab.txt',
 './saved_model_initial\\added_tokens.json',
 './saved_model_initial\\tokenizer.json')