In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('mobil_listrik.csv')
df.head()
# drop nan values
df = df.dropna()

In [3]:
df['sentimen'].value_counts()

sentimen
negatif    868
positif    504
netral     142
Name: count, dtype: int64

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
import torch
import numpy as np

# # Load your dataframe
# # Ensure your columns are named correctly
# df = pd.DataFrame({
#     'text_cleaning': ["This is a positive text.", "This is a negative text.", "This is a neutral text."],
#     'sentimen': ['positif', 'negatif', 'netral']
# })

# Map the labels to integers
label_mapping = {'positif': 0, 'negatif': 1, 'netral': 2}
df['labels'] = df['sentimen'].map(label_mapping)

# Prepare the tokenizer
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Tokenize the texts
texts = df['text_cleaning'].tolist()
labels = df['labels'].tolist()
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Create a Dataset object
dataset = Dataset.from_dict({
    'input_ids': encodings['input_ids'],
    'attention_mask': encodings['attention_mask'],
    'labels': labels
})

# Split the dataset into train and validation sets
train_dataset = dataset.shuffle(seed=42).select(range(int(0.8 * len(dataset))))
val_dataset = dataset.shuffle(seed=42).select(range(int(0.8 * len(dataset)), len(dataset)))

# Load the model
model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1", num_labels=len(label_mapping))

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Define the F1 metric function with trust_remote_code=True
metric = load_metric("f1", trust_remote_code=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to torch Tensor if they are numpy arrays
    if isinstance(logits, np.ndarray):
        logits = torch.tensor(logits)
    # Get predictions by finding the index with the highest value
    predictions = torch.argmax(logits, dim=-1)
    return metric.compute(predictions=predictions, references=labels, average='weighted')

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()

# Print the results
print("Evaluation results:", eval_results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/228 [00:00<?, ?it/s]

{'loss': 0.982, 'grad_norm': 5.027266025543213, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.13}
{'loss': 0.9437, 'grad_norm': 7.30647611618042, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.26}
{'loss': 0.9311, 'grad_norm': 4.135098457336426, 'learning_rate': 3e-06, 'epoch': 0.39}
{'loss': 0.906, 'grad_norm': 9.081064224243164, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.53}
{'loss': 0.8562, 'grad_norm': 3.2632529735565186, 'learning_rate': 5e-06, 'epoch': 0.66}
{'loss': 0.9443, 'grad_norm': 4.320446491241455, 'learning_rate': 6e-06, 'epoch': 0.79}
{'loss': 0.8797, 'grad_norm': 5.549627780914307, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.92}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.9031966328620911, 'eval_f1': 0.38939801471348745, 'eval_runtime': 66.3564, 'eval_samples_per_second': 4.566, 'eval_steps_per_second': 0.286, 'epoch': 1.0}
{'loss': 0.8402, 'grad_norm': 4.391604423522949, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.05}
{'loss': 0.827, 'grad_norm': 8.601346015930176, 'learning_rate': 9e-06, 'epoch': 1.18}
{'loss': 0.8386, 'grad_norm': 5.207475662231445, 'learning_rate': 1e-05, 'epoch': 1.32}
{'loss': 0.793, 'grad_norm': 3.9837377071380615, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.45}
{'loss': 0.6535, 'grad_norm': 7.207342147827148, 'learning_rate': 1.2e-05, 'epoch': 1.58}
{'loss': 0.6934, 'grad_norm': 9.432427406311035, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.71}
{'loss': 0.7952, 'grad_norm': 7.515537261962891, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.84}
{'loss': 0.7782, 'grad_norm': 5.841495990753174, 'learning_rate': 1.5e-05, 'epoch': 1.97}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7547644972801208, 'eval_f1': 0.6353209861866547, 'eval_runtime': 64.6112, 'eval_samples_per_second': 4.69, 'eval_steps_per_second': 0.294, 'epoch': 2.0}
{'loss': 0.6455, 'grad_norm': 5.282618522644043, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.11}
{'loss': 0.6003, 'grad_norm': 12.560521125793457, 'learning_rate': 1.7000000000000003e-05, 'epoch': 2.24}
{'loss': 0.5531, 'grad_norm': 14.366499900817871, 'learning_rate': 1.8e-05, 'epoch': 2.37}
{'loss': 0.5361, 'grad_norm': 4.900355815887451, 'learning_rate': 1.9e-05, 'epoch': 2.5}
{'loss': 0.5499, 'grad_norm': 19.15966033935547, 'learning_rate': 2e-05, 'epoch': 2.63}
{'loss': 0.5561, 'grad_norm': 8.900321006774902, 'learning_rate': 2.1e-05, 'epoch': 2.76}
{'loss': 0.5761, 'grad_norm': 14.915722846984863, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.89}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7779859304428101, 'eval_f1': 0.6592220676555992, 'eval_runtime': 64.8293, 'eval_samples_per_second': 4.674, 'eval_steps_per_second': 0.293, 'epoch': 3.0}
{'train_runtime': 2771.2, 'train_samples_per_second': 1.311, 'train_steps_per_second': 0.082, 'train_loss': 0.7477182798218309, 'epoch': 3.0}


  0%|          | 0/19 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.7779859304428101, 'eval_f1': 0.6592220676555992, 'eval_runtime': 65.1554, 'eval_samples_per_second': 4.65, 'eval_steps_per_second': 0.292, 'epoch': 3.0}


In [3]:
# Save the model
model_path = './saved_model_f1'
model.save_pretrained(model_path)

# Save the tokenizer
tokenizer_path = './saved_tokenizer_f1'
tokenizer.save_pretrained(tokenizer_path)

print(f"Model saved to {model_path}")
print(f"Tokenizer saved to {tokenizer_path}")


Model saved to ./saved_model_f1
Tokenizer saved to ./saved_tokenizer_f1


In [4]:
import numpy as np
from sklearn.metrics import classification_report

# Get predictions
predictions = trainer.predict(val_dataset)

# Convert logits to labels
logits = predictions.predictions
preds = np.argmax(logits, axis=-1)
labels = predictions.label_ids

# Print classification report
report = classification_report(labels, preds, target_names=['positif', 'negatif', 'netral'])
print("Classification Report:\n", report)

  0%|          | 0/19 [00:00<?, ?it/s]

Classification Report:
               precision    recall  f1-score   support

     positif       0.66      0.60      0.63       112
     negatif       0.70      0.86      0.77       161
      netral       0.60      0.10      0.17        30

    accuracy                           0.69       303
   macro avg       0.65      0.52      0.52       303
weighted avg       0.68      0.69      0.66       303

