<a href="https://www.kaggle.com/code/nicost312/sentiment-analysis-tokopedia?scriptVersionId=185410100" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import re

from datasets import load_from_disk
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from sklearn.metrics import f1_score

In [110]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
review_token = '[REVIEW]'
aspect_token = '[ASPECT]'
special_tokens_dict = {'additional_special_tokens': [review_token, aspect_token]}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)


def clean_text(texts):
    cleaned_text = []
    
    for text in texts:
        
        text = text.lower()

        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

        punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
        for p in punctuations:
            text = text.replace(p,'') #Removing punctuations

        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text) #Removing emojis
        cleaned_text.append(text)
    
    return cleaned_text

# Preprocess function
def preprocess_function(examples):
    combined_texts = [aspect_token + aspect + review_token + review for aspect, review in zip(examples["variable"], examples["review"])]
    encoding =  tokenizer(
        clean_text(examples["review"]), 
        padding="max_length", 
        truncation=True, 
        max_length=128
    )
    

    labels_matrix = np.zeros((len(examples['review']), 3))
    
#     print(labels_)
    for i, label in enumerate(examples["value"]):
#         print(label)
        labels_matrix[i, int(label)] = 1

        encoding["labels"] = labels_matrix.tolist()
  
    return encoding

train_df = pd.read_csv('/kaggle/input/bert-absa-dataset/out.csv').iloc[:4612, :]
test_df = pd.read_csv('/kaggle/input/bert-absa-dataset/out.csv').iloc[4612:, :]

# Convert your data to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

tokenized_train_dataset = tokenized_train_dataset.remove_columns(["review", "rating", "variable", "value"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["review", "rating", "variable", "value"])

Map:   0%|          | 0/4612 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

In [124]:
# Load pre-trained model with a classification head
model = AutoModelForSequenceClassification.from_pretrained("indolem/indobert-base-uncased", num_labels=3)
# model = BertForSequenceClassification.from_pretrained(
#     "bert-base-multilingual-cased", 
#     num_labels = 3,
#     output_attentions = False, 
#     output_hidden_states = False
# )
model.resize_token_embeddings(len(tokenizer))
# model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(31925, 768)

In [112]:
print(tokenized_train_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 4612
})


In [113]:
print(tokenized_train_dataset['labels'][0])

[1.0, 0.0, 0.0]


In [125]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()



Step,Training Loss


TrainOutput(global_step=435, training_loss=0.23567090966235632, metrics={'train_runtime': 199.1327, 'train_samples_per_second': 69.481, 'train_steps_per_second': 2.184, 'total_flos': 910109311921152.0, 'train_loss': 0.23567090966235632, 'epoch': 3.0})

In [62]:
model.save_pretrained("indobert-absa-model-3")

In [126]:
results = trainer.evaluate(tokenized_test_dataset)
print(results)

{'eval_loss': 0.23042859137058258, 'eval_runtime': 7.6422, 'eval_samples_per_second': 150.742, 'eval_steps_per_second': 9.421, 'epoch': 3.0}


In [67]:
test = Dataset.from_pandas(test_df)
test = test.map(preprocess_function, batched=True)
test = test.remove_columns(["review", "rating", "variable", "value"])

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

In [127]:
import torch

# Example test sample
test_aspect = "barang"
test_review = "lumayan"

# Preprocess the test sample
def preprocess_single_sample(aspect, review):
    combined_texts = aspect_token + aspect + review_token + review
    print(len(combined_texts))
    encoding = tokenizer(
        combined_texts, 
        padding="max_length", 
        truncation=True, 
        max_length=128,
        return_tensors="pt"
    )
    
    return encoding

# Preprocess the sample
encoding = preprocess_single_sample(test_aspect, test_review)

# Move inputs to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = {key: value.to(device) for key, value in encoding.items()}
model.to(device)

# Put model in evaluation mode
model.eval()

# Run inference
with torch.no_grad():
    outputs = model(**inputs)

# Get predicted label
logits = outputs.logits
# print(logits.shape)
predicted_class_id = torch.argmax(logits, dim=1).item()

# Map class id to label (Assuming 0: Negative, 1: Neutral, 2: Positive)
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
predicted_label = label_map[predicted_class_id]

print(f"Aspect: {test_aspect}")
print(f"Review: {test_review}")
print(f"Predicted Sentiment: {predicted_label}")


29
Aspect: barang
Review: lumayan
Predicted Sentiment: Neutral
