<a href="https://www.kaggle.com/code/nicost312/sentiment-analysis-tokopedia?scriptVersionId=185608462" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import re

from datasets import load_from_disk
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from sklearn.metrics import f1_score

In [2]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
review_token = '[REVIEW]'
aspect_token = '[ASPECT]'
special_tokens_dict = {'additional_special_tokens': [review_token, aspect_token]}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)


def clean_text(texts):
    cleaned_text = []
    
    for text in texts:
        
        text = text.lower()

        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

        punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
        for p in punctuations:
            text = text.replace(p,'') #Removing punctuations

        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text) #Removing emojis
        cleaned_text.append(text)
    
    return cleaned_text

# Preprocess function
def preprocess_function(examples):
    combined_texts = [aspect_token + aspect + review_token + review for aspect, review in zip(examples["variable"], examples["review"])]
    encoding =  tokenizer(
        clean_text(examples["review"]), 
        padding="max_length", 
        truncation=True, 
        max_length=128
    )
    

    labels_matrix = np.zeros((len(examples['review']), 3))
    
    for i, label in enumerate(examples["value"]):
        labels_matrix[i, int(label)] = 1

        encoding["labels"] = labels_matrix.tolist()
  
    return encoding

train_df = pd.read_csv('/kaggle/input/bert-absa-dataset/out.csv').iloc[:4612, :]
test_df = pd.read_csv('/kaggle/input/bert-absa-dataset/out.csv').iloc[4612:, :]

# Convert your data to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

tokenized_train_dataset = tokenized_train_dataset.remove_columns(["review", "rating", "variable", "value"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["review", "rating", "variable", "value"])

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/4612 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

In [3]:
# Load pre-trained model with a classification head
model = AutoModelForSequenceClassification.from_pretrained("indolem/indobert-base-uncased", num_labels=3)
model.resize_token_embeddings(len(tokenizer))

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(31925, 768)

In [4]:
print(tokenized_train_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 4612
})


In [5]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def accuracy_metric(predictions, labels, threshold=0.5):
    y_true = labels
    
#     print(predictions[5])
    y_pred_indices = np.argmax(predictions, axis=1)
    y_pred_one_hot = np.zeros_like(predictions)
    y_pred_one_hot[np.arange(predictions.shape[0]), y_pred_indices] = 1
    
#     print(y_true[5])
#     print(y_pred_one_hot[5])
    accuracy = accuracy_score(y_true, y_pred_one_hot)
    # return as dictionary
    metrics = {'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = accuracy_metric(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [6]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    weight_decay=0.01,
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111340029999989, max=1.0)…



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.249628,0.84809
2,No log,0.237722,0.863715
3,No log,0.232481,0.864583




TrainOutput(global_step=435, training_loss=0.23683322599564477, metrics={'train_runtime': 244.3043, 'train_samples_per_second': 56.634, 'train_steps_per_second': 1.781, 'total_flos': 910109311921152.0, 'train_loss': 0.23683322599564477, 'epoch': 3.0})

In [None]:
model.save_pretrained("indobert-absa-model-final")

In [7]:
results = trainer.evaluate(tokenized_test_dataset)
print(results)



{'eval_loss': 0.23248052597045898, 'eval_accuracy': 0.8645833333333334, 'eval_runtime': 5.6028, 'eval_samples_per_second': 205.612, 'eval_steps_per_second': 6.425, 'epoch': 3.0}


In [None]:
import torch

# Example test sample
test_aspect = "barang"
test_review = "lumayan"

# Preprocess the test sample
def preprocess_single_sample(aspect, review):
    combined_texts = aspect_token + aspect + review_token + review
    print(len(combined_texts))
    encoding = tokenizer(
        combined_texts, 
        padding="max_length", 
        truncation=True, 
        max_length=128,
        return_tensors="pt"
    )
    
    return encoding

# Preprocess the sample
encoding = preprocess_single_sample(test_aspect, test_review)

# Move inputs to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = {key: value.to(device) for key, value in encoding.items()}
model.to(device)

# Put model in evaluation mode
model.eval()

# Run inference
with torch.no_grad():
    outputs = model(**inputs)

# Get predicted label
logits = outputs.logits
# print(logits.shape)
predicted_class_id = torch.argmax(logits, dim=1).item()

# Map class id to label (Assuming 0: Negative, 1: Neutral, 2: Positive)
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
predicted_label = label_map[predicted_class_id]

print(f"Aspect: {test_aspect}")
print(f"Review: {test_review}")
print(f"Predicted Sentiment: {predicted_label}")


# MULTILINGUAL

In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
review_token = '[REVIEW]'
aspect_token = '[ASPECT]'
special_tokens_dict = {'additional_special_tokens': [review_token, aspect_token]}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)


def clean_text(texts):
    cleaned_text = []
    
    for text in texts:
        
        text = text.lower()

        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

        punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
        for p in punctuations:
            text = text.replace(p,'') #Removing punctuations

        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text) #Removing emojis
        cleaned_text.append(text)
    
    return cleaned_text

# Preprocess function
def preprocess_function(examples):
    combined_texts = [aspect_token + aspect + review_token + review for aspect, review in zip(examples["variable"], examples["review"])]
    encoding =  tokenizer(
        clean_text(examples["review"]), 
        padding="max_length", 
        truncation=True, 
        max_length=128
    )
    

    labels_matrix = np.zeros((len(examples['review']), 3))
    
    for i, label in enumerate(examples["value"]):
        labels_matrix[i, int(label)] = 1

        encoding["labels"] = labels_matrix.tolist()
  
    return encoding

train_df = pd.read_csv('/kaggle/input/bert-absa-dataset/out.csv').iloc[:4612, :]
test_df = pd.read_csv('/kaggle/input/bert-absa-dataset/out.csv').iloc[4612:, :]

# Convert your data to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

tokenized_train_dataset = tokenized_train_dataset.remove_columns(["review", "rating", "variable", "value"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["review", "rating", "variable", "value"])

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/4612 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

In [9]:
# Load pre-trained model with a classification head
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(119549, 768)

In [6]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    weight_decay=0.01,
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.276519,0.820312
2,No log,0.245479,0.848958
3,No log,0.249207,0.853299




TrainOutput(global_step=435, training_loss=0.23655486709770115, metrics={'train_runtime': 337.3014, 'train_samples_per_second': 41.02, 'train_steps_per_second': 1.29, 'total_flos': 910109311921152.0, 'train_loss': 0.23655486709770115, 'epoch': 3.0})

In [7]:
results = trainer.evaluate(tokenized_test_dataset)
print(results)



{'eval_loss': 0.2454787641763687, 'eval_accuracy': 0.8489583333333334, 'eval_runtime': 6.6989, 'eval_samples_per_second': 171.969, 'eval_steps_per_second': 5.374, 'epoch': 3.0}


In [8]:
model.save_pretrained("multilingual-absa-model-final")

# Indo ROBERTA

In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("cahya/roberta-base-indonesian-522M")
review_token = '[REVIEW]'
aspect_token = '[ASPECT]'
special_tokens_dict = {'additional_special_tokens': [review_token, aspect_token]}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)


def clean_text(texts):
    cleaned_text = []
    
    for text in texts:
        
        text = text.lower()

        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

        punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
        for p in punctuations:
            text = text.replace(p,'') #Removing punctuations

        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text) #Removing emojis
        cleaned_text.append(text)
    
    return cleaned_text

# Preprocess function
def preprocess_function(examples):
    combined_texts = [aspect_token + aspect + review_token + review for aspect, review in zip(examples["variable"], examples["review"])]
    encoding =  tokenizer(
        clean_text(examples["review"]), 
        padding="max_length", 
        truncation=True, 
        max_length=128
    )
    

    labels_matrix = np.zeros((len(examples['review']), 3))
    
    for i, label in enumerate(examples["value"]):
        labels_matrix[i, int(label)] = 1

        encoding["labels"] = labels_matrix.tolist()
  
    return encoding

train_df = pd.read_csv('/kaggle/input/bert-absa-dataset/out.csv').iloc[:4612, :]
test_df = pd.read_csv('/kaggle/input/bert-absa-dataset/out.csv').iloc[4612:, :]

# Convert your data to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

tokenized_train_dataset = tokenized_train_dataset.remove_columns(["review", "rating", "variable", "value"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["review", "rating", "variable", "value"])

tokenizer_config.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/926k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/468k [00:00<?, ?B/s]

Map:   0%|          | 0/4612 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

In [10]:
# Load pre-trained model with a classification head
model = AutoModelForSequenceClassification.from_pretrained("cahya/roberta-base-indonesian-522M", num_labels=3)
model.resize_token_embeddings(len(tokenizer))

pytorch_model.bin:   0%|          | 0.00/507M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cahya/roberta-base-indonesian-522M and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(52002, 768)

In [11]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    weight_decay=0.01,
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.258609,0.84375
2,No log,0.243934,0.845486
3,No log,0.246464,0.845486




TrainOutput(global_step=435, training_loss=0.20075331063106142, metrics={'train_runtime': 229.1889, 'train_samples_per_second': 60.369, 'train_steps_per_second': 1.898, 'total_flos': 910109311921152.0, 'train_loss': 0.20075331063106142, 'epoch': 3.0})

In [12]:
results = trainer.evaluate(tokenized_test_dataset)
print(results)



{'eval_loss': 0.24393394589424133, 'eval_accuracy': 0.8454861111111112, 'eval_runtime': 5.8078, 'eval_samples_per_second': 198.355, 'eval_steps_per_second': 6.199, 'epoch': 3.0}


In [13]:
model.save_pretrained("roberta-absa-model-final")