Identitas Kelompok  
Nico Samuelson / C14210017  
Darrell Cornelius Rivaldo / C14210025  
Nicholas Gunawan / C14210099  
Michael Adi Pratama / C14210016

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import re

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from sklearn.metrics import f1_score

In [2]:
seed = 123

# Reading Dataset

membaca data, data dishuffle, kemudian di split menjadi 2 satu untuk data training dan satu untuk data validasi

In [3]:
dataset = pd.read_csv('/kaggle/input/absa-aspect/aspect.csv')
dataset = dataset.sample(frac=1, random_state=seed) # shuffle

train_frac = 0.8
train_size = int(train_frac * dataset.shape[0])
train = Dataset.from_pandas(dataset.iloc[:train_size])
val = Dataset.from_pandas(dataset.iloc[train_size:])
train

Dataset({
    features: ['review', 'rating', 'pelayanan', 'pengiriman', 'barang', '__index_level_0__'],
    num_rows: 2894
})

# Preprocessing Dataset

1. membuat dictionary label2id dan id2label
1. melakukan preprocessing dataset
    - mengubah semua huruf pada review menjadi lowercase
    - menghapus semua tanda baca pada review
    - menghapus semua emoji pada review
    - tokenize review
    - melakukan multi hot encoding pada label

In [4]:
# Define the labels and mappings
labels = [label for label in dataset.columns if label not in ('review', 'rating', '__index_level_0__')]
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def clean_text(texts):
    cleaned_text = []
    
    for text in texts:
        
        text = text.lower()

        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

        punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
        for p in punctuations:
            text = text.replace(p,'') #Removing punctuations

        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text) #Removing emojis
        cleaned_text.append(text)
    
    return cleaned_text

# Define the preprocess function
def preprocess_data(examples):
    # Take a batch of texts
    text = clean_text(examples["review"])
    # Encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    # Add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # Create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(labels)))
    # Fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()
  
    return encoding

# Process the datasets
processed_train = train.map(preprocess_data, batched=True, remove_columns=train.column_names)
processed_val = val.map(preprocess_data, batched=True, remove_columns=val.column_names)

# Output the shapes of the processed datasets for verification
print(processed_train)
print(processed_val)

Map:   0%|          | 0/2894 [00:00<?, ? examples/s]

Map:   0%|          | 0/724 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 2894
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 724
})


In [5]:
processed_train.set_format('torch')
processed_val.set_format('torch')

# Inisialisasi Model MultilingualBERT

In [6]:
# Load pre-trained model with a classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", 
    problem_type='multi_label_classification',
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [7]:
batch_size = 8
metric_name = "f1"

Model IndoBERT akan ditrain sebanyak 10 epoch dengan learning rate 0.00002 dan weight decay 0.01. Model terakhir yang akan diambil adalah model pada epoch dengan metric micro F1 tertinggi

In [8]:
args = TrainingArguments(
    f"bert-finetuned-aspect-extractor",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

selain nilai micro f1 selama training, micro ROC AUC dan akurasi juga dihitung untuk tiap epochnya terhadap data validasi

In [9]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [10]:
trainer = Trainer(
    model,
    args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mc14210017[0m ([33mtokped_sentiment_analysis[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.317764,0.888981,0.87233,0.711326
2,No log,0.279974,0.906383,0.897125,0.763812
3,0.327500,0.272368,0.907127,0.900465,0.78453
4,0.327500,0.293635,0.903086,0.897178,0.779006
5,0.327500,0.285104,0.915385,0.907589,0.798343
6,0.157000,0.336133,0.901961,0.89262,0.770718
7,0.157000,0.326306,0.903896,0.897376,0.783149
8,0.157000,0.335926,0.911652,0.903341,0.788674
9,0.088400,0.345385,0.911904,0.904735,0.787293
10,0.088400,0.353831,0.912281,0.904441,0.787293




TrainOutput(global_step=1810, training_loss=0.16801249045693414, metrics={'train_runtime': 786.4746, 'train_samples_per_second': 36.797, 'train_steps_per_second': 2.301, 'total_flos': 1903625577262080.0, 'train_loss': 0.16801249045693414, 'epoch': 10.0})

In [11]:
trainer.evaluate()



{'eval_loss': 0.28510376811027527,
 'eval_f1': 0.9153846153846155,
 'eval_roc_auc': 0.9075891659962455,
 'eval_accuracy': 0.7983425414364641,
 'eval_runtime': 5.4633,
 'eval_samples_per_second': 132.521,
 'eval_steps_per_second': 8.42,
 'epoch': 10.0}

Jika melihat nilai micro f1 0.9154 dan akurasi 0.7983 model MultilingualBERT kalah dengan model IndoBERT. Model ini juga kalah tipis (0.0014) dari segi akurasi dengan model Roberta Indo namun dari segi micro f1 lebih unggul 0.0011

In [22]:
text = "proses dan kirim barang lama banget"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)
logits = outputs.logits

# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['pengiriman']


In [23]:
trainer.model.save_pretrained('multilingualbert-aspect-extractor')