In [1]:
import re
import torch
import time
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.multiprocessing import Manager
from torch.utils.data import DataLoader, Dataset
from accelerate import Accelerator, notebook_launcher
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, classification_report
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, BertConfig, BertForSequenceClassification

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic=True
    torch.backends.cudnn.benchmark=False

set_seed(42)

In [4]:
data = pd.read_csv('/kaggle/input/multi-label-hate-speech/re_dataset.csv', encoding='latin-1')

alay_dict = pd.read_csv('/kaggle/input/multi-label-hate-speech/new_kamusalay.csv', encoding='latin-1', header=None)
alay_dict = alay_dict.rename(columns={0: 'original', 
                                      1: 'replacement'})

In [5]:
print("Shape: ", data.shape)
data.head()

Shape:  (13169, 13)


Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0


In [6]:
data.HS.value_counts()

HS
0    7608
1    5561
Name: count, dtype: int64

In [7]:
data.Abusive.value_counts()

Abusive
0    8126
1    5043
Name: count, dtype: int64

In [8]:
print("Toxic shape: ", data[(data['HS'] == 1) | (data['Abusive'] == 1)].shape)
print("Non-toxic shape: ", data[(data['HS'] == 0) & (data['Abusive'] == 0)].shape)

Toxic shape:  (7309, 13)
Non-toxic shape:  (5860, 13)


In [9]:
print("Shape: ", alay_dict.shape)
alay_dict.head(15)

Shape:  (15167, 2)


Unnamed: 0,original,replacement
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali
5,aamiin,amin
6,aamiinn,amin
7,aamin,amin
8,aammiin,amin
9,abis,habis


In [10]:
def lowercase(text):
    return text.lower()

def remove_unnecessary_char(text):
    text = re.sub('\n',' ',text) # Remove every '\n'
    text = re.sub('rt',' ',text) # Remove every retweet symbol
    text = re.sub('user',' ',text) # Remove every username
    text = re.sub('url', ' ', text) # Remove every URL
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text) # Remove every URL
    text = re.sub(r'\b(?:x[a-fA-F0-9]{2}\s*)+\b', '', text) # Remove emoji bytecode
    text = re.sub('  +', ' ', text) # Remove extra spaces
    return text
    
def remove_nonaplhanumeric(text):
    text = re.sub('[^0-9a-zA-Z]+', ' ', text) 
    return text

alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
def normalize_alay(text):
    return ' '.join([alay_dict_map[word] if word in alay_dict_map else word for word in text.split(' ')])

print("remove_nonaplhanumeric: ", remove_nonaplhanumeric("Halooo,,,,, duniaa \x8f \xd2\1 !!"))
print("lowercase: ", lowercase("Halooo, duniaa!"))
print("remove_unnecessary_char: ", remove_unnecessary_char("Hehe\n\n RT USER USER apa kabs www.google.com\n  hehe URL xf8 x2a x89"))
print("normalize_alay: ", normalize_alay("aamiin adek abis"))

remove_nonaplhanumeric:  Halooo duniaa 
lowercase:  halooo, duniaa!
remove_unnecessary_char:  Hehe RT USER USER apa kabs hehe URL 
normalize_alay:  amin adik habis


In [11]:
def preprocess(text):
    text = lowercase(text) # 1
    text = remove_nonaplhanumeric(text) # 2
    text = remove_unnecessary_char(text) # 2
    text = normalize_alay(text) # 3

    return text

In [12]:
data['Tweet'] = data['Tweet'].apply(preprocess)
label_columns = data.columns[1:]

In [13]:
EPOCHS = 10
BATCH_SIZE = 64
LEARNING_RATE = 2e-5

In [14]:
def compute_metrics(p):
    preds = torch.tensor(p.predictions) # Sigmoid and threshold for multi-label
    labels = torch.tensor(p.label_ids)

    # Hamming accuracy: proportion of correctly predicted labels over total labels
    accuracy = (preds == labels).float().mean().item()

    # Standard multi-label precision, recall, and F1 metrics
    precision, recall, f1_micro, _ = precision_recall_fscore_support(labels, preds, average='micro', zero_division=0)
    _, _, f1_macro, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)

    report = classification_report(
        labels, 
        preds, 
        target_names=['HS', 'Abusive', 'HS_Individual', 'HS_Group', 'HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Weak', 'HS_Moderate', 'HS_Strong'],
        zero_division=0
    )   
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'report': report
    }

In [15]:
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128, use_float=True):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.use_float = use_float

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(labels, dtype=torch.float if self.use_float else torch.long)
        return item

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

In [16]:
def get_dataloaders(X_train, y_train, X_val, y_val, sequence_length, num_workers=4):
    train_dataset = HateSpeechDataset(X_train, y_train, tokenizer, max_length=sequence_length)
    val_dataset = HateSpeechDataset(X_val, y_val, tokenizer, max_length=sequence_length)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True, num_workers=num_workers
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=num_workers
    )

    return train_loader, val_loader

In [17]:
manager = Manager()
accuracies = manager.list()
f1_micros = manager.list()
f1_macros = manager.list()

In [18]:
def train_model(sequence_length, model_name, metrics, X_train, y_train, X_val, y_val, fold, seed=42, layers_freezed=6, num_workers=4):
    accelerator = Accelerator(mixed_precision='fp16')  # Initialize the accelerator
    device = accelerator.device

    with accelerator.main_process_first():
        model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=12,
            problem_type="multi_label_classification"
        )

    # Freeze the first few layers of the encoder
    for name, param in model.named_parameters():
        if "encoder.layer" in name:
            layer_num = name.split(".")[3]
            try:
                if int(layer_num) < layers_freezed:
                    param.requires_grad = False
            except ValueError:
                continue

    # Define DataLoaders
    train_loader, val_loader = get_dataloaders(X_train, y_train, X_val, y_val, sequence_length, num_workers)

    # Define optimizer and loss function
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    # Prepare everything with Accelerator
    model, optimizer, train_loader, val_loader = accelerator.prepare(
        model, optimizer, train_loader, val_loader
    )

    best_result = None
    start_time = time.time()

    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0

        for batch in train_loader:
            inputs = {key: val for key, val in batch.items() if key != 'labels'}
            labels = batch['labels']

            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels)
            accelerator.backward(loss)
            optimizer.step()

            epoch_loss += loss.item()

        # Evaluation
        model.eval()
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for batch in val_loader:
                inputs = {key: val for key, val in batch.items() if key != 'labels'}
                labels = batch['labels']
                
                outputs = model(**inputs)
                preds = torch.sigmoid(outputs.logits).round()

                # Gather predictions and labels from all devices
                all_preds.append(accelerator.gather(preds))
                all_labels.append(accelerator.gather(labels))

        all_preds = torch.cat(all_preds).cpu().numpy()
        all_labels = torch.cat(all_labels).cpu().numpy()
        
        result = compute_metrics(type('EvalOutput', (object,), {'predictions': all_preds, 'label_ids': all_labels}))

        if best_result is None or result['f1_micro'] >= best_result['f1_micro']:
            best_result = result
            
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                f'model-{fold + 1}',
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save,
            )

        accelerator.print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {round(epoch_loss / len(train_loader), 4)}, Accuracy: {round(result['accuracy'], 4)}, F1 Micro: {round(result['f1_micro'], 4)}, F1 Macro: {round(result['f1_macro'], 4)}")

    end_time = time.time()
    duration = end_time - start_time

    if accelerator.is_main_process:
        metrics[0].append(best_result['accuracy'])
        metrics[1].append(best_result['f1_micro'])
        metrics[2].append(best_result['f1_macro'])
        
    accelerator.print(f"\nAccuracy: {round(best_result['accuracy'], 4)}, F1 Micro: {round(best_result['f1_micro'], 4)}, F1 Macro: {round(best_result['f1_macro'], 4)}")
    accelerator.print(best_result['report'])
    accelerator.print(f"Duration: {duration}")

In [19]:
from sklearn.model_selection import KFold

N_SPLITS = 5 
RANDOM_SEED = 42

# Prepare data for K-Fold
label_columns = data.columns[1:]
X = data['Tweet'].values
y = data[label_columns].values
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

# Shared resources for this fold's processes
accuracies = manager.list()
f1_micros = manager.list()
f1_macros = manager.list()

for fold, (train_index, val_index) in enumerate(kf.split(X)):
    print("===============================================")
    print(f"STARTING FOLD {fold + 1}/{N_SPLITS}")
    print("===============================================")

    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    seed = RANDOM_SEED + fold
    set_seed(seed)
    args = (80, 'indobenchmark/indobert-base-p1', (accuracies, f1_micros, f1_macros), X_train_fold, y_train_fold, X_val_fold, y_val_fold, fold, seed, 6)
    notebook_launcher(train_model, args, num_processes=2)

STARTING FOLD 1/5
Launching training on 2 GPUs.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3793, Accuracy: 0.8888, F1 Micro: 0.6095, F1 Macro: 0.3054
Epoch 2/10, Train Loss: 0.2544, Accuracy: 0.9107, F1 Micro: 0.7144, F1 Macro: 0.5334
Epoch 3/10, Train Loss: 0.2044, Accuracy: 0.9218, F1 Micro: 0.7616, F1 Macro: 0.59
Epoch 4/10, Train Loss: 0.168, Accuracy: 0.9207, F1 Micro: 0.7764, F1 Macro: 0.6243
Epoch 5/10, Train Loss: 0.1417, Accuracy: 0.9248, F1 Micro: 0.7708, F1 Macro: 0.6344
Epoch 6/10, Train Loss: 0.1231, Accuracy: 0.9276, F1 Micro: 0.7877, F1 Macro: 0.6639
Epoch 7/10, Train Loss: 0.0999, Accuracy: 0.9268, F1 Micro: 0.7911, F1 Macro: 0.6971
Epoch 8/10, Train Loss: 0.0882, Accuracy: 0.9248, F1 Micro: 0.7815, F1 Macro: 0.6749
Epoch 9/10, Train Loss: 0.0761, Accuracy: 0.9288, F1 Micro: 0.7892, F1 Macro: 0.6912
Epoch 10/10, Train Loss: 0.0656, Accuracy: 0.9275, F1 Micro: 0.7865, F1 Macro: 0.7083

Accuracy: 0.9268, F1 Micro: 0.7911, F1 Macro: 0.6971
               precision    recall  f1-score   support

           HS       0.85      0.88      0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3776, Accuracy: 0.8891, F1 Micro: 0.6478, F1 Macro: 0.3467
Epoch 2/10, Train Loss: 0.2532, Accuracy: 0.9076, F1 Micro: 0.698, F1 Macro: 0.4949
Epoch 3/10, Train Loss: 0.1969, Accuracy: 0.9178, F1 Micro: 0.7496, F1 Macro: 0.5891
Epoch 4/10, Train Loss: 0.1669, Accuracy: 0.918, F1 Micro: 0.7565, F1 Macro: 0.5881
Epoch 5/10, Train Loss: 0.1422, Accuracy: 0.9187, F1 Micro: 0.7706, F1 Macro: 0.6556
Epoch 6/10, Train Loss: 0.1136, Accuracy: 0.9244, F1 Micro: 0.7769, F1 Macro: 0.6653
Epoch 7/10, Train Loss: 0.095, Accuracy: 0.9248, F1 Micro: 0.7761, F1 Macro: 0.6824
Epoch 8/10, Train Loss: 0.0843, Accuracy: 0.9215, F1 Micro: 0.7735, F1 Macro: 0.6873
Epoch 9/10, Train Loss: 0.0696, Accuracy: 0.9211, F1 Micro: 0.7736, F1 Macro: 0.6959
Epoch 10/10, Train Loss: 0.0575, Accuracy: 0.9223, F1 Micro: 0.7751, F1 Macro: 0.7101

Accuracy: 0.9244, F1 Micro: 0.7769, F1 Macro: 0.6653
               precision    recall  f1-score   support

           HS       0.82      0.87      0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3624, Accuracy: 0.8917, F1 Micro: 0.6554, F1 Macro: 0.4062
Epoch 2/10, Train Loss: 0.2488, Accuracy: 0.9117, F1 Micro: 0.7358, F1 Macro: 0.5443
Epoch 3/10, Train Loss: 0.2021, Accuracy: 0.9164, F1 Micro: 0.7326, F1 Macro: 0.5211
Epoch 4/10, Train Loss: 0.1703, Accuracy: 0.9213, F1 Micro: 0.7658, F1 Macro: 0.6045
Epoch 5/10, Train Loss: 0.1428, Accuracy: 0.9238, F1 Micro: 0.7806, F1 Macro: 0.6568
Epoch 6/10, Train Loss: 0.1207, Accuracy: 0.9264, F1 Micro: 0.7846, F1 Macro: 0.6771
Epoch 7/10, Train Loss: 0.0966, Accuracy: 0.9234, F1 Micro: 0.7819, F1 Macro: 0.6907
Epoch 8/10, Train Loss: 0.0813, Accuracy: 0.9234, F1 Micro: 0.7784, F1 Macro: 0.6889
Epoch 9/10, Train Loss: 0.0713, Accuracy: 0.9229, F1 Micro: 0.7787, F1 Macro: 0.6943
Epoch 10/10, Train Loss: 0.0607, Accuracy: 0.9256, F1 Micro: 0.7828, F1 Macro: 0.7109

Accuracy: 0.9264, F1 Micro: 0.7846, F1 Macro: 0.6771
               precision    recall  f1-score   support

           HS       0.85      0.87     

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3682, Accuracy: 0.895, F1 Micro: 0.6446, F1 Macro: 0.3825
Epoch 2/10, Train Loss: 0.2479, Accuracy: 0.9134, F1 Micro: 0.7351, F1 Macro: 0.5316
Epoch 3/10, Train Loss: 0.2003, Accuracy: 0.9204, F1 Micro: 0.7674, F1 Macro: 0.5985
Epoch 4/10, Train Loss: 0.1672, Accuracy: 0.924, F1 Micro: 0.7791, F1 Macro: 0.6099
Epoch 5/10, Train Loss: 0.1433, Accuracy: 0.9286, F1 Micro: 0.7798, F1 Macro: 0.6145
Epoch 6/10, Train Loss: 0.1169, Accuracy: 0.9228, F1 Micro: 0.7812, F1 Macro: 0.667
Epoch 7/10, Train Loss: 0.0949, Accuracy: 0.9246, F1 Micro: 0.7855, F1 Macro: 0.6853
Epoch 8/10, Train Loss: 0.0824, Accuracy: 0.9266, F1 Micro: 0.782, F1 Macro: 0.6644
Epoch 9/10, Train Loss: 0.0712, Accuracy: 0.9269, F1 Micro: 0.783, F1 Macro: 0.6694
Epoch 10/10, Train Loss: 0.0623, Accuracy: 0.9275, F1 Micro: 0.7853, F1 Macro: 0.6984

Accuracy: 0.9246, F1 Micro: 0.7855, F1 Macro: 0.6853
               precision    recall  f1-score   support

           HS       0.81      0.91      0.86

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3703, Accuracy: 0.8839, F1 Micro: 0.6525, F1 Macro: 0.3465
Epoch 2/10, Train Loss: 0.2494, Accuracy: 0.906, F1 Micro: 0.7126, F1 Macro: 0.5063
Epoch 3/10, Train Loss: 0.2031, Accuracy: 0.9169, F1 Micro: 0.7575, F1 Macro: 0.5909
Epoch 4/10, Train Loss: 0.1665, Accuracy: 0.9204, F1 Micro: 0.7719, F1 Macro: 0.635
Epoch 5/10, Train Loss: 0.1435, Accuracy: 0.9227, F1 Micro: 0.773, F1 Macro: 0.6727
Epoch 6/10, Train Loss: 0.1201, Accuracy: 0.9233, F1 Micro: 0.7826, F1 Macro: 0.6963
Epoch 7/10, Train Loss: 0.0964, Accuracy: 0.9203, F1 Micro: 0.7816, F1 Macro: 0.7035
Epoch 8/10, Train Loss: 0.0844, Accuracy: 0.9218, F1 Micro: 0.7829, F1 Macro: 0.7187
Epoch 9/10, Train Loss: 0.0708, Accuracy: 0.9212, F1 Micro: 0.7818, F1 Macro: 0.7235
Epoch 10/10, Train Loss: 0.0623, Accuracy: 0.9241, F1 Micro: 0.7862, F1 Macro: 0.7292

Accuracy: 0.9241, F1 Micro: 0.7862, F1 Macro: 0.7292
               precision    recall  f1-score   support

           HS       0.86      0.87      0.

In [20]:
results = pd.DataFrame({
    'Trial': [1,2,3,4,5],
    'Accuracy': list(accuracies),
    'F1 Micro': list(f1_micros),
    'F1 Macro': list(f1_macros),
})

results.to_csv(f'hsd-passive-kfold-result.csv', index=False)