In [1]:
import re
import torch
import time
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.multiprocessing import Manager
from torch.utils.data import DataLoader, Dataset
from accelerate import Accelerator, notebook_launcher
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, classification_report
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, BertConfig, BertForSequenceClassification

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings raised by any library are hidden as well.
import warnings
warnings.filterwarnings('ignore')

In [3]:
def set_seed(seed):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU and CUDA), and switches cuDNN to deterministic mode with
    auto-tuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN speed for repeatable kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [4]:
# Load the two pre-processed Netifier CSV splits, decoded as latin-1.
# Note: `val_data` is read from processed_test.csv.
train_data = pd.read_csv('/kaggle/input/netifier-3/processed_train.csv', encoding='latin-1')
val_data = pd.read_csv('/kaggle/input/netifier-3/processed_test.csv', encoding='latin-1')

# Pool both splits into one frame with a fresh 0..n-1 index; the K-fold cell
# further down re-splits `data` into train/validation folds.
data = pd.concat([train_data, val_data], ignore_index=True)

# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik,processed_text
0,[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e...,kaskus,0,0,0,1,jabar memang provinsi barokah boleh juga dan n...
1,"@verosvante kita2 aja nitizen yang pada kepo,t...",instagram,0,0,0,0,kita saja nitizen yang pada penasaran toh kelu...
2,"""#SidangAhok smg sipenista agama n ateknya mat...",twitter,0,1,1,1,sidangahok semoga sipenista agama dan ateknya ...
3,@bolususulembang.jkt barusan baca undang2 ini....,instagram,0,0,0,0,jakarta barusan baca undang ini tetap dibedaka...
4,bikin anak mulu lu nof \nkaga mikir apa kasian...,kaskus,0,0,0,0,buat anak melulu kamu nof nkaga mikir apa kasi...


In [5]:
# Training hyperparameters shared by every fold.
EPOCHS = 10  # number of passes over the training loader in train_model
BATCH_SIZE = 32  # batch size given to both DataLoaders in get_dataloaders
LEARNING_RATE = 2e-5  # learning rate passed to AdamW in train_model

In [6]:
# Define custom Dataset class
class NetifierDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its labels.

    Args:
        texts: Integer-indexable collection of input strings.
        labels: Integer-indexable collection of label rows, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, truncation, padding, max_length,
            return_tensors)`` and returning a mapping of tensors whose first
            dimension is a batch dimension of size 1.
        max_length: Length every sequence is padded/truncated to.
        use_float: When True the labels tensor is ``torch.float``; otherwise
            ``torch.long``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=96, use_float=True):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.use_float = use_float

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for sample ``idx`` plus a ``'labels'`` tensor."""
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension whenever max_length == 1.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        label_dtype = torch.float if self.use_float else torch.long
        item['labels'] = torch.tensor(labels, dtype=label_dtype)
        return item

# Initialize the IndoBERT tokenizer (the model itself is loaded later, inside train_model)
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

In [7]:
def compute_metrics(p):
    """Score multi-label predictions against their reference labels.

    Args:
        p: Object exposing ``predictions`` and ``label_ids``. The predictions
            are compared to the labels as-is, so they must already be
            binarized by the caller; no sigmoid or threshold is applied here.

    Returns:
        dict with keys ``accuracy`` (fraction of matching individual label
        entries), ``precision``, ``recall`` and ``f1_micro`` (micro-averaged),
        ``f1_macro`` (macro-averaged) and ``report`` (text classification
        report with one row per label).
    """
    label_names = ['pornografi', 'sara', 'radikalisme', 'pencemaran_nama_baik']

    y_pred = torch.tensor(p.predictions)
    y_true = torch.tensor(p.label_ids)

    # Element-wise agreement over every (sample, label) entry.
    element_accuracy = (y_pred == y_true).float().mean().item()

    # Standard multi-label precision, recall, and F1 metrics
    micro_scores = precision_recall_fscore_support(y_true, y_pred, average='micro', zero_division=0)
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)

    metrics = {
        'accuracy': element_accuracy,
        'precision': micro_scores[0],
        'recall': micro_scores[1],
        'f1_micro': micro_scores[2],
        'f1_macro': macro_scores[2],
    }
    metrics['report'] = classification_report(
        y_true,
        y_pred,
        target_names=label_names,
        zero_division=0,
    )
    return metrics

In [8]:
def seed_worker(worker_id):
    """Seed NumPy and Python's ``random`` inside a DataLoader worker.

    Intended for use as ``worker_init_fn``. The seed is derived from
    ``torch.initial_seed()`` instead of a hard-coded constant, so it follows
    whatever seed PyTorch is currently using (for example the per-fold seed)
    rather than always starting from 42.

    Args:
        worker_id: Worker index supplied by the DataLoader. It is not used
            directly: per the PyTorch reproducibility notes, the value
            returned by ``torch.initial_seed()`` inside a worker already
            differs per worker.
    """
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    # NumPy only accepts 32-bit seeds, hence the modulo.
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

def get_dataloaders(X_train, y_train, X_val, y_val, sequence_length, num_workers=4):
    """Build the training and validation DataLoaders for one split.

    Both splits are wrapped in ``NetifierDataset`` using the module-level
    ``tokenizer`` and batched with ``BATCH_SIZE``. Only the training loader
    shuffles.

    Args:
        X_train, y_train: Training texts and label rows.
        X_val, y_val: Validation texts and label rows.
        sequence_length: Passed to the datasets as ``max_length``.
        num_workers: Worker process count for each loader.

    Returns:
        Tuple ``(train_loader, val_loader)``.
    """
    # Options that are identical for both loaders.
    shared_options = dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=num_workers,
        worker_init_fn=seed_worker,
    )

    training_set = NetifierDataset(X_train, y_train, tokenizer, max_length=sequence_length)
    validation_set = NetifierDataset(X_val, y_val, tokenizer, max_length=sequence_length)

    training_loader = torch.utils.data.DataLoader(training_set, shuffle=True, **shared_options)
    validation_loader = torch.utils.data.DataLoader(validation_set, shuffle=False, **shared_options)

    return training_loader, validation_loader

In [9]:
# Manager-backed list proxies used to collect per-fold scores. They are passed
# to train_model as `metrics`, where the main process appends the best
# accuracy, micro-F1 and macro-F1 of each fold.
# NOTE(review): the K-fold cell re-creates these three lists before its loop,
# so only `manager` from this cell is strictly required there.
manager = Manager()
accuracies = manager.list()
f1_micros = manager.list()
f1_macros = manager.list()

In [10]:
def train_model(sequence_length, model_name, metrics, X_train, y_train, X_val, y_val, fold, seed=42, layers_freezed=6, num_workers=4):
    """Fine-tune a BERT multi-label classifier on one fold and record its best scores.

    Written to run inside each process started by ``notebook_launcher``. After
    every epoch the model is evaluated on the validation data; whenever the
    micro-F1 is at least as good as the best so far, the metrics are kept and
    the model is saved to ``model-{fold + 1}``.

    Args:
        sequence_length: Passed to the datasets as ``max_length``.
        model_name: Checkpoint identifier given to
            ``BertForSequenceClassification.from_pretrained``.
        metrics: Indexable of three appendable containers. The main process
            appends the best accuracy, micro-F1 and macro-F1 to positions
            0, 1 and 2 respectively.
        X_train, y_train: Training texts and label rows.
        X_val, y_val: Validation texts and label rows.
        fold: Zero-based fold index, used only for the checkpoint directory name.
        seed: Seed applied with ``set_seed`` inside this process.
        layers_freezed: Encoder layers whose index is below this value are frozen.
        num_workers: Worker process count for each DataLoader.

    Returns:
        None. Results are reported through ``metrics``, printed output and the
        saved checkpoint.

    NOTE(review): the epoch is selected on the same validation fold whose
    scores are reported, so the recorded numbers are optimistic.
    """
    accelerator = Accelerator(mixed_precision='fp16')  # Initialize the accelerator

    # Honour the `seed` argument inside the launched process, before the
    # randomly initialised classifier head is created.
    set_seed(seed)

    with accelerator.main_process_first():
        model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=4,
            problem_type="multi_label_classification"
        )

    # Freeze the first few layers of the encoder. The layer index is read from
    # the parameter name itself, not from a fixed position in the dotted path.
    encoder_layer_pattern = re.compile(r"encoder\.layer\.(\d+)\.")
    for name, param in model.named_parameters():
        layer_match = encoder_layer_pattern.search(name)
        if layer_match and int(layer_match.group(1)) < layers_freezed:
            param.requires_grad = False

    # Define DataLoaders
    train_loader, val_loader = get_dataloaders(X_train, y_train, X_val, y_val, sequence_length, num_workers)

    # Define optimizer and loss function
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    # Prepare everything with Accelerator
    model, optimizer, train_loader, val_loader = accelerator.prepare(
        model, optimizer, train_loader, val_loader
    )

    best_result = None
    start_time = time.time()

    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0

        for batch in train_loader:
            inputs = {key: val for key, val in batch.items() if key != 'labels'}
            labels = batch['labels']

            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels)
            accelerator.backward(loss)
            optimizer.step()

            epoch_loss += loss.item()

        # Evaluation
        model.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in val_loader:
                inputs = {key: val for key, val in batch.items() if key != 'labels'}
                labels = batch['labels']

                outputs = model(**inputs)
                # Binarize at probability 0.5.
                preds = torch.sigmoid(outputs.logits).round()

                # Gather predictions and labels from all devices.
                # gather_for_metrics drops the samples Accelerate duplicates to
                # even out the last batch across processes, so each validation
                # sample is scored exactly once.
                gathered_preds, gathered_labels = accelerator.gather_for_metrics((preds, labels))
                all_preds.append(gathered_preds)
                all_labels.append(gathered_labels)

        all_preds = torch.cat(all_preds).cpu().numpy()
        all_labels = torch.cat(all_labels).cpu().numpy()

        # compute_metrics expects an object with `predictions` and `label_ids` attributes.
        result = compute_metrics(type('EvalOutput', (object,), {'predictions': all_preds, 'label_ids': all_labels}))

        # `>=` means a tie keeps the later epoch.
        if best_result is None or result['f1_micro'] >= best_result['f1_micro']:
            best_result = result

            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                f'model-{fold + 1}',
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save,
            )

        accelerator.print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {round(epoch_loss / len(train_loader), 4)}, Accuracy: {round(result['accuracy'], 4)}, F1 Micro: {round(result['f1_micro'], 4)}, F1 Macro: {round(result['f1_macro'], 4)}")

    end_time = time.time()
    duration = end_time - start_time

    # Record once per fold: only the main process writes to the shared containers.
    if accelerator.is_main_process:
        metrics[0].append(best_result['accuracy'])
        metrics[1].append(best_result['f1_micro'])
        metrics[2].append(best_result['f1_macro'])

    accelerator.print(f"\nAccuracy: {round(best_result['accuracy'], 4)}, F1 Micro: {round(best_result['f1_micro'], 4)}, F1 Macro: {round(best_result['f1_macro'], 4)}")
    accelerator.print(best_result['report'])
    accelerator.print(f"Duration: {duration}")

In [11]:
from sklearn.model_selection import KFold

N_SPLITS = 5
RANDOM_SEED = 42

# Training configuration forwarded positionally to train_model.
SEQUENCE_LENGTH = 96
MODEL_NAME = 'indobenchmark/indobert-base-p1'
LAYERS_FROZEN = 6
NUM_PROCESSES = 2

# Prepare data for K-Fold
# Labels are selected by name instead of by column position, so a reordered or
# extended CSV cannot silently shift which columns are treated as targets.
label_columns = ['pornografi', 'sara', 'radikalisme', 'pencemaran_nama_baik']
missing_columns = [col for col in label_columns + ['processed_text'] if col not in data.columns]
if missing_columns:
    raise KeyError(f"Expected columns missing from data: {missing_columns}")

X = data['processed_text'].values
y = data[label_columns].values
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

# Shared result lists, created once and appended to by every fold. Re-creating
# them here keeps the cell idempotent when it is re-run.
accuracies = manager.list()
f1_micros = manager.list()
f1_macros = manager.list()

for fold, (train_index, val_index) in enumerate(kf.split(X)):
    print("===============================================")
    print(f"STARTING FOLD {fold + 1}/{N_SPLITS}")
    print("===============================================")

    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Each fold uses its own seed, derived from the base seed.
    seed = RANDOM_SEED + fold
    set_seed(seed)
    args = (
        SEQUENCE_LENGTH,
        MODEL_NAME,
        (accuracies, f1_micros, f1_macros),
        X_train_fold,
        y_train_fold,
        X_val_fold,
        y_val_fold,
        fold,
        seed,
        LAYERS_FROZEN,
    )
    notebook_launcher(train_model, args, num_processes=NUM_PROCESSES)

STARTING FOLD 1/5
Launching training on 2 GPUs.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3569, Accuracy: 0.8891, F1 Micro: 0.7227, F1 Macro: 0.716
Epoch 2/10, Train Loss: 0.2266, Accuracy: 0.8991, F1 Micro: 0.7414, F1 Macro: 0.7351
Epoch 3/10, Train Loss: 0.1944, Accuracy: 0.9075, F1 Micro: 0.7725, F1 Macro: 0.7698
Epoch 4/10, Train Loss: 0.1505, Accuracy: 0.9064, F1 Micro: 0.7652, F1 Macro: 0.755
Epoch 5/10, Train Loss: 0.1072, Accuracy: 0.9055, F1 Micro: 0.7794, F1 Macro: 0.7766
Epoch 6/10, Train Loss: 0.0829, Accuracy: 0.9047, F1 Micro: 0.7804, F1 Macro: 0.7799
Epoch 7/10, Train Loss: 0.0654, Accuracy: 0.9042, F1 Micro: 0.7744, F1 Macro: 0.7712
Epoch 8/10, Train Loss: 0.0504, Accuracy: 0.905, F1 Micro: 0.7632, F1 Macro: 0.7577
Epoch 9/10, Train Loss: 0.037, Accuracy: 0.9044, F1 Micro: 0.7723, F1 Macro: 0.7726
Epoch 10/10, Train Loss: 0.0257, Accuracy: 0.902, F1 Micro: 0.7571, F1 Macro: 0.7489

Accuracy: 0.9047, F1 Micro: 0.7804, F1 Macro: 0.7799
                      precision    recall  f1-score   support

          pornografi       0.94      

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3567, Accuracy: 0.8881, F1 Micro: 0.7034, F1 Macro: 0.6723
Epoch 2/10, Train Loss: 0.2304, Accuracy: 0.9039, F1 Micro: 0.7564, F1 Macro: 0.7495
Epoch 3/10, Train Loss: 0.1879, Accuracy: 0.9111, F1 Micro: 0.7974, F1 Macro: 0.795
Epoch 4/10, Train Loss: 0.1534, Accuracy: 0.9114, F1 Micro: 0.7936, F1 Macro: 0.7842
Epoch 5/10, Train Loss: 0.1168, Accuracy: 0.9111, F1 Micro: 0.789, F1 Macro: 0.7838
Epoch 6/10, Train Loss: 0.0789, Accuracy: 0.908, F1 Micro: 0.7787, F1 Macro: 0.7713
Epoch 7/10, Train Loss: 0.0608, Accuracy: 0.9106, F1 Micro: 0.784, F1 Macro: 0.7723
Epoch 8/10, Train Loss: 0.0422, Accuracy: 0.9086, F1 Micro: 0.7829, F1 Macro: 0.7782
Epoch 9/10, Train Loss: 0.0355, Accuracy: 0.91, F1 Micro: 0.7947, F1 Macro: 0.7905
Epoch 10/10, Train Loss: 0.0266, Accuracy: 0.9052, F1 Micro: 0.789, F1 Macro: 0.7822

Accuracy: 0.9111, F1 Micro: 0.7974, F1 Macro: 0.795
                      precision    recall  f1-score   support

          pornografi       0.94      0.9

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3765, Accuracy: 0.8878, F1 Micro: 0.7341, F1 Macro: 0.7288
Epoch 2/10, Train Loss: 0.2331, Accuracy: 0.8955, F1 Micro: 0.7678, F1 Macro: 0.7691
Epoch 3/10, Train Loss: 0.1933, Accuracy: 0.8966, F1 Micro: 0.7573, F1 Macro: 0.7523
Epoch 4/10, Train Loss: 0.1524, Accuracy: 0.8986, F1 Micro: 0.7685, F1 Macro: 0.7707
Epoch 5/10, Train Loss: 0.1145, Accuracy: 0.8922, F1 Micro: 0.7637, F1 Macro: 0.7606
Epoch 6/10, Train Loss: 0.0849, Accuracy: 0.8973, F1 Micro: 0.7638, F1 Macro: 0.7654
Epoch 7/10, Train Loss: 0.0653, Accuracy: 0.8906, F1 Micro: 0.7583, F1 Macro: 0.7582
Epoch 8/10, Train Loss: 0.0484, Accuracy: 0.8944, F1 Micro: 0.7674, F1 Macro: 0.7689
Epoch 9/10, Train Loss: 0.0345, Accuracy: 0.8933, F1 Micro: 0.7514, F1 Macro: 0.75
Epoch 10/10, Train Loss: 0.0302, Accuracy: 0.8911, F1 Micro: 0.7656, F1 Macro: 0.7671

Accuracy: 0.8986, F1 Micro: 0.7685, F1 Macro: 0.7707
                      precision    recall  f1-score   support

          pornografi       0.91   

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3457, Accuracy: 0.8856, F1 Micro: 0.7391, F1 Macro: 0.737
Epoch 2/10, Train Loss: 0.2255, Accuracy: 0.8948, F1 Micro: 0.7759, F1 Macro: 0.7773
Epoch 3/10, Train Loss: 0.1913, Accuracy: 0.9067, F1 Micro: 0.7831, F1 Macro: 0.7765
Epoch 4/10, Train Loss: 0.1516, Accuracy: 0.903, F1 Micro: 0.7746, F1 Macro: 0.7699
Epoch 5/10, Train Loss: 0.1132, Accuracy: 0.9056, F1 Micro: 0.7781, F1 Macro: 0.7732
Epoch 6/10, Train Loss: 0.0838, Accuracy: 0.9011, F1 Micro: 0.7753, F1 Macro: 0.7752
Epoch 7/10, Train Loss: 0.0598, Accuracy: 0.903, F1 Micro: 0.7744, F1 Macro: 0.7699
Epoch 8/10, Train Loss: 0.0422, Accuracy: 0.9033, F1 Micro: 0.7771, F1 Macro: 0.7723
Epoch 9/10, Train Loss: 0.0339, Accuracy: 0.9, F1 Micro: 0.769, F1 Macro: 0.7661
Epoch 10/10, Train Loss: 0.0256, Accuracy: 0.8995, F1 Micro: 0.7803, F1 Macro: 0.7792

Accuracy: 0.9067, F1 Micro: 0.7831, F1 Macro: 0.7765
                      precision    recall  f1-score   support

          pornografi       0.92      0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 0.3575, Accuracy: 0.8938, F1 Micro: 0.748, F1 Macro: 0.7457
Epoch 2/10, Train Loss: 0.231, Accuracy: 0.9058, F1 Micro: 0.7736, F1 Macro: 0.7738
Epoch 3/10, Train Loss: 0.1854, Accuracy: 0.8958, F1 Micro: 0.7731, F1 Macro: 0.7738
Epoch 4/10, Train Loss: 0.1495, Accuracy: 0.91, F1 Micro: 0.7812, F1 Macro: 0.7753
Epoch 5/10, Train Loss: 0.1145, Accuracy: 0.9089, F1 Micro: 0.7838, F1 Macro: 0.7836
Epoch 6/10, Train Loss: 0.084, Accuracy: 0.9103, F1 Micro: 0.7816, F1 Macro: 0.7812
Epoch 7/10, Train Loss: 0.0633, Accuracy: 0.9072, F1 Micro: 0.7793, F1 Macro: 0.7773
Epoch 8/10, Train Loss: 0.0463, Accuracy: 0.9067, F1 Micro: 0.7745, F1 Macro: 0.7697
Epoch 9/10, Train Loss: 0.0385, Accuracy: 0.9038, F1 Micro: 0.7819, F1 Macro: 0.7814
Epoch 10/10, Train Loss: 0.0289, Accuracy: 0.9064, F1 Micro: 0.779, F1 Macro: 0.7777

Accuracy: 0.9089, F1 Micro: 0.7838, F1 Macro: 0.7836
                      precision    recall  f1-score   support

          pornografi       0.95      0

In [12]:
# One row per recorded fold. The trial numbers are derived from the number of
# collected scores instead of being hard-coded to five, so the table still
# builds when the number of completed folds differs.
n_recorded_folds = len(accuracies)
results = pd.DataFrame({
    'Trial': list(range(1, n_recorded_folds + 1)),
    'Accuracy': list(accuracies),
    'F1 Micro': list(f1_micros),
    'F1 Macro': list(f1_macros),
})

results.to_csv('netifier-passive-kfold-result.csv', index=False)