In [1]:
import os
import math
import time
import torch
import random
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, classification_report
from accelerate import Accelerator, notebook_launcher
from torch.multiprocessing import Manager
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, BertConfig, BertForSequenceClassification

In [2]:
warnings.filterwarnings('ignore')

In [3]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic=True
    torch.backends.cudnn.benchmark=False

set_seed(42)

In [4]:
manager = Manager()

# Shared resources
accuracies = manager.list()
f1_micros = manager.list()
f1_macros = manager.list()
data_used = manager.list()
sampling_dur = manager.list()
new_samples = manager.list()

# Non shared resources
filename = 'netifier-lc'

# ORIGINAL DATASET

In [5]:
train_data = pd.read_csv('/kaggle/input/netifier-3/processed_train.csv', encoding='latin-1')
val_data = pd.read_csv('/kaggle/input/netifier-3/processed_test.csv', encoding='latin-1')

data = pd.concat([train_data, val_data], ignore_index=True)

In [6]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=False)

train_data = train_data.sort_values(by=['processed_text', 'pornografi', 'sara', 'radikalisme', 'pencemaran_nama_baik'])

train_labels = train_data.columns[2:6]
val_labels = val_data.columns[2:6]

# Extract features and labels for training and validation
X_train = train_data['processed_text'].values
y_train = train_data[train_labels].values
X_val = val_data['processed_text'].values
y_val = val_data[val_labels].values

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_train[-5:])

(6218,) (6218, 4)
(1555,) (1555, 4)
['yos di rt aku yang pacar tidak pernah di sedih rt rt nyata saja papua mau pisah tidak guna ikut negara kesatuan republik indonesia ruginya pasti'
 'yu rt siap di rt heh ketek acem jembut lebat pada mau ngaco tidak neh malam'
 'yups setuju kakak laki laki orang sekarang punya bnyak istri alsanya agama tapi knytanya tidak sesuai dengan printah agama krena yang dinikahin itu prempuan yang dari segi ekonominya trgolong mampu bukan dari janda miskin yang mang butuh untuk ditolong poligami hnyalah topeng untuk mnutupi nafsu belaka bukan brdasarkan apa yang nabi muhamad lakukakn'
 'zaman sekrng poligami sudah salah tidak ngikutin niatan nabi menyantunin anak yatim dn menikahi para janda yang ditingal wafat para suami yang gugur di brjuang'
 'zaman sudah berkembang kok di paksa pakai budaya abad pertengahan bang mamad boleh juga ibarat zaman sudah buat pesawat ke mars masi di paksa pakai busana ninja dan naik onta boleh juga sunguh terlalu boleh juga ancam

# CHECK ACQUIRED DATA

In [7]:
acquired_data = pd.read_csv('/kaggle/input/netifier-acquired-6218/netifier-lc-1-data-6218.csv')
acquired_data = acquired_data.sort_values(by=['processed_text', 'pornografi', 'sara', 'radikalisme', 'pencemaran_nama_baik'])

acq_X_train = acquired_data['processed_text'].values
acq_y_train = acquired_data[acquired_data.columns[1:]].values

print(acq_X_train.shape, acq_y_train.shape)
print(acq_X_train[-5:])

(6218,) (6218, 4)
['yos di rt aku yang pacar tidak pernah di sedih rt rt nyata saja papua mau pisah tidak guna ikut negara kesatuan republik indonesia ruginya pasti'
 'yu rt siap di rt heh ketek acem jembut lebat pada mau ngaco tidak neh malam'
 'yups setuju kakak laki laki orang sekarang punya bnyak istri alsanya agama tapi knytanya tidak sesuai dengan printah agama krena yang dinikahin itu prempuan yang dari segi ekonominya trgolong mampu bukan dari janda miskin yang mang butuh untuk ditolong poligami hnyalah topeng untuk mnutupi nafsu belaka bukan brdasarkan apa yang nabi muhamad lakukakn'
 'zaman sekrng poligami sudah salah tidak ngikutin niatan nabi menyantunin anak yatim dn menikahi para janda yang ditingal wafat para suami yang gugur di brjuang'
 'zaman sudah berkembang kok di paksa pakai budaya abad pertengahan bang mamad boleh juga ibarat zaman sudah buat pesawat ke mars masi di paksa pakai busana ninja dan naik onta boleh juga sunguh terlalu boleh juga ancam neraka bre boleh 

In [8]:
if np.array_equal(X_train, acq_X_train):
    print("X_train and acq_X_train contain the same elements (ignoring order).")
else:
    print("X_train and acq_X_train have different elements.")

set_X_train = set(X_train)
set_acq_X_train = set(acq_X_train)

print("Elements in X_train but not in acq_X_train:", len(set_X_train - set_acq_X_train))
print("Elements in acq_X_train but not in X_train:", len(set_acq_X_train - set_X_train))

diff_indices = np.where(X_train != acq_X_train)[0]  # Get indices where elements differ
print("Mismatched indices:", diff_indices)
print("X_train mismatches:", X_train[diff_indices])
print("acq_X_train mismatches:", acq_X_train[diff_indices])

X_train and acq_X_train contain the same elements (ignoring order).
Elements in X_train but not in acq_X_train: 0
Elements in acq_X_train but not in X_train: 0
Mismatched indices: []
X_train mismatches: []
acq_X_train mismatches: []


In [9]:
# Count how many labels differ in total
diff_count = np.sum(y_train != acq_y_train)
print(f"Number of different label values: {diff_count} out of {y_train.size}")

# Identify which rows have different labels
diff_rows = np.where(np.any(y_train != acq_y_train, axis=1))[0]
print(f"Number of rows with different labels: {len(diff_rows)} out of {len(y_train)}")

# Display a sample of the differences
if len(diff_rows) > 0:
    sample_size = min(10, len(diff_rows))
    sample_indices = diff_rows[:sample_size]
    
    print("\nSample of differences:")
    for idx in sample_indices:
        print(f"Text: {X_train[idx]}")
        print(f"Original labels: {y_train[idx]}")
        print(f"Acquired labels: {acq_y_train[idx]}")
        print(f"Difference: {y_train[idx] != acq_y_train[idx]}")
        print("-" * 50)
        
    # Calculate label-wise differences
    label_diffs = []
    for i in range(y_train.shape[1]):
        label_diff = np.sum(y_train[:, i] != acq_y_train[:, i])
        label_diffs.append((train_labels[i], label_diff))
        
    print("\nDifferences by label:")
    for label, diff in label_diffs:
        print(f"{label}: {diff} differences")

Number of different label values: 0 out of 24872
Number of rows with different labels: 0 out of 6218


# TRAIN TO VERIFY RESULT

In [10]:
EPOCHS = 3
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

In [11]:
# Define custom Dataset class
class NetifierDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=96, use_float=True):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.use_float = use_float

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(labels, dtype=torch.float if self.use_float else torch.long)
        return item

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

In [12]:
def compute_metrics(p):
    preds = torch.tensor(p.predictions) # Sigmoid and threshold for multi-label
    labels = torch.tensor(p.label_ids)

    # Hamming accuracy: proportion of correctly predicted labels over total labels
    hamming_accuracy = (preds == labels).float().mean().item()

    # Standard multi-label precision, recall, and F1 metrics
    precision, recall, f1_micro, _ = precision_recall_fscore_support(labels, preds, average='micro', zero_division=0)
    _, _, f1_macro, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)

    report = classification_report(
        labels, 
        preds, 
        target_names=['pornografi', 'sara', 'radikalisme', 'pencemaran_nama_baik'],
        zero_division=0
    ) 

    return {
        'accuracy': hamming_accuracy,
        'precision': precision,
        'recall': recall,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'report': report
    }

In [13]:
def get_dataloaders(train_x, train_y, val_x, val_y, sequence_length, num_workers=4):
    train_dataset = NetifierDataset(train_x, train_y, tokenizer, max_length=sequence_length)
    val_dataset = NetifierDataset(val_x, val_y, tokenizer, max_length=sequence_length)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True, num_workers=num_workers,
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=num_workers,
    )

    return train_loader, val_loader

In [None]:
def train_model(sequence_length, model_name, train_x, train_y, val_x, val_y, seed=42):
    set_seed(seed)
    accelerator = Accelerator(mixed_precision='fp16')  # Initialize the accelerator
    device = accelerator.device

    with accelerator.main_process_first():
        model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(train_labels),
            problem_type="multi_label_classification"
        )

    # Freeze the first few layers of the encoder
    for name, param in model.named_parameters():
        if "encoder.layer" in name:
            layer_num = name.split(".")[3]
            try:
                if int(layer_num) < 6:
                    param.requires_grad = False
            except ValueError:
                continue

    # Define DataLoaders
    train_loader, val_loader = get_dataloaders(train_x, train_y, val_x, val_y, sequence_length)

    # Define optimizer and loss function
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    # Prepare everything with Accelerator
    model, optimizer, train_loader, val_loader = accelerator.prepare(
        model, optimizer, train_loader, val_loader
    )

    best_result = None
    start_time = time.time()

    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0

        for batch in train_loader:
            inputs = {key: val for key, val in batch.items() if key != 'labels'}
            labels = batch['labels']

            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels)
            
            accelerator.backward(loss)
            optimizer.step()

            epoch_loss += loss.item()

        # Evaluation
        model.eval()
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for batch in val_loader:
                inputs = {key: val for key, val in batch.items() if key != 'labels'}
                labels = batch['labels']
                
                outputs = model(**inputs)
                preds = torch.sigmoid(outputs.logits).round()

                # Gather predictions and labels from all devices
                all_preds.append(accelerator.gather(preds))
                all_labels.append(accelerator.gather(labels))

        all_preds = torch.cat(all_preds).cpu().numpy()
        all_labels = torch.cat(all_labels).cpu().numpy()
        
        result = compute_metrics(type('EvalOutput', (object,), {'predictions': all_preds, 'label_ids': all_labels}))

        if best_result is None or result['f1_micro'] >= best_result['f1_micro']:
            accelerator.print("Higher F1 achieved, saving model")

            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                f'{filename}-model',
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save,
            )
            
            best_result = result

        accelerator.print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {round(epoch_loss / len(train_loader), 4)}, Accuracy: {round(result['accuracy'], 4)}, F1 Micro: {round(result['f1_micro'], 4)}, F1 Macro: {round(result['f1_macro'], 4)}")

    end_time = time.time()
    duration = end_time - start_time
        
    accelerator.print(f"\nAccuracy: {round(best_result['accuracy'], 4)}, F1 Micro: {round(best_result['f1_micro'], 4)}, F1 Macro: {round(best_result['f1_macro'], 4)}")
    accelerator.print(best_result['report'])
    accelerator.print(f"Duration: {duration}")

In [15]:
seed = 50

# Train on sorted original data

In [None]:
args = (96, 'indobenchmark/indobert-base-p1', X_train, y_train, X_val, y_val, seed)
notebook_launcher(train_model, args, num_processes=2)

Launching training on 2 GPUs.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Higher F1 achieved, saving model
Epoch 1/3, Train Loss: 0.3657, Accuracy: 0.8884, F1 Micro: 0.731, F1 Macro: 0.7274
Higher F1 achieved, saving model
Epoch 2/3, Train Loss: 0.2359, Accuracy: 0.898, F1 Micro: 0.7718, F1 Macro: 0.7702
Higher F1 achieved, saving model
Epoch 3/3, Train Loss: 0.2019, Accuracy: 0.9022, F1 Micro: 0.772, F1 Macro: 0.7641

Accuracy: 0.9022, F1 Micro: 0.772, F1 Macro: 0.7641
                      precision    recall  f1-score   support

          pornografi       0.90      0.91      0.90       370
                sara       0.68      0.62      0.65       248
         radikalisme       0.75      0.78      0.77       243
pencemaran_nama_baik       0.72      0.75      0.73       504

           micro avg       0.77      0.78      0.77      1365
           macro avg       0.76      0.77      0.76      1365
        weighted avg       0.77      0.78      0.77      1365
         samples avg       0.45      0.44      0.44      1365

Duration: 61.84658408164978


# Train on sorted acquired data

In [None]:
args = (96, 'indobenchmark/indobert-base-p1', acq_X_train, acq_y_train, X_val, y_val, seed)
notebook_launcher(train_model, args, num_processes=2)

Launching training on 2 GPUs.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Higher F1 achieved, saving model
Epoch 1/3, Train Loss: 0.3657, Accuracy: 0.8884, F1 Micro: 0.731, F1 Macro: 0.7274
Higher F1 achieved, saving model
Epoch 2/3, Train Loss: 0.2359, Accuracy: 0.898, F1 Micro: 0.7718, F1 Macro: 0.7702
Higher F1 achieved, saving model
Epoch 3/3, Train Loss: 0.2019, Accuracy: 0.9022, F1 Micro: 0.772, F1 Macro: 0.7641

Accuracy: 0.9022, F1 Micro: 0.772, F1 Macro: 0.7641
                      precision    recall  f1-score   support

          pornografi       0.90      0.91      0.90       370
                sara       0.68      0.62      0.65       248
         radikalisme       0.75      0.78      0.77       243
pencemaran_nama_baik       0.72      0.75      0.73       504

           micro avg       0.77      0.78      0.77      1365
           macro avg       0.76      0.77      0.76      1365
        weighted avg       0.77      0.78      0.77      1365
         samples avg       0.45      0.44      0.44      1365

Duration: 63.85293936729431
