## 1. Download Packages & Resource
Download packages yang diperlukan untuk preprocessing, model ataupun training.

In [227]:
# FOR SOME SECTION
import torch
import os

# FOR PREPROCESSING SECTION
import re
import json
import nltk
import random
import pandas as pd
from nltk.corpus import stopwords
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# from nltk.stem import WordNetLemmatizer
# from simplemma import text_lemmatize
from torch.utils.data import Dataset
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer

# FOR MODEL SECTION
from torch import nn # from torch import torch.nn as nn
import torch.nn.functional as F

# FOR TRAIN SECTION
import wandb
import argparse
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm, trange
from datetime import datetime
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
# from muon import SingleDeviceMuonWithAuxAdam

In [228]:
# Download NLTK yang diperlukan
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('tokenizers/punkt_tab')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')
    nltk.download('stopwords')
    nltk.download('wordnet')

STOPWORDS = set(stopwords.words('indonesian'))

## 2. Dataset Class
Kelas dari dataset yang mana dilakukan proses preprocessing.

In [229]:
class CyberbullyingDataset(Dataset):
    def __init__(
            self,
            file_path="../dataset/cyberbullying.csv",
            tokenizer_name="indobenchmark/indobert-base-p1",
            folds_file="k_folds.json",
            random_state=29082002,
            split="train",
            fold=0,
            n_folds=5,
            max_length=128,
            augmentasi_file="../dataset/dictionary/augmentation.json",
            slang_word_file="../dataset/dictionary/slang-word-specific.json",
    ):        
        self.file_path = file_path
        self.folds_file = folds_file
        self.random_state = random_state
        self.split = split
        self.fold = fold
        self.n_folds = n_folds
        self.max_length = max_length
        self.augmentasi_data = self.load_file(augmentasi_file)
        self.slang_dict = self.load_file(slang_word_file)
        
        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.vocab_size = len(self.tokenizer)

        # Load dataset
        self.load_data()
        # Setup n-Fold Cross Validation
        self.setup_folds()
        # Mempersiapkan Indices (bentuk jamak index)
        self.setup_indices()

    def __len__(self):
        return len(self.indices)
    
    def __getitem__(self, idx):
        # Mengambil index dari data yang akan diambil
        idx = self.indices[idx]
        # Mengambil data komentar dan sentiment
        text = str(self.df.iloc[idx]["comment"])
        label = str(self.df.iloc[idx]["sentiment"])
        # Melakukan Pre-Processing
        comment_processed = self.preprocess(text)
        # Tokenisasi
        encoding = self.tokenizer(
            comment_processed,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        data = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(int(label), dtype=torch.long),
            'original_text': text,
            'processed_text': comment_processed,
            'original_index': idx
        }
        return data

    def load_file(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
        
        # Load dictionary slang dari file JSON
        with open('slang_dictionary.json', 'r') as file:
            slang_dict = json.load(file)

    def aeda_augment(self, text):
        """
        Melakukan augmentasi teks dengan metode AEDA:
        Menyisipkan tanda baca secara acak di posisi acak dalam teks.
        """
        punctuations = [".", ";", "?", ":", "!", ","]
        words = text.split()
        if len(words) == 0:
            return text
    
        # # Tentukan berapa banyak tanda baca yang akan disisipkan
        # n_insert = random.randint(1, max(1, len(words) // 3))
        
        # # Pilih posisi acak untuk sisipan
        # positions = random.sample(range(len(words)), n_insert)
        # positions.sort(reverse=True)  # disisipkan dari belakang biar indeks tidak bergeser
    
        # for pos in positions:
        #     punct = random.choice(punctuations)
        #     words.insert(pos, punct)

        # Pilih posisi acak untuk sisipan
        position = random.randint(0, len(words) - 1)
        
        # Pilih tanda baca acak
        punct = random.choice(punctuations)
        
        # Sisipkan tanda baca ke dalam list kata
        words.insert(position, punct)
    
        return " ".join(words)

    def random_typo(self, text):
        words = text.split()
        if len(words) < 1:
            return text
        idx = random.randint(0, len(words) - 1)
        word = words[idx]
        if len(word) > 1:
            char_list = list(word)
            i = random.randint(0, len(char_list) - 2)
            char_list[i], char_list[i+1] = char_list[i+1], char_list[i]  # swap 2 huruf berdekatan
            words[idx] = ''.join(char_list)
        return ' '.join(words)

    def random_swap(self, text):
        words = text.split()
        if len(words) < 2:
            return text
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
        return ' '.join(words)

    def random_delete(self, text):
        words = text.split()
        if len(words) <= 1:
            return text
        idx = random.randint(0, len(words) - 1)
        del words[idx]
        return ' '.join(words)

    def augmentation_text(self, text, probability=0.5):
        # Hanya lakukan augmentasi dengan probabilitas tertentu
        if random.random() > probability:
            return text

        # PROBABILITAS ACAK
        # Daftar semua fungsi augmentasi yang tersedia
        augmentations = [
            self.aeda_augment,
            self.random_typo,
            self.random_swap,
            self.random_delete
        ]
        
        # Pecah kalimat menjadi kumpulan kata
        words = text.split()
        # Tentukan berapa banyak augmentasi yang akan dilakukan
        n_insert = random.randint(1, max(1, len(words) // 3))
        for i in range(n_insert):
            # Pilih satu augmentasi secara acak
            augmentation_func = random.choice(augmentations)
            # Terapkan augmentasi yang dipilih
            text = augmentation_func(text)
        return text

        # # Pilih satu augmentasi secara acak
        # augmentation_func = random.choice(augmentations)
        # # Mengembalikan augmentasi yang dipilih
        # return augmentation_func(text)

    def preprocess(self, text):
        # Konversi ke huruf kecil
        text = text.lower()

        # Hapus mention (@...) dan hashtag (#...) => ada kolom comment yang #VALUE!
        text = re.sub(r'@\w+|#\w+', '', text)

        # # Hapus emoji dan karakter non-ASCII
        text = re.sub(r'[^\x00-\x7F]+', '', text)

        # Stemming
        # factory = StemmerFactory()
        # stemmer = factory.create_stemmer()
        # text   = stemmer.stem(text)

        # Augmentasi
        text = self.augmentation_text(text)
        
        # Tokenisasi
        words = nltk.word_tokenize(text)

        # Lemmatization
        # lemmatizer = WordNetLemmatizer()
        # words = [lemmatizer.lemmatize(word) for word in words]

        # Menghapus stopwords
        words = [word for word in words if word not in STOPWORDS]

        # Menggabungkan kembali kata-kata menjadi kalimat
        text = ' '.join(words)

        return text

    def setup_indices(self):
        '''
        Mempersiapkan indices untuk data yang akan di-training
        '''
        fold_key = f"fold_{self.fold}"
        if self.split == "train":
            self.indices = self.fold_indices[fold_key]['train_indices']
        else:
            self.indices = self.fold_indices[fold_key]['val_indices']

    def setup_folds(self):
        # Jika fold sudah ada, maka load fold
        if os.path.exists(self.folds_file):
            self.load_folds()
        # Jika tidak ada, maka buat fold
        else:
            self.create_folds()

    def load_folds(self):
        '''
        Apabila fold sudah ada, maka load fold
        '''
        with open(self.folds_file, 'r') as f:
            fold_data = json.load(f)
        self.fold_indices = fold_data['fold_indices']
        print(f"Menggunakan {fold_data['n_folds']} folds dengan {fold_data['n_samples']} samples")
    
    def create_folds(self):
        '''
        Apabila fold sudah ada, maka load fold
        '''
        print(f"Membuat n-fold CV dengan random state {self.random_state}")
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)

        # print("\nStratified k-fold positif samples per fold:")
        # for _, val_idx in skf.split(self.df, self.df['sentiment']):
        #     print(f"{np.sum(self.df['sentiment'].iloc[val_idx] == 1)} out of {len(val_idx)}")

        fold_indices = {}
        for fold, (train_idx, val_idx) in enumerate(skf.split(self.df, self.df['sentiment'])):
            fold_indices[f"fold_{fold}"] = {
                'train_indices': train_idx.tolist(),
                'val_indices': val_idx.tolist()
            }
        
        # Simpan fold ke file
        with open(self.folds_file, 'w') as f:
            json.dump({
                'fold_indices': fold_indices,
                'n_samples': len(self.df),
                'n_folds': self.n_folds,
                'random_state': self.random_state
            }, f)

            self.fold_indices = fold_indices
            print(f'Created {self.n_folds}-fold indices and saved to {self.folds_file}')

    def load_data(self):
        print(f'Loading data from {self.file_path}...')
        self.df = pd.read_csv(self.file_path) # Load csv
        # self.df.columns = ['sentiment', 'comment'] # Rename columns
        if len(self.df.columns) == 2:
            self.df.columns = ['sentiment', 'comment']
        else:
            print("‚ö†Ô∏è Jumlah kolom tidak sesuai, kolom asli:", self.df.columns)
        self.df = self.df.dropna(subset=['sentiment', 'comment']) # Drop NaN values
        self.df['sentiment'] = self.df['sentiment'].astype(int) # Convert sentiment to int
        self.df['sentiment'] = self.df['sentiment'].apply(lambda x: 1 if x == -1 else 0) # Transform labels: convert -1 to 1, and 1 to 0
        self.df = self.df[(self.df['sentiment'] == 0) | (self.df['sentiment'] == 1)] # Filter sentiment

        # Undersampling menyeimbangkan dataset (hanya untuk split "train")
        if self.split == "train":
            df_label_0 = self.df[self.df['sentiment'] == 0]
            df_label_1 = self.df[self.df['sentiment'] == 1]

            min_samples_per_class = min(len(df_label_0), len(df_label_1))

            df_label_0_undersampled = df_label_0.sample(n=min_samples_per_class, random_state=self.random_state)
            df_label_1_undersampled = df_label_1.sample(n=min_samples_per_class, random_state=self.random_state)

            self.df = pd.concat([df_label_0_undersampled, df_label_1_undersampled])
            self.df = self.df.sample(frac=1, random_state=self.random_state).reset_index(drop=True)


## 2x. Main Section Preprocessing
Bagian utama untuk menjalankan program.

In [230]:
# if __name__ == "__main__":
#     dataset = CyberbullyingDataset(fold=0, split="train")
#     data = dataset[0]
#     print(data)

## 3. Model Class
Kelas dari model machine learning yang akan di training.

In [231]:
class TextCNNLight(nn.Module):
    """
    TextCNNLight is a lightweight version of TextCNN for text classification.
    vocab_size: Size of the vocabulary
    embed_dim: Dimension of the word embeddings
    num_classes: Number of output classes
    output_dim: Dimension of the output after convolution
    kernel_size: List of kernel sizes for convolutional layers
    This model uses two convolutional layers with kernel sizes 3 and 4,panjang jendela (window) digunakan filter untuk melihat urutan kata secara lokal.
    followed by max pooling and a fully connected layer.
    The output dimension is set to 64, and dropout is applied to prevent overfitting.
    The model is designed to be efficient and suitable for smaller datasets or real-time applications.
    """
    def __init__(self, vocab_size, embed_dim=100, num_classes=2, output_dim=64, dropout_rate=0.3):
        super(TextCNNLight, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(embed_dim, output_dim, kernel_size=3)
        self.conv2 = nn.Conv1d(embed_dim, output_dim, kernel_size=4)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear((output_dim * 2), num_classes)

    def forward(self, x): # Input dan output layer tidak ditulis eksplisit
        x = self.embedding(x) # Embedding
        x = x.permute(0, 2, 1) # Permute
        x1 = F.relu(self.conv1(x)).max(dim=2)[0] # Convolution 1
        x2 = F.relu(self.conv2(x)).max(dim=2)[0] # Convolution 2
        x = torch.cat((x1, x2), dim=1) # Concatenate (gabungkan fitur dari 2 konvolusi)
        x = self.dropout(x) # Dropout
        return self.fc(x) # Fully Connected

# PERCOBAAN 1
class TextCNNLightResNorm(nn.Module):
    """
    TextCNNLightResNorm
    - Gabungan TextCNNLight + BatchNorm + Residual connection
    - Stabil, ringan, cocok untuk teks pendek seperti komentar media sosial.
    """

    def __init__(self, vocab_size, embed_dim=100, num_classes=2, output_dim=64, dropout_rate=0.3):
        super(TextCNNLightResNorm, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Dua layer konvolusi dengan ukuran kernel berbeda
        self.conv1 = nn.Conv1d(embed_dim, output_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, output_dim, kernel_size=4, padding=1)

        # Normalisasi batch setelah konvolusi
        self.bn1 = nn.BatchNorm1d(output_dim)
        self.bn2 = nn.BatchNorm1d(output_dim)

        # Shortcut projection agar dimensi sama (residual)
        self.shortcut = nn.Linear(embed_dim, output_dim * 2)

        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(output_dim * 2, num_classes)

    def forward(self, x):
        # 1Ô∏è‚É£ Embedding
        x_embed = self.embedding(x)  # (batch, seq_len, embed_dim)
        x_embed_t = x_embed.permute(0, 2, 1)  # (batch, embed_dim, seq_len)

        # 2Ô∏è‚É£ Convolution + ReLU + BatchNorm + Global Max Pooling
        x1 = F.relu(self.bn1(self.conv1(x_embed_t))).max(dim=2)[0]
        x2 = F.relu(self.bn2(self.conv2(x_embed_t))).max(dim=2)[0]

        # 3Ô∏è‚É£ Concatenate hasil konvolusi
        x_cat = torch.cat((x1, x2), dim=1)  # (batch, output_dim * 2)

        # 4Ô∏è‚É£ Residual connection: proyeksikan embedding ke dimensi yang sama
        residual = self.shortcut(x_embed.mean(dim=1))  # rata-rata embedding ‚Üí dim (output_dim*2)
        x_res = x_cat + residual  # tambah residual shortcut

        # 5Ô∏è‚É£ Dropout dan FC
        x_res = self.dropout(x_res)
        out = self.fc(x_res)
        return out

# üß† SE (Squeeze-and-Excitation) Block
class SEBlock(nn.Module):
    def __init__(self, channels, reduction=16): # Default = 16, amdhan = 4
        super(SEBlock, self).__init__()
        self.fc1 = nn.Linear(channels, channels // reduction)
        self.fc2 = nn.Linear(channels // reduction, channels)

    def forward(self, x):
        w = torch.mean(x, dim=0, keepdim=True)  # Squeeze
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))
        return x * w  # Excitation

# PERCOBAAN 2
# üîß TextCNN Enhanced (simplified)
class TextCNNEnhanced(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, num_classes=2, output_dim=64, dropout_rate=0.3):
        super(TextCNNEnhanced, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Depthwise + Pointwise convolution blocks
        self.depthwise_conv1 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, dilation=1, groups=embed_dim, padding=1)
        self.pointwise_conv1 = nn.Conv1d(embed_dim, output_dim, kernel_size=1)

        self.depthwise_conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=4, dilation=2, groups=embed_dim, padding=3)
        self.pointwise_conv2 = nn.Conv1d(embed_dim, output_dim, kernel_size=1)

        self.dropout = nn.Dropout(dropout_rate)
        self.se_block = SEBlock(output_dim * 2)
        self.fc = nn.Linear(output_dim * 2, num_classes)

    # Helper block untuk konvolusi + aktivasi + pooling
    def conv_block(self, x, depthwise, pointwise):
        x = depthwise(x)
        x = F.relu(pointwise(x))
        x = F.max_pool1d(x, kernel_size=x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embedding(x)            # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)           # (batch, embed_dim, seq_len)

        # üî∏ Dua jalur konvolusi dengan cara yang sama
        x1 = self.conv_block(x, self.depthwise_conv1, self.pointwise_conv1)
        x2 = self.conv_block(x, self.depthwise_conv2, self.pointwise_conv2)

        # üî∏ Gabung dan lanjut ke SE + FC
        x_cat = torch.cat((x1, x2), dim=1)
        x_cat = self.se_block(x_cat)
        x_cat = self.dropout(x_cat)
        out = self.fc(x_cat)
        return out

# PERCOBAAN 3
# üîß TextCNN Enhanced (simplified) + ResNorm
class TextCNNEnhancedUltimate(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, num_classes=2, output_dim=64, dropout_rate=0.3):
        super(TextCNNEnhancedUltimate, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Depthwise + Pointwise convolution blocks
        self.depthwise_conv1 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, dilation=1, groups=embed_dim, padding=1)
        self.pointwise_conv1 = nn.Conv1d(embed_dim, output_dim, kernel_size=1)

        self.depthwise_conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=4, dilation=2, groups=embed_dim, padding=3)
        self.pointwise_conv2 = nn.Conv1d(embed_dim, output_dim, kernel_size=1)

        self.residual_proj = nn.Conv1d(embed_dim, output_dim, kernel_size=1)

        self.dropout = nn.Dropout(dropout_rate)
        self.se_block = SEBlock(output_dim * 2)
        self.fc = nn.Linear(output_dim * 2, num_classes)

    # Helper block untuk konvolusi + aktivasi + pooling
    def conv_block(self, x, depthwise, pointwise):
        residual = x  # simpan input
        x = depthwise(x)
        x = F.relu(pointwise(x))
        if residual.shape[1] != x.shape[1]: # Jika ukuran dimensi berbeda
            residual = self.residual_proj(residual)
        x = x + residual  # tambahkan residual
        x = F.max_pool1d(x, kernel_size=x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x_embed = self.embedding(x)            # (batch, seq_len, embed_dim)
        x = x_embed.permute(0, 2, 1)           # (batch, embed_dim, seq_len)

        # üî∏ Dua jalur konvolusi dengan cara yang sama
        x1 = self.conv_block(x, self.depthwise_conv1, self.pointwise_conv1)
        x2 = self.conv_block(x, self.depthwise_conv2, self.pointwise_conv2)

        # üî∏ Gabung dan lanjut ke SE + FC
        x_cat = torch.cat((x1, x2), dim=1)
        x_cat = self.se_block(x_cat)
        # residual = self.shortcut(x_embed.mean(dim=1))
        # x_cat = x_cat + residual
        x_cat = self.dropout(x_cat)
        out = self.fc(x_cat)
        return out

class AmdhanTextCNNEnhanced(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, num_classes=2, output_dim=64, dropout_rate=0.3):
        super(AmdhanTextCNNEnhanced, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.conv1 = nn.Conv1d(embed_dim, output_dim, kernel_size=3)

        # Depthwise + Pointwise convolution blocks
        self.depthwise_conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=4, dilation=1, groups=embed_dim, padding=1)
        self.pointwise_conv2 = nn.Conv1d(embed_dim, output_dim, kernel_size=1)

        self.depthwise_conv3 = nn.Conv1d(embed_dim, embed_dim, kernel_size=5, dilation=2, groups=embed_dim, padding=3)
        self.pointwise_conv3 = nn.Conv1d(embed_dim, output_dim, kernel_size=1)


        self.dropout = nn.Dropout(dropout_rate)
        self.se_block = SEBlock(output_dim * 3)
        self.fc = nn.Linear(output_dim * 3, num_classes)

    # Helper block untuk konvolusi + aktivasi + pooling
    def conv_block(self, x, depthwise, pointwise):
        x = depthwise(x)
        x = F.relu(pointwise(x))
        x = F.max_pool1d(x, kernel_size=x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embedding(x)            # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)           # (batch, embed_dim, seq_len)

        # üî∏ Dua jalur konvolusi
        x1 = F.relu(self.conv1(x)).max(dim=2)[0]
        x2 = self.conv_block(x, self.depthwise_conv2, self.pointwise_conv2)
        x3 = self.conv_block(x, self.depthwise_conv3, self.pointwise_conv3)

        # üî∏ Gabung dan lanjut ke SE + FC
        x_cat = torch.cat((x1, x2, x3), dim=1)
        x_cat = self.se_block(x_cat)
        x_cat = self.dropout(x_cat)
        out = self.fc(x_cat)
        return out

class MineTextCNNEnhanced(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, num_classes=2, output_dim=64, dropout_rate=0.3):
        super(MineTextCNNEnhanced, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.conv1 = nn.Conv1d(embed_dim, output_dim, kernel_size=3)

        self.dilated_conv2 = nn.Conv1d(embed_dim, output_dim, kernel_size=5, dilation=2, padding=4)

        self.depthwise_conv3 = nn.Conv1d(embed_dim, embed_dim, kernel_size=5, dilation=2, groups=embed_dim, padding=3)
        self.pointwise_conv3 = nn.Conv1d(embed_dim, output_dim, kernel_size=1)


        self.dropout = nn.Dropout(dropout_rate)
        self.se_block = SEBlock(output_dim * 3)
        self.fc = nn.Linear(output_dim * 3, num_classes)

    # Helper block untuk konvolusi + aktivasi + pooling
    def conv_block(self, x, depthwise, pointwise):
        x = depthwise(x)
        x = F.relu(pointwise(x))
        x = F.max_pool1d(x, kernel_size=x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embedding(x)            # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)           # (batch, embed_dim, seq_len)

        # üî∏ Dua jalur konvolusi
        x1 = F.relu(self.conv1(x)).max(dim=2)[0]
        x_dilated = F.relu(self.dilated_conv2(x))
        x2 = F.max_pool1d(x_dilated, kernel_size=x_dilated.size(2)).squeeze(2)
        x3 = self.conv_block(x, self.depthwise_conv3, self.pointwise_conv3)

        # üî∏ Gabung dan lanjut ke SE + FC
        x_cat = torch.cat((x1, x2, x3), dim=1)
        x_cat = self.se_block(x_cat)
        x_cat = self.dropout(x_cat)
        out = self.fc(x_cat)
        return out

## 4. TRAIN SECTION
Bagian untuk train model yang sudah dibuat.

In [232]:
# Reproducibility
SEED = 29082002
# Training Model
DATASET_PATH = '../dataset/cyberbullying.csv'
MODEL_OUTPUT_PATH = 'model_outputs'
# K-fold Cross-validation
N_FOLDS = 5
MAX_LENGTH = 128
# Training Model
EPOCHS = 25
BATCH_SIZE = 16
LEARNING_RATE = 5e-3
TOKENIZER_NAME = 'indobenchmark/indobert-base-p1'
# CNN
DROPOUT_RATE = 0.1
NUM_CLASSES = 2
EMBEDDING_DIM = 300
NUM_FILTERS = 100
KERNEL_SIZE = [2, 4]
OUT_CHANNELS = 50
# Early Stopping
PATIENCE = 5 # Patience for early stopping (epochs to wait after no improvement)
MIN_DELTA = 0.001 # Minimum change in val_loss to be considered an improvement for early stopping
# Action
PYPLOT = False
IMAGE_NAME = 'training_metrics.png'
WANDB_NOTES = 'augmentasi dg probabilitas sama rata'

In [233]:
def ploting_result(epochs, lr, train_loss, train_acc, train_f1, val_loss, val_acc, val_f1):
    """Ploting hasil train (untuk mode offline)"""    
    fig, axs = plt.subplots(1, 3, figsize=(18, 5))

    # Plot loss
    axs[0].plot(epochs, train_loss, label='Train Loss', marker='o')
    axs[0].plot(epochs, val_loss, label='Val loss', marker='s')
    axs[0].set_title('Loss per Epoch')
    axs[0].set_xlabel('Epoch')
    axs[0].set_ylabel('Loss')
    axs[0].legend()
    axs[0].grid(True, linestyle='--', alpha=0.6)

    # Plot Accuracy
    axs[1].plot(epochs, train_acc, label='Train Acc', marker='o')
    axs[1].plot(epochs, val_acc, label='Val Acc', marker='s')
    axs[1].set_title('Accuracy per Epoch')
    axs[1].set_xlabel('Epoch')
    axs[1].set_ylabel('Accuracy')
    axs[1].legend()
    axs[1].grid(True, linestyle='--', alpha=0.6)

    # Plot F1-score & Learning Rate
    ax2 = axs[2].twinx()
    
    axs[2].plot(epochs, train_f1, label='Train F1-score', marker='o')
    axs[2].plot(epochs, val_f1, label='Val F1-score', marker='s')
    ax2.plot(epochs, lr,  color='orange', linestyle='--', label='Learning Rate', marker='x')
    axs[2].set_title('F1-Score & Learning Rate')
    axs[2].set_xlabel('Epoch')
    axs[2].set_ylabel('F1-score')
    ax2.set_ylabel('Learning Rate', color='orange')
    axs[2].grid(True, linestyle='--', alpha=0.6)

    # Gabungkan Legend Kedua Axis
    lns1, lbls1 = axs[2].get_legend_handles_labels()
    lns2, lbls2 = ax2.get_legend_handles_labels()
    axs[2].legend(lns1 + lns2, lbls1 + lbls2, loc='center right')

    plt.tight_layout()

    # Simpan File
    plt.savefig(IMAGE_NAME, dpi=300, bbox_inches='tight')

    # Tampilkan di layar
    plt.show()
    

In [234]:
def set_seed(seed):
    """Set random seed for reproducibility"""
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def get_dataloaders_for_fold():
    """Create train and validation datasets/dataloaders for the fold"""
    train_dataset = CyberbullyingDataset(
        file_path=DATASET_PATH,
        tokenizer_name=TOKENIZER_NAME,
        random_state=SEED,
        split="train",
        n_folds=N_FOLDS,
        max_length=MAX_LENGTH,
    ) # Membuat fold train dataset

    val_dataset = CyberbullyingDataset(
        file_path=DATASET_PATH,
        tokenizer_name=TOKENIZER_NAME,
        random_state=SEED,
        split="val",
        n_folds=N_FOLDS,
        max_length=MAX_LENGTH,
    ) # Membuat fold val dataset
    
    # Membuat DataLoader
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False
    )
    
    return train_loader, val_loader

class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.001):
        # Jumlah epoch berturut-turut tanpa peningkatan signifikan yang masih ditoleransi
        self.patience = patience
        
        # Selisih minimal antara val_loss sebelumnya dan sekarang yang dianggap sebagai perbaikan
        self.min_delta = min_delta
        
        # Counter untuk menghitung berapa kali val_loss tidak membaik secara signifikan
        self.counter = 0
        
        # Menyimpan nilai val_loss terbaik yang pernah dicapai
        self.best_loss = float('inf')  # Awalnya di-set sangat besar

    def __call__(self, val_loss):
        # Cek apakah val_loss sekarang lebih baik dari best_loss sebelumnya dengan selisih signifikan
        if self.best_loss - val_loss > self.min_delta:
            # Jika ya, anggap ini sebagai peningkatan
            self.best_loss = val_loss  # Perbarui best_loss
            self.counter = 0           # Reset counter karena ada peningkatan
        else:
            # Jika tidak ada peningkatan signifikan, tambahkan counter
            self.counter += 1

        # Jika counter melebihi atau sama dengan batas patience, kembalikan True (hentikan training)
        if self.counter >= self.patience:
            return True

        # Jika belum melebihi patience, teruskan training
        return False

def cnn_train_fold():
    print(f"\n{'='*5} Fold {N_FOLDS+1} {'='*5}")
    
    # Setup device
    device = get_device()
    
    train_loader, val_loader = get_dataloaders_for_fold()

    model = TextCNNEnhancedUltimate(
        vocab_size=train_loader.dataset.vocab_size,
        embed_dim=EMBEDDING_DIM,
        num_classes=NUM_CLASSES,
        output_dim=OUT_CHANNELS,
        dropout_rate=DROPOUT_RATE
    )

    # model = AmdhanTextCNNEnhanced(
    #     vocab_size=train_loader.dataset.vocab_size,
    #     embed_dim=EMBEDDING_DIM,
    #     num_classes=NUM_CLASSES,
    #     output_dim=OUT_CHANNELS,
    #     dropout_rate=DROPOUT_RATE
    # )

    # model = MineTextCNNEnhanced(
    #     vocab_size=train_loader.dataset.vocab_size,
    #     embed_dim=EMBEDDING_DIM,
    #     num_classes=NUM_CLASSES,
    #     output_dim=OUT_CHANNELS,
    #     dropout_rate=DROPOUT_RATE
    # )

    model = model.to(device)

    # hidden_weights = [p for p in model.parameters() if p.ndim >= 2]
    # other_params = [p for p in model.parameters() if p.ndim < 2]

    # param_groups = [
    #     {"params": hidden_weights, "use_muon": True, "lr": 0.02, "weight_decay": 0.01},
    #     {"params": other_params, "use_muon": False, "lr": 3e-4, "betas": (0.9, 0.95), "weight_decay": 0.01},
    # ]

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE) 
    # optimizer = SingleDeviceMuonWithAuxAdam(param_groups)

    criterion = torch.nn.CrossEntropyLoss().to(device)

    # Learning rate scheduler
    scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
    # scheduler = ReduceLROnPlateau(
    #     optimizer,       # optimizer yang mau diatur LR-nya
    #     mode='max',      # 'max' kalau pakai metrik seperti F1 atau accuracy
    #     factor=0.5,      # turunkan LR jadi setengah
    #     patience=3,      # kalau 3 epoch berturut-turut val F1 tidak naik, LR diturunkan
    #     verbose=True     # biar muncul log di output
    # )

    # üîΩ Inisialisasi EarlyStopping
    # early_stopping = EarlyStopping(patience=PATIENCE, min_delta=MIN_DELTA)

    # PYPLOT
    epochs = []
    lr = []
    train_loss_py = []
    train_acc = []
    train_f1 = []
    val_loss_py = []
    val_acc = []
    val_f1 = []
    # END
    
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0
        # PYPLOT
        train_preds = []
        train_labels = []
        # END
        
        # Training loop
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}", leave=False):
            # Move batch to device
            batch = to_device(batch, device)
            
            optimizer.zero_grad()
            outputs = model(batch['input_ids'])
            loss = criterion(outputs, batch['labels'])
            
            _, predicted = torch.max(outputs.data, 1)
            
            train_total += batch['labels'].size(0)
            train_correct += (predicted == batch['labels']).sum().item()
            
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            #PYPLOT
            labels = batch['labels']
            train_preds.extend(predicted.cpu().numpy())
            train_labels.extend(labels.cpu().numpy())
            #END
            
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        # PYPLOT
        val_preds = []
        val_labels = []
        # END

        # Validation loop
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating", leave=False):
                # Move batch to device
                batch = to_device(batch, device)
                
                outputs = model(batch['input_ids'])
                loss = criterion(outputs, batch['labels'])
                
                _, predicted = torch.max(outputs.data, 1)
                
                val_total += batch['labels'].size(0)
                val_correct += (predicted == batch['labels']).sum().item()
                val_loss += loss.item()

                #PYPLOT
                labels = batch['labels']
                val_preds.extend(predicted.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
                #END

        # Print GPU memory usage if available
        if torch.cuda.is_available():
            memory_info = get_gpu_memory()
            print(f"GPU Memory Usage - Allocated: {memory_info['allocated']}, Cached: {memory_info['cached']}")
        
        # Calculate metrics
        avg_train_loss = train_loss / len(train_loader)
        train_accuracy = 100 * train_correct / train_total
        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = 100 * val_correct / val_total

        # PYPLOT
        epochs.append(epoch+1)
        lr.append(optimizer.param_groups[0]['lr'])

        train_loss_py.append(avg_train_loss)
        train_acc.append(train_accuracy)
        train_f1.append(f1_score(train_labels, train_preds, average='weighted'))
        
        val_loss_py.append(avg_val_loss)
        val_acc.append(val_accuracy)
        val_f1.append(f1_score(val_labels, val_preds, average='weighted'))
        # END
        
        # Log metrics
        if PYPLOT == False:
            wandb.log({
                "train_loss": avg_train_loss,
                "train_accuracy": train_accuracy,
                "train_f1_score": f1_score(train_labels, train_preds, average='weighted'),
                "val_loss": avg_val_loss,
                "val_accuracy": val_accuracy,
                "val_f1_score": f1_score(val_labels, val_preds, average='weighted'),
                "learning_rate": optimizer.param_groups[0]['lr'],
                # "epoch": epoch + 1
            })

        scheduler.step()
        # scheduler.step(f1)
        
        # Print results
        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")
        print(f"Train Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.2f}%")
        print(f"Val Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.2f}%")

        # üîΩ Early stopping check
        # if early_stopping(avg_val_loss):
        #     print(f"Early stopping triggered at epoch {epoch+1}.")
        #     break

    if PYPLOT == True:
        ploting_result(epochs, lr, train_loss_py, train_acc, train_f1, val_loss_py, val_acc, val_f1)

    return model

def get_device():
    """Get the device to use (GPU if available, else CPU)"""
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

def to_device(data, device):
    """Move data to specified device"""
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    elif isinstance(data, dict):
        return {k: to_device(v, device) for k, v in data.items()}
    elif isinstance(data, torch.Tensor):
        return data.to(device)
    return data

def get_gpu_memory():
    """Get GPU memory usage if available"""
    if torch.cuda.is_available():
        return {
            "allocated": f"{torch.cuda.memory_allocated()/1e9:.2f} GB",
            "cached": f"{torch.cuda.memory_reserved()/1e9:.2f} GB"
        }
    return None

In [235]:
# TRAIN MAIN FUNCTION
def main():
    set_seed(SEED)

    if PYPLOT == False:
        # Menggunakan WandB
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        wandb.init(
            project="sentiment-analys-cyberbullying",
            name=f"exp_{timestamp}",
            config={
                # Data & tokenization
                "max_length": MAX_LENGTH,
                "tokenizer": TOKENIZER_NAME,
        
                # Model parameters
                "dropout": DROPOUT_RATE,
                "batch_size": BATCH_SIZE,
                "embed_dim": EMBEDDING_DIM,
                "num_classes": NUM_CLASSES,
                "num_filters": NUM_FILTERS,
                "kernel_size": KERNEL_SIZE,
                "out_channels": OUT_CHANNELS,
        
                # Training parameters
                "n_folds": N_FOLDS,
                "epochs": EPOCHS,
                "learning_rate": LEARNING_RATE,
            },
            notes=WANDB_NOTES
        )
    
        # Train
        cnn_train_fold()
            
        wandb.finish()

    if PYPLOT == True:
        cnn_train_fold()

if __name__ == "__main__":
    main()


===== Fold 6 =====
Using CPU
Loading data from ../dataset/cyberbullying.csv...
Menggunakan 5 folds dengan 1350 samples
Loading data from ../dataset/cyberbullying.csv...
Menggunakan 5 folds dengan 1350 samples


                                                                                                                       

Epoch 1/25
Learning Rate: 0.005000
Train Loss: 0.7068, Accuracy: 53.89%
Val Loss: 0.5748, Accuracy: 70.37%


                                                                                                                       

Epoch 2/25
Learning Rate: 0.005000
Train Loss: 0.5379, Accuracy: 71.94%
Val Loss: 0.4068, Accuracy: 81.11%


                                                                                                                       

Epoch 3/25
Learning Rate: 0.005000
Train Loss: 0.3350, Accuracy: 86.39%
Val Loss: 0.3419, Accuracy: 88.89%


                                                                                                                       

Epoch 4/25
Learning Rate: 0.005000
Train Loss: 0.1782, Accuracy: 93.43%
Val Loss: 0.3941, Accuracy: 85.93%


                                                                                                                       

Epoch 5/25
Learning Rate: 0.005000
Train Loss: 0.1385, Accuracy: 95.37%
Val Loss: 0.3956, Accuracy: 88.52%


                                                                                                                       

Epoch 6/25
Learning Rate: 0.005000
Train Loss: 0.1246, Accuracy: 95.00%
Val Loss: 0.4266, Accuracy: 87.41%


                                                                                                                       

Epoch 7/25
Learning Rate: 0.005000
Train Loss: 0.1181, Accuracy: 95.28%
Val Loss: 0.4714, Accuracy: 87.04%


                                                                                                                       

Epoch 8/25
Learning Rate: 0.005000
Train Loss: 0.0922, Accuracy: 96.94%
Val Loss: 0.4511, Accuracy: 88.52%


                                                                                                                       

Epoch 9/25
Learning Rate: 0.005000
Train Loss: 0.0927, Accuracy: 96.67%
Val Loss: 0.4533, Accuracy: 88.15%


                                                                                                                       

Epoch 10/25
Learning Rate: 0.002500
Train Loss: 0.0939, Accuracy: 96.85%
Val Loss: 0.4862, Accuracy: 86.67%


                                                                                                                       

Epoch 11/25
Learning Rate: 0.002500
Train Loss: 0.0707, Accuracy: 97.13%
Val Loss: 0.4837, Accuracy: 87.04%


                                                                                                                       

Epoch 12/25
Learning Rate: 0.002500
Train Loss: 0.0867, Accuracy: 97.41%
Val Loss: 0.4613, Accuracy: 87.04%


                                                                                                                       

Epoch 13/25
Learning Rate: 0.002500
Train Loss: 0.0769, Accuracy: 97.41%
Val Loss: 0.3992, Accuracy: 88.89%


                                                                                                                       

Epoch 14/25
Learning Rate: 0.002500
Train Loss: 0.0649, Accuracy: 97.69%
Val Loss: 0.3833, Accuracy: 88.52%


                                                                                                                       

Epoch 15/25
Learning Rate: 0.002500
Train Loss: 0.0582, Accuracy: 98.06%
Val Loss: 0.3619, Accuracy: 88.89%


                                                                                                                       

Epoch 16/25
Learning Rate: 0.002500
Train Loss: 0.0649, Accuracy: 97.78%
Val Loss: 0.4188, Accuracy: 88.52%


                                                                                                                       

Epoch 17/25
Learning Rate: 0.002500
Train Loss: 0.0570, Accuracy: 98.24%
Val Loss: 0.4766, Accuracy: 88.15%


                                                                                                                       

Epoch 18/25
Learning Rate: 0.002500
Train Loss: 0.0587, Accuracy: 98.06%
Val Loss: 0.4205, Accuracy: 89.26%


                                                                                                                       

Epoch 19/25
Learning Rate: 0.002500
Train Loss: 0.0466, Accuracy: 98.33%
Val Loss: 0.4719, Accuracy: 88.52%


                                                                                                                       

Epoch 20/25
Learning Rate: 0.001250
Train Loss: 0.0620, Accuracy: 97.96%
Val Loss: 0.4005, Accuracy: 89.63%


                                                                                                                       

Epoch 21/25
Learning Rate: 0.001250
Train Loss: 0.0429, Accuracy: 98.24%
Val Loss: 0.3996, Accuracy: 89.63%


                                                                                                                       

Epoch 22/25
Learning Rate: 0.001250
Train Loss: 0.0450, Accuracy: 98.24%
Val Loss: 0.4445, Accuracy: 86.30%


                                                                                                                       

Epoch 23/25
Learning Rate: 0.001250
Train Loss: 0.0529, Accuracy: 98.33%
Val Loss: 0.3817, Accuracy: 89.63%


                                                                                                                       

Epoch 24/25
Learning Rate: 0.001250
Train Loss: 0.0362, Accuracy: 98.61%
Val Loss: 0.4233, Accuracy: 88.89%


                                                                                                                       

Epoch 25/25
Learning Rate: 0.001250
Train Loss: 0.0573, Accuracy: 97.87%
Val Loss: 0.4174, Accuracy: 88.52%


0,1
learning_rate,‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
train_accuracy,‚ñÅ‚ñÑ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
train_f1_score,‚ñÅ‚ñÑ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
train_loss,‚ñà‚ñÜ‚ñÑ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
val_accuracy,‚ñÅ‚ñÖ‚ñà‚ñá‚ñà‚ñá‚ñá‚ñà‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñá‚ñà‚ñà‚ñà‚ñà‚ñá‚ñà‚ñà‚ñà
val_f1_score,‚ñÅ‚ñÖ‚ñà‚ñá‚ñà‚ñá‚ñá‚ñà‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñá‚ñà‚ñà‚ñà‚ñà‚ñá‚ñà‚ñà‚ñà
val_loss,‚ñà‚ñÉ‚ñÅ‚ñÉ‚ñÉ‚ñÑ‚ñÖ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÖ‚ñÉ‚ñÖ‚ñÉ‚ñÉ‚ñÑ‚ñÇ‚ñÉ‚ñÉ

0,1
learning_rate,0.00125
train_accuracy,97.87037
train_f1_score,0.9787
train_loss,0.05731
val_accuracy,88.51852
val_f1_score,0.88499
val_loss,0.41742
