In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch




In [2]:
# Load the cleaned comment dataset.
# The path is relative to the notebook's working directory, following the same
# '../../' project-root convention the notebook uses for the model directory,
# so the notebook is not tied to one machine's absolute folder layout.
file_path = '../../data/processed/cleaned_comments.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,textOriginal,processed_text,label
0,sekilas kaya michael scofield,kilas kaya michael scofield,0
1,menit 23:00 adalah niche yg bisa membuat penon...,menit 2300 niche yg tonton sandi,0
2,Puasa jgn bocor,puasa jgn bocor,0
3,KPI nya penyerapan,kpi nya serap,0
4,Ada ordal selesai,ordal selesai,0


In [3]:
# Replace missing comments with empty strings so every row holds a string value.
df['textOriginal'] = df['textOriginal'].fillna('')

# Model input is the original comment text; the target is the `label` column.
X, y = df['textOriginal'], df['label']

In [4]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
class CommentDataset(Dataset):
    """Torch dataset that tokenizes one comment per item for sequence classification.

    Args:
        texts: Comment texts. May be a pandas Series (accessed by position via
            ``.iloc``) or any plain sequence such as a list.
        labels: Integer class labels aligned position-by-position with ``texts``.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors="pt")`` and expected to return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors that
            carry a leading batch dimension of size 1.
        max_length: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a pandas object or plain sequence."""
        positional = getattr(values, 'iloc', None)
        # pandas objects must be read by position (their index labels may be
        # shuffled after a train/test split); plain sequences index directly.
        return values[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Tokenize the comment at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch dimension
            removed) and ``'labels'`` as a scalar ``torch.long`` tensor.
        """
        text = str(self._item_at(self.texts, idx))
        label = int(self._item_at(self.labels, idx))
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            # squeeze(0) drops only the batch dimension; a bare squeeze() would
            # also collapse the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [6]:
# Load the IndoBERT tokenizer and a two-class sequence classifier from the same
# pretrained checkpoint.
checkpoint_name = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Wrap each split in a CommentDataset (default max_length) so the Trainer can
# draw tokenized examples from it.
train_dataset = CommentDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = CommentDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

In [8]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to labels>}``.
    """
    logits, labels = eval_pred
    predicted_classes = torch.tensor(logits).argmax(dim=-1)
    is_correct = predicted_classes.eq(torch.tensor(labels))
    return {"accuracy": is_correct.float().mean().item()}

In [9]:
# Training configuration: three epochs, with evaluation and checkpointing once
# per epoch.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir='../../models/indoBERT',
    logging_dir='../../models/indoBERT/logs',
    logging_steps=10,
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # eval_strategy is used here in place of evaluation_strategy.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Checkpoint retention and best-model selection by the "accuracy" metric.
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [10]:
# Build the Trainer that ties together the model, its training configuration,
# both data splits, and the evaluation metric.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # adds the evaluation metric
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [11]:
# Run fine-tuning. As the last expression in the cell, the returned training
# summary is displayed below it.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0188,0.055343,0.990273
2,0.0011,0.036862,0.99364
3,0.0008,0.041348,0.993266


TrainOutput(global_step=2007, training_loss=0.03581837790681297, metrics={'train_runtime': 11917.1816, 'train_samples_per_second': 2.691, 'train_steps_per_second': 0.168, 'total_flos': 2109295553057280.0, 'train_loss': 0.03581837790681297, 'epoch': 3.0})

In [12]:
# Write the model weights and the tokenizer files into the same directory so
# both can later be restored from that single location.
save_dir = '../../models/indoBERT'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("IndoBERT model and tokenizer saved!")

IndoBERT model and tokenizer saved!


In [2]:
# Reload the fine-tuned model and tokenizer from disk.
# This uses the same relative directory the save cell writes to, instead of an
# absolute path that only exists on one machine.
model_path = '../../models/indoBERT'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
print("Model dan tokenizer berhasil dimuat!")

Model dan tokenizer berhasil dimuat!
