<h1>Import Libraries</h1>


In [9]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score


<h1>Data Pre-Processing</h1>


In [None]:
# Location of the labelled dataset (an Excel workbook, read with pd.read_excel)
file_path = 'final_dataset.xlsx'

# Load the workbook into a DataFrame
data = pd.read_excel(file_path)

# Report the distinct values found in the 'labels' column
unique_classes = data['labels'].unique()
print("Kelas yang terdapat dalam data:", unique_classes)

# Encode every label as the integer code of its pandas category
data['label_encoded'] = data['labels'].astype('category').cat.codes

# Map each category to its integer code (categories are numbered in order)
label_mapping = {
    label: code
    for code, label in enumerate(data['labels'].astype('category').cat.categories)
}
print("Mapping label ke encoded value:", label_mapping)

# Stratified split: 80% train, then the remaining 20% halved into validation and test
train_data, temp_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label_encoded'],
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_data['label_encoded'],
    random_state=42,
)

# Show the first ten rows (including the new 'label_encoded' column)
print(data.head(10))


In [11]:
# Dataset wrapper consumed by DataLoader
class TextDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values can be indexed by sample position,
    and ``labels`` is an indexable sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The sample count is taken from the labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Encoding fields come first, then the target under the 'labels' key
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


<h1>BERT Model</h1>

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Select BERT. `model_name` holds the model class itself (not a string); running this
# cell rebinds the notebook-level `tokenizer` and `model` to the BERT variants.
model_name = BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# num_labels=6: presumably one output per dataset class (six label names are used later) - confirm
model = model_name.from_pretrained('bert-base-uncased', num_labels=6)

<h1>RoBERTa Model</h1>

In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Select RoBERTa. `model_name` holds the model class itself (not a string); running this
# cell rebinds the notebook-level `tokenizer` and `model` to the RoBERTa variants.
model_name = RobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_labels=6: presumably one output per dataset class (six label names are used later) - confirm
model = model_name.from_pretrained('roberta-base', num_labels=6)

<h1>ALBERT Model</h1>

In [None]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer

# Select ALBERT. `model_name` holds the model class itself (not a string); running this
# cell rebinds the notebook-level `tokenizer` and `model` to the ALBERT variants.
model_name = AlbertForSequenceClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
# num_labels=6: presumably one output per dataset class (six label names are used later) - confirm
model = model_name.from_pretrained('albert-base-v2', num_labels=6)

<h1>ELECTRA Model</h1>

In [None]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer

# Select ELECTRA. `model_name` holds the model class itself (not a string); running this
# cell rebinds the notebook-level `tokenizer` and `model` to the ELECTRA variants.
model_name = ElectraForSequenceClassification
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
# num_labels=6: presumably one output per dataset class (six label names are used later) - confirm
model = model_name.from_pretrained('google/electra-base-discriminator', num_labels=6)

<h1>DeBERTa Model</h1>

In [None]:
from transformers import DebertaForSequenceClassification, DebertaTokenizer

# Select DeBERTa. `model_name` holds the model class itself (not a string); running this
# cell rebinds the notebook-level `tokenizer` and `model` to the DeBERTa variants.
model_name = DebertaForSequenceClassification
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
# num_labels=6: presumably one output per dataset class (six label names are used later) - confirm
model = model_name.from_pretrained('microsoft/deberta-base', num_labels=6)

<h1>Initialize Model</h1>

In [None]:
# Initialise the model and tokenizer
from transformers import DebertaForSequenceClassification, DebertaTokenizer

model_name = DebertaForSequenceClassification
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
model = model_name.from_pretrained('microsoft/deberta-base', num_labels=6)

# Tokenise the 'data' column of every split with identical settings
# (truncate to at most 64 tokens and pad within each split)
train_encodings, val_encodings, test_encodings = (
    tokenizer(list(split['data']), truncation=True, padding=True, max_length=64)
    for split in (train_data, val_data, test_data)
)

# Wrap each split's encodings and integer labels in a TextDataset
train_dataset, val_dataset, test_dataset = (
    TextDataset(encodings, split['label_encoded'].tolist())
    for encodings, split in (
        (train_encodings, train_data),
        (val_encodings, val_data),
        (test_encodings, test_data),
    )
)

# Build the DataLoaders; only the training loader shuffles its samples
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

<h1>Train Function</h1>

In [6]:
def train(model, train_loader, val_loader, device, num_epochs=10, learning_rate=5e-5,
          save_path="DeBERTa.pth"):
    """Fine-tune ``model``, report per-epoch accuracy, save the weights and plot diagnostics.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches (dicts with 'input_ids', 'attention_mask'
            and 'labels' tensors) used for optimisation.
        val_loader: Iterable of batches of the same form, scored after each epoch.
        device: Device every batch tensor is moved to before the forward pass.
        num_epochs: Number of passes over ``train_loader`` (default 10).
        learning_rate: AdamW learning rate (default 5e-5).
        save_path: File the final ``state_dict`` is written to (default "DeBERTa.pth").

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1.

    Side effects:
        Prints progress, writes ``save_path``, and shows an accuracy curve, a
        confusion matrix and a classification report. The confusion matrix and
        report use the predictions gathered during the last training epoch, i.e.
        while the model was in training mode and its weights were still changing.
        The model is left in training mode.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    # Display names for the classes; assumed to be listed in the order of the
    # integer label codes - TODO confirm against the label mapping.
    label_names = ['Religion', 'business', 'entertainment', 'politics', 'sport', 'tech']
    class_ids = list(range(len(label_names)))

    model.train()
    # torch.optim.AdamW replaces the AdamW class deprecated by transformers;
    # eps and weight_decay are pinned to that class's defaults so the update
    # rule stays as close as possible to the previous behaviour.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
    )

    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        correct_predictions = 0
        total_predictions = 0

        # Re-created every epoch, so after the loop they describe the last epoch only
        all_train_true_labels = []
        all_train_predictions = []

        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Running training accuracy, measured on the same forward pass
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

            all_train_true_labels.extend(labels.cpu().numpy())
            all_train_predictions.extend(preds.cpu().numpy())

            if batch_idx % 10 == 0:  # Display every 10 batches
                print(f"Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_idx}], Loss: {loss.item():.4f}")

        # Training accuracy for the epoch
        train_accuracy = correct_predictions / total_predictions
        train_accuracies.append(train_accuracy)
        print(f"\nEpoch [{epoch + 1}/{num_epochs}] - Training Accuracy: {train_accuracy:.4f}")

        # Validation accuracy for the epoch
        val_accuracy = _validation_accuracy(model, val_loader, device)
        val_accuracies.append(val_accuracy)
        print(f"Epoch [{epoch + 1}/{num_epochs}] - Validation Accuracy: {val_accuracy:.4f}")

        model.train()  # The helper switched to eval mode; go back to training mode

    # Save the trained weights and report the path that was actually written
    torch.save(model.state_dict(), save_path)
    print(f"Model has been saved to '{save_path}'")

    # Training vs. validation accuracy per epoch
    epochs = range(1, num_epochs + 1)
    plt.figure(figsize=(11, 6))
    plt.plot(epochs, train_accuracies, label='Train Accuracy', marker='o')
    plt.plot(epochs, val_accuracies, label='Validation Accuracy', marker='o')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Training and Validation Accuracy Comparison')
    plt.legend()
    plt.show()

    # Confusion matrix for the last training epoch. Passing `labels` keeps the
    # matrix at one row/column per class even if a class never occurred, so the
    # tick labels always line up.
    print("\nFinal Confusion Matrix (Training):")
    cm_train = confusion_matrix(all_train_true_labels, all_train_predictions, labels=class_ids)
    sns.heatmap(cm_train, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('DeBERTa Confusion Matrix (Training)')
    plt.show()

    print("\nFinal Classification Report (Training):")
    print(classification_report(
        all_train_true_labels,
        all_train_predictions,
        labels=class_ids,
        target_names=label_names,
        zero_division=0,
    ))


def _validation_accuracy(model, val_loader, device):
    """Return the fraction of samples in ``val_loader`` that ``model`` classifies correctly.

    Puts the model in eval mode and runs without gradient tracking; the caller is
    responsible for switching back to training mode. Returns NaN when the loader
    yields no samples.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total if total else float("nan")


<h1>Evaluate Function</h1>

In [12]:
# Evaluate the model on the test set after training
def evaluate_test(model, test_loader, device, model_path="DeBERTa.pth"):
    """Load saved weights into ``model`` and report its metrics on ``test_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose output
            exposes ``.logits``; it must accept the state dict stored at ``model_path``.
        test_loader: Iterable of batches (dicts with 'input_ids', 'attention_mask'
            and 'labels' tensors).
        device: Device the batches are moved to and the checkpoint is mapped onto.
        model_path: File holding the saved ``state_dict`` (default "DeBERTa.pth").

    Returns:
        The accuracy on the test set, as computed by ``accuracy_score``.

    Side effects:
        Overwrites the model's weights, switches it to eval mode, prints accuracy,
        weighted precision/recall/F1 and a classification report, and shows a
        confusion-matrix heatmap.
    """
    # map_location lets a checkpoint that was saved on a GPU be restored when
    # evaluation runs on a different device (e.g. a CPU-only machine).
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            if batch_idx % 10 == 0:  # Display every 10 batches
                print(f"Evaluating Batch [{batch_idx}]...")

    # Test metrics; zero_division=0 makes the "class never predicted" case an
    # explicit 0 instead of a warning with the same value.
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted', zero_division=0)
    recall = recall_score(true_labels, predictions, average='weighted', zero_division=0)
    f1 = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    print(f"\nTest Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

    # Display names for the classes; assumed to be listed in the order of the
    # integer label codes - TODO confirm against the label mapping.
    label_names = ['Religion', 'business', 'entertainment', 'politics', 'sport', 'tech']
    class_ids = list(range(len(label_names)))

    # Passing `labels` keeps the matrix at one row/column per class even if a
    # class is absent from the test batch, so the tick labels always line up.
    cm_test = confusion_matrix(true_labels, predictions, labels=class_ids)
    sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix DeBERTa Model')
    plt.show()

    # Per-class precision/recall/F1 for the test set
    print("\nClassification Report (Test):")
    print(classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_names,
        zero_division=0,
    ))

    return accuracy

<h1>Execution</h1>

In [None]:
# Run the model on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
torch.cuda.empty_cache()

# Start fine-tuning; train() also scores the validation loader after every epoch.
# NOTE(review): no torch seed is set in the visible cells, so shuffling and dropout
# presumably differ from run to run - confirm whether that is intended.
train(model, train_loader, val_loader, device)


In [None]:
# Once training has finished, score the saved model on the held-out test set.
# The device is resolved again so this cell does not rely on the training cell's variable.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
torch.cuda.empty_cache()
evaluate_test(model, test_loader, device)