Muhamad Nahrowi Tesis, IndoBERT - LSTM

Pra-Pemrosesan

In [6]:
import pandas as pd
import re
from transformers import BertTokenizer, BertModel
import torch
import emoji

# Load the dataset from a CSV file in the working directory.
# The code below reads its 'comment', 'sarcasm' and 'emoticon' columns.
df = pd.read_csv('dataset_politik_indonesia.csv')

# 1. Bersihkan Teks
def clean_text(text):
    """Normalise a raw comment before it is tokenised.

    The text is lower-cased, URLs starting with ``http`` are removed, and every
    character other than ``a``-``z`` and whitespace is dropped. That includes
    digits, punctuation and emoji. Removed spans are replaced by a space so
    that words separated only by punctuation (``"halo,dunia"``) stay separate,
    and runs of whitespace are collapsed to a single space.

    Args:
        text: The raw comment. Anything that is not a string (for example the
            NaN pandas uses for an empty cell) is treated as empty text.

    Returns:
        The cleaned, lower-case text with single spaces between words.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Case folding
    text = re.sub(r'http\S+', ' ', text)  # Remove URLs
    # After lower-casing only a-z can remain as letters; substitute a space
    # rather than nothing so neighbouring words are not glued together.
    text = re.sub(r'[^a-z\s]', ' ', text)
    return ' '.join(text.split())

# Store the normalised text next to the raw comment.
df['cleaned_comment'] = df['comment'].map(clean_text)

# 2. Label encoding: cast both annotation columns to integers.
df['label'], df['has_emoticon'] = (
    df[source_column].astype(int) for source_column in ('sarcasm', 'emoticon')
)

# 3. Feature extraction with IndoBERT: the tokenizer and the encoder are
# loaded from the same pretrained checkpoint, tokenizer first.
tokenizer, model = (
    loader.from_pretrained('indobenchmark/indobert-base-p1')
    for loader in (BertTokenizer, BertModel)
)

def get_bert_embedding(text):
    """Return a mean-pooled IndoBERT vector for one text.

    The text is tokenised with the module-level ``tokenizer`` (truncated to at
    most 50 tokens) and passed through the module-level ``model``. The last
    hidden states are then averaged over dimension 1 and squeezed.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the averaged last hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=50)
    # This is feature extraction only, so run without autograd: no graph is
    # built or kept alive for each row, and the result needs no detach().
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        pooled = hidden.mean(dim=1).squeeze()
    return pooled.numpy()

# Embed every cleaned comment; get_bert_embedding is called once per row.
df['bert_embedding'] = df['cleaned_comment'].map(get_bert_embedding)

# 4. Ekstraksi Fitur Emotikon
def extract_emoticons(text):
    """Collect the emoji characters that occur in a comment.

    Args:
        text: The raw comment. Anything that is not a string (for example the
            NaN pandas uses for an empty cell) is treated as containing no
            emoji instead of raising.

    Returns:
        A list with one entry per character for which ``emoji.is_emoji`` is
        true, in order of appearance; repeated emoji are kept.
    """
    if not isinstance(text, str):
        return []
    # NOTE(review): the check runs one character at a time, so an emoji made
    # of several code points would not come back as a single item. Confirm
    # whether that matters for this dataset.
    return [char for char in text if emoji.is_emoji(char)]

df['emoticons'] = df['comment'].map(extract_emoticons)

# 5. Emoticon representation
# The emoticons can be represented with one-hot encoding or with a dedicated emoticon embedding.
# A simple one-hot (multi-label) encoding is used here:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
emoticon_features = mlb.fit_transform(df['emoticons'])

# Join the IndoBERT features with the emoticon features
import numpy as np

# Stacking only succeeds when every bert_embedding vector has the same length
bert_embeddings = np.stack(df['bert_embedding'].tolist())

# One row per comment: embedding columns first, emoticon columns after
combined_features = np.concatenate((bert_embeddings, emoticon_features), axis=1)

# Final feature matrix, ready for model training
print(combined_features.shape)  # Check that the dimensions are as expected

(2000, 788)


In [18]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch

df['combined_features'] = list(combined_features)

# Build the training tensors: float32 features, integer class labels
X = torch.tensor(np.stack(df['combined_features'].to_list()), dtype=torch.float32)
y = torch.tensor(df['label'].to_numpy(), dtype=torch.long)

# 80/20 train/validation split
# NOTE(review): no generator or seed is passed to random_split here, so the
# split differs between runs unless a seed is set elsewhere.
dataset = TensorDataset(X, y)
val_size = len(dataset) - int(0.8 * len(dataset))
train_size = len(dataset) - val_size
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Only the training batches are reshuffled each epoch
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

# Define LSTM Model
# class SarcasmClassifier(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout_prob):
#         super(SarcasmClassifier, self).__init__()
#         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_prob)
#         self.fc = nn.Linear(hidden_size, num_classes)
#         self.dropout = nn.Dropout(dropout_prob)

#     def forward(self, x):
#         h0 = torch.zeros(2, x.size(0), 128).to(x.device)  # Initialize hidden state
#         c0 = torch.zeros(2, x.size(0), 128).to(x.device)  # Initialize cell state
#         out, _ = self.lstm(x, (h0, c0))
#         out = self.dropout(out[:, -1, :])
#         out = self.fc(out)
#         return out

class SarcasmClassifier(nn.Module):
    """LSTM followed by dropout and a linear layer that outputs class logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout_prob: Dropout probability, used between stacked LSTM layers
            and on the final LSTM output before the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout_prob):
        super().__init__()
        # nn.LSTM applies its own dropout only between stacked layers and
        # warns when it is requested for a single layer, so only pass it on
        # when it can have an effect.
        lstm_dropout = dropout_prob if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        """Return logits of shape (batch_size, num_classes).

        Args:
            x: Either (batch_size, input_size), which is treated as a sequence
                of length one, or (batch_size, sequence_length, input_size).
        """
        # The LSTM is batch-first and needs a sequence dimension.
        if x.dim() == 2:
            x = x.unsqueeze(1)

        # No explicit (h0, c0) is passed: nn.LSTM then starts from zeros that
        # match the input's dtype and device, so the model also works after
        # .double() / .half() or on another device.
        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])  # Keep only the last time step
        return self.fc(out)

# Hyperparameters
input_size = X.size(1)  # one input feature per column of X
hidden_size, num_layers, num_classes = 128, 2, 2
dropout_prob = 0.5
num_epochs, learning_rate = 50, 0.001

# Training runs on the CPU
device = torch.device('cpu')

# NOTE(review): this rebinds the module-level name `model`, which
# get_bert_embedding also reads. Calling that function after this point would
# reach the classifier instead of the IndoBERT encoder; a distinct name for
# one of the two would avoid that.
model = SarcasmClassifier(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
    dropout_prob=dropout_prob,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training and Validation Loop with Early Stopping
# Each epoch makes one pass over train_loader with weight updates and one pass
# over val_loader without gradients. Training stops once the mean validation
# loss has failed to improve for `patience` consecutive epochs, and the model
# is left holding the weights from the epoch with the best validation loss.
import copy

best_val_loss = float('inf')
patience, trials = 5, 0  # Early stopping criteria
best_state = None  # Copy of the weights at the best validation loss so far

for epoch in range(num_epochs):
    model.train()
    total_loss, correct, total = 0, 0, 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)  # index of the largest logit
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    train_accuracy = 100 * correct / total
    val_loss, val_correct, val_total = 0, 0, 0
    model.eval()

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Mean of the per-batch losses
    val_loss /= len(val_loader)
    val_accuracy = 100 * val_correct / val_total

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, '
          f'Accuracy: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trials = 0
        # state_dict() returns references to the live tensors, so copy them;
        # otherwise later optimizer steps would overwrite the snapshot.
        best_state = copy.deepcopy(model.state_dict())
    else:
        trials += 1
        if trials >= patience:
            print("Early stopping triggered.")
            break

# Restore the best-validation weights rather than keeping whatever the last
# (possibly worse) epoch produced. best_state stays None only if no epoch
# ever improved on the initial infinite loss.
if best_state is not None:
    model.load_state_dict(best_state)

Epoch [1/50], Loss: 0.1800, Accuracy: 98.19%, Val Loss: 0.0008, Val Accuracy: 100.00%
Epoch [2/50], Loss: 0.0008, Accuracy: 100.00%, Val Loss: 0.0002, Val Accuracy: 100.00%
Epoch [3/50], Loss: 0.0004, Accuracy: 100.00%, Val Loss: 0.0001, Val Accuracy: 100.00%
Epoch [4/50], Loss: 0.0003, Accuracy: 100.00%, Val Loss: 0.0001, Val Accuracy: 100.00%
Epoch [5/50], Loss: 0.0002, Accuracy: 100.00%, Val Loss: 0.0000, Val Accuracy: 100.00%
Epoch [6/50], Loss: 0.0001, Accuracy: 100.00%, Val Loss: 0.0000, Val Accuracy: 100.00%
Epoch [7/50], Loss: 0.0001, Accuracy: 100.00%, Val Loss: 0.0000, Val Accuracy: 100.00%
Epoch [8/50], Loss: 0.0001, Accuracy: 100.00%, Val Loss: 0.0000, Val Accuracy: 100.00%
Epoch [9/50], Loss: 0.0001, Accuracy: 100.00%, Val Loss: 0.0000, Val Accuracy: 100.00%
Epoch [10/50], Loss: 0.0001, Accuracy: 100.00%, Val Loss: 0.0000, Val Accuracy: 100.00%
Epoch [11/50], Loss: 0.0000, Accuracy: 100.00%, Val Loss: 0.0000, Val Accuracy: 100.00%
Epoch [12/50], Loss: 0.0000, Accuracy: 100