# Enhancing Khmer Sentiment Analysis Using Transformer Models

# Install Dependencies

## For Word Segmentation

In [None]:
! git clone https://github.com/rinabuoy/KhmerNLP

In [None]:
%cd KhmerNLP

In [None]:
!pip install sklearn_crfsuite

In [None]:
import os
from transformers import TFAutoModel,AutoTokenizer,BertModel,AdamW,XLMRobertaTokenizer,XLMRobertaForSequenceClassification, Trainer, TrainingArguments,XLMRobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.metrics import classification_report, confusion_matrix
#from khmerwordsegmentor import KhmerWordSegmentor
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

# Load Data and Preprocessing

In [None]:
# Load the sentiment dataset from a CSV file (absolute path under /content; adjust when the file lives elsewhere).
data = pd.read_csv('/content/kh_sentiment_data_segmented (1).csv')

In [None]:
# Preview the first 10 rows of the loaded data.
data.head(10)

In [None]:
data.isna().sum()  # Count missing values per column

In [None]:
data['Sentiment_label'].value_counts()  # Inspect the label distribution to detect class imbalance

# Word Segmentation

In [None]:
# Initialize the segmentor
# NOTE(review): the only visible import of KhmerWordSegmentor (in the import cell) is commented out,
# so this line raises NameError unless the class is brought into scope some other way — confirm.
seg = KhmerWordSegmentor()
def segment_khmer_text(text, model="lstm"):
    """
    Run the module-level segmentor ``seg`` on one piece of Khmer text.

    Any exception raised while segmenting is reported with ``print`` and then
    swallowed, so a failing row yields an empty string instead of aborting the
    caller (for example a ``Series.apply`` over a whole column).

    Parameters:
        text (str): The Khmer text to segment.
        model (str): Backend name forwarded to ``seg.segment``
            (the notebook uses 'lstm' and 'crf').

    Returns:
        The value returned by ``seg.segment`` on success (expected to be the
        segmented text), or "" when segmentation fails.
    """
    try:
        return seg.segment(text, model=model)
    except Exception as err:
        print(f"Error during segmentation: {err}")
    # Only reached on failure: fall back to an empty result.
    return ""

In [None]:
# Segment every row of the raw 'Text' column with both backends. Extra keyword
# arguments given to Series.apply are forwarded to segment_khmer_text.
data['seg_text_lstm'] = data['Text'].apply(segment_khmer_text, model='lstm')  # LSTM-based segmentation
data['seg_text_crf'] = data['Text'].apply(segment_khmer_text, model='crf')  # CRF-based segmentation

In [None]:
# Preview the first 10 rows, now including the segmented-text columns.
data.head(10)

In [None]:
# Write the DataFrame to a CSV file in the working directory, omitting the index column.
data.to_csv('kh_sentiment_data_segmented.csv', index=False)

In [None]:
# Keep only the model input (LSTM-segmented text) and the target label.
data = data[['seg_text_lstm', 'Sentiment_label']]

# Split into train and validation sets (80% / 20%).
# random_state pins the shuffle so the split, and every metric later computed
# on the validation part, is reproducible across runs. stratify keeps the
# class ratio the same in both parts.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['Sentiment_label'],
)

# Tokenization

In [None]:
# Load the XLM-Roberta tokenizer for the "xlm-roberta-base" checkpoint
# (the same checkpoint name the classifier is later built from).
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Create a custom dataset using PyTorch

In [None]:
#Custom Dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text row per access.

    Args:
        dataframe: pandas DataFrame holding the text and label columns.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``. It must
            return a mapping with 'input_ids' and 'attention_mask' tensors
            that carry a leading batch dimension of size 1.
        max_length: Length every sequence is padded or truncated to.
        text_column: Name of the column holding the text to tokenize.
        label_column: Name of the column holding the class label; values must
            be convertible to a ``torch.long`` tensor.
    """

    def __init__(self, dataframe, tokenizer, max_length,
                 text_column='seg_text_lstm', label_column='Sentiment_label'):
        self.dataframe = dataframe
        # Missing text (NaN/None, e.g. an empty cell after a CSV round trip)
        # becomes "" so the tokenizer always receives a str instead of a float.
        self.texts = [
            "" if pd.isna(text) else str(text)
            for text in dataframe[text_column].tolist()
        ]
        self.labels = dataframe[label_column].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # Count the extracted rows; this is what __getitem__ actually indexes.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        Keys: 'input_ids' and 'attention_mask' (batch dimension removed) and
        'label' (scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text to a fixed length
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            # squeeze(0) drops the batch dimension added by return_tensors="pt"
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }



In [None]:
# Tokenization / batching hyper-parameters
max_length = 128  # every sequence is padded or truncated to this many tokens
batch_size = 16   # number of samples per batch

# Wrap both splits in CustomDataset, sharing the tokenizer and sequence length
train_dataset, val_dataset = (
    CustomDataset(split, tokenizer, max_length) for split in (train_data, val_data)
)

In [None]:
# Sanity check: fetch the first training sample and show each tensor it contains
sample = train_dataset[0]
for caption, field in (
    ("Input IDs:", 'input_ids'),
    ("Attention Mask:", 'attention_mask'),
    ("Label:", 'label'),
):
    print(caption, sample[field])

In [None]:
# Create dataloaders
# Training batches are reshuffled on every pass over the data.
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation order is kept fixed: later cells iterate this loader more than once and
# pair the results of separate passes, which only lines up when the order is stable.
val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define Pre-trained Model

In [None]:
class XLMRClassifier(nn.Module):
    """XLM-RoBERTa encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint name or path passed to
            ``XLMRobertaModel.from_pretrained``.
        num_classes: Number of logits produced by the head (one per class).
        dropout: Dropout probability applied to the sequence representation
            before the linear head. Defaults to 0.1.
    """

    def __init__(self, model_name, num_classes, dropout=0.1):
        super().__init__()
        self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)

        # The head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(self.xlm_roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (unnormalized) class logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Tensor of logits with ``num_classes`` entries per input sequence.
        """
        outputs = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] holds the per-token hidden states. Position 0 along the
        # sequence axis is the first token of every sequence (the <s> start
        # token for XLM-R, which plays the role of BERT's [CLS]).
        sequence_repr = outputs[0][:, 0, :]

        # Regularize, then project to class logits
        return self.fc(self.dropout(sequence_repr))


# Training

In [None]:

# Build the classifier from the pretrained XLM-R base checkpoint with two output classes
model = XLMRClassifier(model_name='xlm-roberta-base', num_classes=2)

# Cross-entropy over the raw logits, optimized with AdamW at a learning rate of 1e-5
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)


In [None]:
# Mean training loss of every finished epoch
loss_history = []

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()  # training mode (dropout active)
    running_loss = 0.0

    # tqdm wraps the loader to show a per-epoch progress bar
    progress = tqdm(train_data_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in progress:
        # Move the batch tensors to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass: the model returns the logits tensor directly
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Average the batch losses of this epoch and record the result
    epoch_loss = running_loss / len(train_data_loader)
    loss_history.append(epoch_loss)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")

In [None]:
# Plot the loss history
# The x-axis is derived from the number of recorded losses rather than from
# `epochs`, so the plot still works (instead of failing on mismatched lengths)
# when training was stopped before all epochs finished.
epoch_axis = range(1, len(loss_history) + 1)
plt.figure(figsize=(8, 6))
plt.plot(epoch_axis, loss_history, marker='o', label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss History During Training')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Switch to evaluation mode (dropout disabled) before scoring the validation set
model.eval()

# Predictions and ground-truth labels accumulated over all validation batches
all_preds = []
all_labels = []

# No gradients are needed for inference
with torch.no_grad():
    for batch in val_data_loader:
        # Move the batch tensors to the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # The model returns the logits directly; the predicted class is the
        # index of the largest logit in each row
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.max(logits, dim=1).indices

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(all_labels, all_preds))

# Confusion matrix: computed once, printed, then drawn as a heatmap
cm = confusion_matrix(all_labels, all_preds)
print("Confusion Matrix:")
print(cm)

class_names = ['Class 0', 'Class 1']
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt

# Make sure dropout is disabled even when this cell is run right after training.
model.eval()

# Positive-class (class 1) probabilities and their labels are gathered in the
# SAME pass, so the two lists are aligned without relying on another cell.
all_probs = []
prob_labels = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for batch in val_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass through the model (returns the logits directly)
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(logits, dim=1)  # Convert logits to probabilities

        all_probs.extend(probs[:, 1].cpu().numpy())  # Probability of class 1
        # Labels are only compared on the host, so they never need the device.
        prob_labels.extend(batch['label'].cpu().numpy())

# AUC Calculation
fpr, tpr, _ = roc_curve(prob_labels, all_probs)
roc_auc = auc(fpr, tpr)

# Plot ROC Curve (the dashed diagonal is the chance-level reference)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(prob_labels, all_probs)

# Plot Precision-Recall Curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# Print AUC score
print(f"AUC: {roc_auc:.4f}")

In [None]:
from sklearn.metrics import matthews_corrcoef
# Matthews correlation coefficient between the true labels and the predictions
# collected by the evaluation cell (all_labels / all_preds must already exist).
mcc = matthews_corrcoef(all_labels, all_preds)
print(f"MCC: {mcc:.4f}")


In [None]:
# Save the entire model.
# This pickles the whole nn.Module, so loading it back requires the
# XLMRClassifier class definition to be importable. Recent PyTorch releases
# also refuse to unpickle arbitrary objects in torch.load unless
# weights_only=False is passed.
model_save_path = "xlmr_sentiment_model_full.pth"
torch.save(model, model_save_path)
print(f"Model saved to {model_save_path}")

# Also save the weights alone. A state_dict is the portable checkpoint format:
# rebuild the model with XLMRClassifier(...) and call load_state_dict on it.
state_dict_save_path = "xlmr_sentiment_model_state_dict.pth"
torch.save(model.state_dict(), state_dict_save_path)
print(f"State dict saved to {state_dict_save_path}")
