In [None]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch

# Dataset wrapper that tokenizes each text when it is accessed
class SentimentDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Tokenization is performed per item inside ``__getitem__``; nothing is
    pre-computed when the dataset is constructed.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors="pt")`` whose result
            exposes ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples (taken from ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return one sample as a dict with input_ids, attention_mask, labels."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size one
        # (return_tensors="pt" on a single text); squeeze it away.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(sample_label, dtype=torch.long)
        return sample

# Load CSV data
# Sentiment name -> class index used for training.
LABEL_TO_ID = {'positive': 1, 'negative': 0, 'neutral': 2}

data = pd.read_csv("aqua_OnlineMedia_21-Jun-2024_20-Sep-2024_auY2Vi02Qm.csv", delimiter=";")

# Rows without a sentiment annotation cannot be used for supervised training.
data = data.dropna(subset=['sentiment']).copy()

# Fill missing titles/bodies before concatenating: NaN + str is NaN, which
# would otherwise hand a float (not a string) to the tokenizer later on.
data['text'] = (
    data['title'].fillna("").astype(str) + " " + data['body'].fillna("").astype(str)
).str.strip()

# Normalise case/whitespace, then fail fast on anything outside the known
# classes. A plain .map() would silently turn unknown values into NaN and
# produce invalid class indices once converted to a long tensor.
sentiment = data['sentiment'].astype(str).str.strip().str.lower()
unknown_sentiments = sorted(set(sentiment) - set(LABEL_TO_ID))
if unknown_sentiments:
    raise ValueError(f"Unexpected sentiment values in CSV: {unknown_sentiments}")

texts = data['text'].tolist()
labels = sentiment.map(LABEL_TO_ID).astype(int).tolist()

# Split into train and validation sets (first 80% / last 20%, in file order)
# NOTE(review): the split is positional, so it depends on the row order of the
# CSV — confirm that a non-shuffled hold-out is what is intended.
split_at = int(len(texts) * 0.8)
train_texts, val_texts = texts[:split_at], texts[split_at:]
train_labels, val_labels = labels[:split_at], labels[split_at:]

# Load XLNet tokenizer and wrap both splits as datasets
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

# Load XLNet model with a 3-class classification head.
# The index <-> name mapping mirrors the one used to encode the labels
# (negative=0, positive=1, neutral=2) and is stored in the model config so the
# saved checkpoint records which index means which sentiment.
id2label = {0: "negative", 1: "positive", 2: "neutral"}
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=3,
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)

# Define training arguments
# Checkpoints are written once per epoch (matching the evaluation cadence) and
# only the two most recent are kept. The previous save_steps=10 wrote a full
# checkpoint every 10 optimizer steps with no limit on how many were retained.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./xlnet-sentiment-model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Build the Trainer from the model, arguments, datasets and tokenizer
trainer = Trainer(
    tokenizer=tokenizer,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning
trainer.train()

# Write the model first, then the tokenizer, into the same export directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("xlnet_sentiment_model")


In [None]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification
import torch

# Load the trained model and tokenizer
model = XLNetForSequenceClassification.from_pretrained("xlnet_sentiment_model")
tokenizer = XLNetTokenizer.from_pretrained("xlnet_sentiment_model")
model.eval()  # make inference mode explicit (disables dropout)

# Example texts to test the model
test_texts = [
    "Produk ini luar biasa! Saya sangat menyukainya.",
    "Saya sangat kecewa dengan layanan ini.",
    "Tidak buruk, tetapi juga tidak istimewa."
]

# Tokenize the input texts.
# max_length matches the 128-token limit used by SentimentDataset during
# training; without an explicit value, truncation=True has no length to
# truncate to unless the tokenizer itself defines a model maximum.
test_encodings = tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Run inference without tracking gradients
with torch.no_grad():
    outputs = model(**test_encodings)
    predictions = torch.argmax(outputs.logits, dim=1)

# Map predictions to sentiment labels (same indices as used for training)
label_mapping = {1: 'positive', 0: 'negative', 2: 'neutral'}
predicted_labels = [label_mapping[pred.item()] for pred in predictions]
print(predicted_labels)  # Output the sentiments
