In [None]:
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss
from torch.optim import lr_scheduler
from tqdm import tqdm

In [None]:
# Define Dataset Class
class ArticleDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and attaches its label.

    Tokenization happens lazily in ``__getitem__``, so only the raw texts and
    labels are held in memory.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Keep references to the samples and the tokenization settings.

        Args:
            texts: Indexable collection of input texts.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
                padding=..., max_length=..., return_tensors="pt")`` whose
                result exposes ``.items()``.
            max_length: Length every sequence is padded / truncated to.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        The dict holds every entry produced by the tokenizer plus a ``'label'``
        entry containing the label as a ``torch.long`` tensor.
        """
        text, label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        sample = {}
        for name, tensor in encoded.items():
            # squeeze(0) removes the leading dimension when it has size 1
            # (presumably the batch axis added by return_tensors="pt").
            sample[name] = tensor.squeeze(0)
        sample['label'] = torch.tensor(label, dtype=torch.long)
        return sample

In [None]:
# Read the article CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data_path = "cleaned_combined_output.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Map the textual sentiment labels to the integer class ids used for training
# ('positive': 2, 'neutral': 1, 'negative': 0).
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}

# Series.map turns every value that is not a key of the mapping into NaN. That
# includes the integer ids produced by a previous run of this cell, so only map
# while the column is not already fully encoded (keeps the cell idempotent).
if not data['sentiment'].isin(list(label_mapping.values())).all():
    encoded_sentiment = data['sentiment'].map(label_mapping)
    unmapped_mask = encoded_sentiment.isna()
    if unmapped_mask.any():
        # Fail here with the offending values instead of letting NaN labels
        # reach the training code.
        unexpected = data.loc[unmapped_mask, 'sentiment'].unique().tolist()
        raise ValueError(
            f"Unexpected sentiment labels {unexpected!r}; "
            f"expected one of {sorted(label_mapping)}"
        )
    data['sentiment'] = encoded_sentiment.astype('int64')


In [None]:
# Two-stage split: hold out 30% of the rows first, then cut that hold-out in
# half, giving 70% train / 15% validation / 15% test. Both stages use the same
# fixed random_state so the partition is reproducible.
article_bodies = data['body']
sentiment_ids = data['sentiment']

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    article_bodies,
    sentiment_ids,
    test_size=0.3,
    random_state=42,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
)


In [None]:
# Tokenization / batching parameters
max_length = 512   # every sequence is padded or truncated to this many tokens
batch_size = 16    # samples per DataLoader batch

# Pretrained XLNet tokenizer matching the "xlnet-base-cased" checkpoint
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")


In [None]:
# Wrap each split in an ArticleDataset. The pandas Series are converted to
# plain lists so samples are looked up by position rather than by index label.
train_dataset, val_dataset, test_dataset = (
    ArticleDataset(split_texts.tolist(), split_labels.tolist(), tokenizer, max_length)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

In [None]:
# Batch the splits. Only the training loader shuffles; validation and test
# keep DataLoader's default (unshuffled) order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_split, batch_size=batch_size)
    for eval_split in (val_dataset, test_dataset)
)

In [None]:
# Pretrained XLNet with a 3-way classification head (one output per sentiment
# class), moved to the GPU when CUDA is available and to the CPU otherwise.
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=3,
)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

In [None]:
# Training objective: cross-entropy over the class logits
loss_fn = CrossEntropyLoss()

# AdamW over all model parameters with a learning rate of 2e-5
optimizer = AdamW(model.parameters(), lr=2e-5)

# Multiply the learning rate by 0.1 after every 2 scheduler steps
scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)


In [None]:
# Training function
def train_epoch(model, data_loader, optimizer, loss_fn, device):
    """Run one optimization pass over ``data_loader`` and return the mean loss.

    Args:
        model: Module called as ``model(**inputs)``; its output must expose
            ``.logits``.
        data_loader: Iterable of dict batches. The ``'label'`` entry holds the
            targets; every other entry is forwarded to the model as a keyword
            argument.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
            The reported average assumes it returns the batch mean (the default
            reduction of ``CrossEntropyLoss``).
        device: Device the batch tensors are moved to.

    Returns:
        float: Training loss averaged per sample over the epoch.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    total_samples = 0
    for batch in tqdm(data_loader, desc="Training", leave=False):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'label'}
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a smaller final batch does not
        # count as much as a full one in the epoch average.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

    if total_samples == 0:
        raise ValueError("train_epoch received an empty data_loader; nothing to train on.")
    return total_loss / total_samples

In [None]:
# Validation function
def evaluate_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(**inputs)``; its output must expose
            ``.logits``.
        data_loader: Iterable of dict batches. The ``'label'`` entry holds the
            targets; every other entry is forwarded to the model as a keyword
            argument.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
            The reported average assumes it returns the batch mean (the default
            reduction of ``CrossEntropyLoss``).
        device: Device the batch tensors are moved to.

    Returns:
        tuple[float, float]: ``(mean loss per sample, accuracy)``, where
        accuracy is the fraction of samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation", leave=False):
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'label'}
            labels = batch['label'].to(device)
            outputs = model(**inputs)

            # Weight each batch mean by its size so a smaller final batch does
            # not count as much as a full one in the overall average.
            batch_size = labels.size(0)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item() * batch_size

            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("evaluate_model received an empty data_loader; nothing to evaluate.")
    accuracy = correct / total
    return total_loss / total, accuracy


In [None]:
# Fine-tune for a fixed number of epochs: each epoch trains on the training
# loader, scores the validation loader, then advances the LR schedule.
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )
    val_loss, val_accuracy = evaluate_model(
        model=model,
        data_loader=val_loader,
        loss_fn=loss_fn,
        device=device,
    )

    # The scheduler is stepped once per epoch, after that epoch's optimizer updates.
    scheduler.step()

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

# Persist the weights from the final epoch; model and tokenizer are written to
# separate directories.
model.save_pretrained("xlnet_sentiment_model_v2")
tokenizer.save_pretrained("xlnet_sentiment_tokenizer_v2")

print("Model training complete and saved!")