In [10]:
%pip install transformers torch pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load Dataset with Correct Delimiter and Clean Columns
dataset_path = "aqua_OnlineMedia_21-Jun-2024_20-Sep-2024_auY2Vi02Qm.csv"
data = pd.read_csv(dataset_path, delimiter=';')

# Clean column names by stripping spaces and removing quotes
data.columns = data.columns.str.replace('"', '').str.strip()

# Keep only necessary columns
data = data[['title', 'body', 'sentiment']]

# Combine title and body as input text for the model
data['text'] = data['title'] + " " + data['body']

# Remove any rows with missing sentiment
data = data.dropna(subset=['sentiment'])

# Display the cleaned data
print(data.head())

                                               title  \
0  'Peluang Emas, Pabrik Le Minerale (Mayora Grou...   
1  'Melalui Program Persija Belajar Bola Bareng, ...   
2  'Le Minerale Dan Persija Majukan Talenta Muda ...   
3  'Fresh Graduate Merapat PT Tirta Fresindo Jaya...   
4  'Melalui Program Persija Belajar Bola Bareng, ...   

                                                body  sentiment  \
0  'MEDIA PAKUAN - PT Tirta Fresindo Jaya, bagian...   'neutral   
1  'Ini menjadi kesempatan bagi tim dan pelatih u...  'positive   
2  'RM.id Rakyat Merdeka - Le Minerale turut berp...  'positive   
3  'PortalMagetan.com -Info seputar lowongan kerj...  'positive   
4  'JAKARTA, suaramerdeka.com â€“ Le Minerale mendu...  'positive   

                                                text  
0  'Peluang Emas, Pabrik Le Minerale (Mayora Grou...  
1  'Melalui Program Persija Belajar Bola Bareng, ...  
2  'Le Minerale Dan Persija Majukan Talenta Muda ...  
3  'Fresh Graduate Merapat PT Tirta Fr

In [12]:
# 1. Encode Sentiment Labels into Integer Values (0, 1, 2)
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['sentiment'])

# Split into train, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    test_texts, test_labels, test_size=0.5, random_state=42
)

# 2. Tokenization
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
max_length = 128

def tokenize_data(texts, labels):
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    return encodings, torch.tensor(labels.values)  # Ensure labels are numpy array

train_encodings, train_labels = tokenize_data(train_texts, train_labels)
val_encodings, val_labels = tokenize_data(val_texts, val_labels)
test_encodings, test_labels = tokenize_data(test_texts, test_labels)

# 3. Create PyTorch Dataset
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

# 4. Load Model
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=3)
model.to(device)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [13]:
# 6. Training Setup
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Function
def train_epoch(model, loader, optimizer):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Validation Function
def evaluate_model(model, loader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return classification_report(true_labels, predictions, target_names=label_encoder.classes_)

# Training Loop
epochs = 3
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer)
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {train_loss:.4f}")
    print("Validation Results:")
    print(evaluate_model(model, val_loader))

# 7. Save the Model and Tokenizer before Testing
model_save_path = "xlnet_sentiment_model"
tokenizer_save_path = "xlnet_sentiment_tokenizer"

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model and tokenizer saved to {model_save_path} and {tokenizer_save_path}")



Epoch 1/3 - Loss: 0.7075
Validation Results:


ValueError: Number of classes, 2, does not match size of target_names, 3. Try specifying the labels parameter

In [None]:
# Load the saved model and tokenizer
model = XLNetForSequenceClassification.from_pretrained("xlnet_sentiment_model")
tokenizer = XLNetTokenizer.from_pretrained("xlnet_sentiment_tokenizer")

# Move model to the correct device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample paragraphs for testing
sample_paragraphs = [
    "The economic growth in the past year has been exceptional, with an increase in employment and the stock market reaching new highs.",
    "The recent political situation has created a lot of uncertainty and unrest among the citizens. Protests have erupted, and people are concerned about the future.",
    "The weather has been mild this season, with occasional rain and cool temperatures, making it a pleasant time to spend outdoors."
]

# Tokenize the sample paragraphs
inputs = tokenizer(sample_paragraphs, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)

# Get model predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Get predicted labels (0 = Negative, 1 = Neutral, 2 = Positive)
predictions = torch.argmax(logits, dim=-1).cpu().numpy()

# Map the predicted labels back to sentiment classes
sentiment_labels = ['Negative', 'Neutral', 'Positive']

# Print the results for each paragraph
for text, pred in zip(sample_paragraphs, predictions):
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {sentiment_labels[pred]}")
    print("-" * 50)
