In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd


In [2]:
# Load CSV files
train_data = pd.read_csv(r'D:\Downloads\dataset\X_train_cleaned.csv')
test_data = pd.read_csv(r'D:\Downloads\dataset\X_test_cleaned.csv')

# Extract features and targets
X_train_cleaned = train_data['review_body'].tolist()  # Text data
Y_train = train_data['sentiment'].tolist()  # Labels
X_test_cleaned = test_data['review_body'].tolist()
Y_test = test_data['sentiment'].tolist()


In [3]:
# Ensure the data is a list of strings
X_train_cleaned = [str(item) for item in X_train_cleaned]
X_test_cleaned = [str(item) for item in X_test_cleaned]


In [4]:
# Load Roberta tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize and pad sequences to a fixed length
X_train_encoded = tokenizer(X_train_cleaned, padding=True, truncation=True, return_tensors='pt', max_length=128)
X_test_encoded = tokenizer(X_test_cleaned, padding=True, truncation=True, return_tensors='pt', max_length=128)

# Convert labels to tensors
Y_train_tensor = torch.tensor(Y_train)
Y_test_tensor = torch.tensor(Y_test)


In [5]:
# Create TensorDatasets for training and testing
train_dataset = TensorDataset(X_train_encoded['input_ids'], X_train_encoded['attention_mask'], Y_train_tensor)
test_dataset = TensorDataset(X_test_encoded['input_ids'], X_test_encoded['attention_mask'], Y_test_tensor)

# Create DataLoader for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [6]:
# Create TensorDatasets for training and testing
train_dataset = TensorDataset(X_train_encoded['input_ids'], X_train_encoded['attention_mask'], Y_train_tensor)
test_dataset = TensorDataset(X_test_encoded['input_ids'], X_test_encoded['attention_mask'], Y_test_tensor)

# Create DataLoader for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [7]:
# Load Roberta model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)  # For 3 classes (0, 1, 2)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from tqdm.notebook import tqdm  # For Jupyter Notebooks

# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop with validation
for epoch in tqdm(range(3)):  # Loop through the epochs
    loop = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
    total_loss = 0
    model.train()

    for batch in loop:
        input_ids, attention_mask, labels = [b.to('cuda' if torch.cuda.is_available() else 'cpu') for b in batch]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

        loop.set_postfix(loss=loss.item())

    print(f'Epoch {epoch + 1} loss: {total_loss / len(train_loader)}')

    # Validation loop
    model.eval()
    val_loss = 0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            input_ids, attention_mask, labels = [b.to('cuda' if torch.cuda.is_available() else 'cpu') for b in batch]

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits

            # Get predictions
            preds = torch.argmax(logits, dim=1)
            correct_preds += (preds == labels).sum().item()
            total_preds += labels.size(0)

    val_accuracy = correct_preds / total_preds
    print(f'Validation Loss: {val_loss / len(val_loader):.4f}')
    print(f'Validation Accuracy: {val_accuracy:.4f}')




  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/21454 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:

# Final evaluation on the test set
model.eval()
correct_preds = 0
total_preds = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating on Test Set"):
        input_ids, attention_mask, labels = [b.to('cuda' if torch.cuda.is_available() else 'cpu') for b in batch]

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get predictions
        preds = torch.argmax(logits, dim=1)

        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)

    test_accuracy = correct_preds / total_preds
    print(f'Test Accuracy: {test_accuracy:.4f}')