In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import pandas as pd
# Load CSV files
train_data = pd.read_csv(r'D:\Downloads\dataset\X_train_cleaned.csv')
test_data = pd.read_csv(r'D:\Downloads\dataset\X_test_cleaned.csv')

# Extract features and targets
X_train_cleaned = train_data['review_body'].tolist()  # Text data
Y_train = train_data['sentiment'].tolist()  # Labels
X_test_cleaned = test_data['review_body'].tolist()
Y_test = test_data['sentiment'].tolist()

# Ensure the data is a list of strings
X_train_cleaned = [str(item) for item in X_train_cleaned]
X_test_cleaned = [str(item) for item in X_test_cleaned]

# Split training data into train and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_cleaned, Y_train, test_size=0.2, random_state=42, stratify=Y_train
)

# Load Roberta tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize and pad sequences to a fixed length for train, validation, and test sets
X_train_encoded = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt', max_length=128)
X_val_encoded = tokenizer(X_val, padding=True, truncation=True, return_tensors='pt', max_length=128)
X_test_encoded = tokenizer(X_test_cleaned, padding=True, truncation=True, return_tensors='pt', max_length=128)

# Convert labels to tensors
Y_train_tensor = torch.tensor(Y_train)
Y_val_tensor = torch.tensor(Y_val)
Y_test_tensor = torch.tensor(Y_test)

# Create TensorDatasets for train, validation, and test sets
train_dataset = TensorDataset(X_train_encoded['input_ids'], X_train_encoded['attention_mask'], Y_train_tensor)
val_dataset = TensorDataset(X_val_encoded['input_ids'], X_val_encoded['attention_mask'], Y_val_tensor)
test_dataset = TensorDataset(X_test_encoded['input_ids'], X_test_encoded['attention_mask'], Y_test_tensor)

# Create DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load Roberta model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)  # For 3 classes (0, 1, 2)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')



In [2]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
for epoch in tqdm(range(3)):  # Loop through the epochs
    print(f"Epoch {epoch + 1}")
    model.train()
    total_train_loss = 0

    loop = tqdm(train_loader, desc="Training", leave=False)
    for batch in loop:
        input_ids, attention_mask, labels = [b.to('cuda' if torch.cuda.is_available() else 'cpu') for b in batch]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

        loop.set_postfix(loss=loss.item())

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # Validation loop
    model.eval()
    total_val_loss = 0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        loop = tqdm(val_loader, desc="Validating", leave=False)
        for batch in loop:
            input_ids, attention_mask, labels = [b.to('cuda' if torch.cuda.is_available() else 'cpu') for b in batch]

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_preds += (preds == labels).sum().item()
            total_preds += labels.size(0)

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = correct_preds / total_preds
    print(f"Validation loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

# Test loop
model.eval()
correct_preds = 0
total_preds = 0

with torch.no_grad():
    loop = tqdm(test_loader, desc="Testing", leave=False)
    for batch in loop:
        input_ids, attention_mask, labels = [b.to('cuda' if torch.cuda.is_available() else 'cpu') for b in batch]

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)

test_accuracy = correct_preds / total_preds
print(f"Test set accuracy: {test_accuracy:.4f}")




  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1


Training:   0%|          | 0/17163 [00:00<?, ?it/s]

Average training loss: 0.5195


Validating:   0%|          | 0/4291 [00:00<?, ?it/s]

Validation loss: 0.4883, Accuracy: 0.7933
Epoch 2


Training:   0%|          | 0/17163 [00:00<?, ?it/s]

Average training loss: 0.4558


Validating:   0%|          | 0/4291 [00:00<?, ?it/s]

Validation loss: 0.4764, Accuracy: 0.8024
Epoch 3


Training:   0%|          | 0/17163 [00:00<?, ?it/s]

Average training loss: 0.4158


Validating:   0%|          | 0/4291 [00:00<?, ?it/s]

Validation loss: 0.5048, Accuracy: 0.7967


Testing:   0%|          | 0/5364 [00:00<?, ?it/s]

Test set accuracy: 0.7977


In [3]:
import os
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Define directories to save the model and tokenizer
model_save_path = "./saved_model"
os.makedirs(model_save_path, exist_ok=True)

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")


Model and tokenizer saved to ./saved_model


In [2]:
import os
import pandas as pd
from sklearn.metrics import classification_report
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

# Define the path to the saved model and tokenizer
model_save_path = "./saved_model"

# Load the model and tokenizer
model = RobertaForSequenceClassification.from_pretrained(model_save_path)
tokenizer = RobertaTokenizer.from_pretrained(model_save_path)

print(f"Model and tokenizer loaded from {model_save_path}")

# Load the data
test_data = pd.read_csv(r"D:/Downloads/dataset/X_test_cleaned.csv").dropna()

# Define a custom dataset class for tokenized inputs
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Prepare the test dataset
X_test_cleaned = test_data["review_body"]
Y_test = test_data["sentiment"]

test_dataset = CustomDataset(X_test_cleaned, Y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

# Evaluate the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Generate classification report
report = classification_report(all_labels, all_preds, target_names=["Class 0", "Class 1"])
print("Classification Report:\n")
print(report)


Model and tokenizer loaded from ./saved_model


ValueError: Number of classes, 3, does not match size of target_names, 2. Try specifying the labels parameter

In [3]:

# Generate classification report
report = classification_report(all_labels, all_preds, target_names=["Class 0", "Class 1","Class 2"])
print("Classification Report:\n")
print(report)


Classification Report:

              precision    recall  f1-score   support

     Class 0       0.76      0.88      0.82     28603
     Class 1       0.75      0.64      0.69     28599
     Class 2       0.88      0.88      0.88     28595

    accuracy                           0.80     85797
   macro avg       0.80      0.80      0.79     85797
weighted avg       0.80      0.80      0.79     85797

