## RoBERTA - Version of BERT

In [16]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report


## Data Preprocessing

In [17]:
# Load the dataset
data = pd.read_csv(r'/Users/abdulrabbani/Desktop/SEM-2/Information Storage and retrieval/Project/complete_work/complete_work/data.csv')

# Function to preprocess the data
def preprocess_data(data):
    data = data.dropna(subset=['selected_text', 'sentiment'])
    sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    data['sentiment_label'] = data['sentiment'].map(sentiment_mapping)
    return data

# Preprocess the data
processed_data = preprocess_data(data)

# Splitting the data into train and test sets
train_data, test_data = train_test_split(processed_data, test_size=0.2, random_state=42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment_label'] = data['sentiment'].map(sentiment_mapping)


## Tokenizer for RoBERTa to encdode the tweet text to unravel the sentiment context of the tweet

In [18]:
# Tokenizer for RoBERTa
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Creating a custom dataset for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }




## Creating Dataset and Dataloaders in batch size to give as an input to the RoBERTa model

In [None]:
# Create datasets and dataloaders
train_dataset = SentimentDataset(train_data['selected_text'], train_data['sentiment_label'], tokenizer)
test_dataset = SentimentDataset(test_data['selected_text'], test_data['sentiment_label'], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=7, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=7, shuffle=False)

## Training and Evaluation of RoBERTa model

In [19]:
# Preparing for model training with RoBERTa
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Function for training, evaluation, and generating classification report
# Function for training and evaluation
def train_eval_model(model, train_loader, test_loader, optimizer, device, epochs=1):
    for epoch in range(epochs):
        # Training
        model.train()
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        predictions, true_labels = [], []
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                logits = outputs[1]
                predictions.extend(torch.argmax(logits, dim=1).tolist())
                true_labels.extend(labels.tolist())

        # Calculate metrics
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
        print(f'Epoch {epoch + 1}/{epochs} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
        report = classification_report(true_labels, predictions, output_dict=True)
        bert_classification_report = pd.DataFrame(report).transpose()
        return bert_classification_report



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Classification Report of Large Language Model - RoBERTa

In [20]:
# Train the model and generate classification report
classification_report_df = train_eval_model(model, train_loader, test_loader, optimizer, device)

# Display the classification report
print(classification_report_df)

Epoch 1/1 - Accuracy: 0.8923, Precision: 0.8937, Recall: 0.8923, F1: 0.8925
              precision    recall  f1-score      support
0              0.862363  0.900763  0.881145  1572.000000
1              0.882559  0.900716  0.891545  2236.000000
2              0.937659  0.873223  0.904294  1688.000000
accuracy       0.892285  0.892285  0.892285     0.892285
macro avg      0.894194  0.891567  0.892328  5496.000000
weighted avg   0.893705  0.892285  0.892486  5496.000000


## THE END