# In [None]:
# RoBERTa-base pretrained model

import torch, os
import pandas as pd
from transformers import pipeline, RobertaForSequenceClassification, RobertaTokenizer, RobertaModel,BertTokenizerFast,BertForSequenceClassification, AutoModelForSequenceClassification
from transformers import GPT2Tokenizer, GPT2Model
from torch.utils.data import Dataset

from torch import cuda
# Run on the GPU whenever CUDA reports one is available; otherwise use the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device


# Release PyTorch's cached CUDA allocations, then trigger a garbage-collection
# pass, before the dataset and model are loaded.
import gc
torch.cuda.empty_cache()
gc.collect()

# Load the pre-processed CSV into a DataFrame.
df_org = pd.read_csv("processed_sentiment.1600000.csv", encoding='utf-8')

# Shuffle every row with a fixed seed so the positional train/val/test split
# performed later is reproducible.
df_org = df_org.sample(frac=1.0, random_state=42)

df_org.head()
# Discard rows that contain any missing value.
df_org = df_org.dropna()

# Strip surrounding whitespace from the class names ONCE and derive both the
# label vocabulary and the per-row ids from the same stripped values.
# Previously the vocabulary was built from the raw names while the lookup used
# the stripped name, which raised KeyError for any padded class name.
class_names = df_org['Class'].map(lambda name: name.strip())
labels = class_names.unique().tolist()
labels
NUM_LABELS = len(labels)

# Two-way mapping between integer ids and class names (ids follow the order in
# which the classes first appear in the shuffled data).
id2label = dict(enumerate(labels))

label2id = {label: idx for idx, label in id2label.items()}

id2label

# Integer target column consumed by the model.
df_org["labels"] = class_names.map(label2id)
df_org.head()
df_org.Class.value_counts().plot(kind='pie', figsize=(5, 5))
from transformers import AutoModel, AutoTokenizer, DistilBertTokenizer, DistilBertModel, AutoModelForSequenceClassification
import torch.nn as nn
from transformers.modeling_outputs import TokenClassifierOutput
# Module-level hyperparameters.
# NOTE(review): within the code visible here these are only mentioned in a
# commented-out LSTM line; confirm whether anything else still reads them.
fchidden = hiddendim_lstm = 256
embeddim = 768
numlayers = 5
# Name of the pretrained checkpoint.
checkpoint = 'roberta-base'


class MyTaskSpecificCustomModel(nn.Module):
    """
    A task-specific custom transformer model. This model loads a pre-trained transformer model and adds a new dropout
    and linear layer at the end for fine-tuning and prediction on specific tasks.
    """
    def __init__(self, checkpoint, num_labels ):
        """
        Args:
            checkpoint (str): The name of the pre-trained model or path to the model weights.
            num_labels (int): The number of output labels in the final classification layer.
        """
        super().__init__()
        # Imported locally: the file-level imports do not bring AutoConfig into scope.
        from transformers import AutoConfig

        self.num_labels = num_labels

        # Ask the backbone to also return per-layer hidden states and attention
        # maps; forward() passes both through in its output.
        config = AutoConfig.from_pretrained(
            checkpoint,
            output_attentions=True,
            output_hidden_states=True,
        )
        self.model = AutoModel.from_pretrained(checkpoint, config=config)

        # Size the head from the backbone's configuration so checkpoints with a
        # different width work; fall back to 768 (the previous hard-coded value).
        hidden_size = getattr(self.model.config, "hidden_size", 768)

        # New layers
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, self.num_labels)

    def forward(self, input_ids = None, attention_mask=None, labels = None ):
        """
        Forward pass for the model.

        Args:
            input_ids (torch.Tensor, optional): Tensor of input IDs. Defaults to None.
            attention_mask (torch.Tensor, optional): Tensor for attention masks. Defaults to None.
            labels (torch.Tensor, optional): Tensor for labels. Defaults to None.

        Returns:
            TokenClassifierOutput: Always returned, with or without labels. Fields:
            - loss: cross-entropy loss when ``labels`` is provided, otherwise None.
            - logits: classification scores (before softmax) computed from the first
              token of every sequence, one row per sequence with ``num_labels`` columns.
            - hidden_states: the backbone's ``hidden_states`` output.
            - attentions: the backbone's ``attentions`` output.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the backbone output is the last hidden state.
        last_hidden_state = outputs[0]
        sequence_outputs = self.dropout(last_hidden_state)

        # Classify each sequence from the representation of its first token.
        logits = self.classifier(sequence_outputs[:, 0, :])

        loss = None
        if labels is not None:
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))

        # Return in every case so that inference (labels=None) yields logits
        # instead of None.
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# Tokenizer and sequence-classification model, both from "roberta-base".
tokenizer = AutoTokenizer.from_pretrained("roberta-base", max_length=512)

model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)


# Positional 90% / 5% / 5% split of the (already shuffled) rows.
SIZE = df_org.shape[0]
train_end = (90 * SIZE) // 100
val_end = (95 * SIZE) // 100

train_texts = list(df_org.Comments[:train_end])
val_texts = list(df_org.Comments[train_end:val_end])
test_texts = list(df_org.Comments[val_end:])

train_labels = list(df_org.labels[:train_end])
val_labels = list(df_org.labels[train_end:val_end])
test_labels = list(df_org.labels[val_end:])

len(train_texts), len(val_texts), len(test_texts)

# Tokenise each split separately, with truncation and padding enabled.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
class DataLoader(Dataset):
    """
    Dataset wrapper pairing tokenized inputs with their labels.

    Despite its name this is a torch.utils.data.Dataset subclass, not a
    batch loader: it serves one example per index.
    """
    def __init__(self, encodings, labels):
        """
        Store the tokenized inputs and the labels.

        Args:
            encodings: Mapping from field name (e.g. 'input_ids',
                       'attention_mask') to a per-example sequence of values.
            labels: Sequence holding one label per example.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """
        Returns:
            (int): Number of examples, taken from the number of labels.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Build the tensors for a single example.

        Args:
            idx (int): Position of the example to fetch.

        Returns:
            sample (dict): One tensor per encoding field plus a 'labels' tensor.
        """
        sample = {}
        # Convert the idx-th entry of every encoding field to a tensor.
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Attach the matching label under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# NOTE(review): this prints the entire training label list — consider
# printing only a short slice.
print(train_labels)

# Wrap each split's encodings and labels in the Dataset class defined above.
# The first two keep their historical "*_dataloader" names although they are
# datasets, not loaders.
train_dataloader, val_dataloader, test_dataset = (
    DataLoader(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)
from transformers import TrainingArguments, Trainer

from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report

def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.

    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): The true class labels.
            - predictions (array-like): Per-class scores; the predicted class
              is the argmax over the last axis.

    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): Support-weighted average of the per-class F1 scores.
            - Precision (float): Support-weighted average of the per-class precision.
            - Recall (float): Support-weighted average of the per-class recall.
            - reports (str): scikit-learn's text classification report.
    """
    # Extract true labels from the input object
    labels = pred.label_ids

    # Predicted class = index of the highest score along the last axis
    preds = pred.predictions.argmax(-1)

    # Weighted (not macro) precision, recall and F1; undefined ratios count as 0
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )

    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)

    # scikit-learn expects (y_true, y_pred). The arguments used to be passed the
    # other way round, which swapped precision with recall in the report and
    # made "support" count predictions instead of true labels.
    # NOTE(review): the names assume id 0 is 'negative' and id 1 is 'positive';
    # confirm against the id2label mapping built from the data.
    target_names = ['negative', 'positive']
    mainreports = classification_report(
        labels,
        preds,
        # Pin the label ids so the report still works when a class is absent.
        labels=list(range(len(target_names))),
        target_names=target_names,
        zero_division=0,
    )

    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall,
        'reports': mainreports
    }


from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub (the run below has push_to_hub=True).
notebook_login()

# Evaluate and checkpoint once per epoch, reload the best checkpoint when
# training finishes, and push the result to the Hub.
training_args = TrainingArguments(
    output_dir="./imdbreviews_classification_roberta_v02",
    num_train_epochs=5,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Fine-tune the pretrained model on the training split, validating on the
# validation split with the metrics function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics,
)


trainer.train()

# Evaluate the trained model on every split, in train / val / test order.
eval_splits = {
    "train": train_dataloader,
    "val": val_dataloader,
    "test": test_dataset,
}
q = [trainer.evaluate(eval_dataset=split) for split in eval_splits.values()]

# One row per split, restricted to the first five metric columns.
pd.DataFrame(q, index=list(eval_splits)).iloc[:, :5]

