In [95]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm


In [8]:
# Load the raw labelled examples; later cells read the 'User Input' and
# 'Polarity' columns from this frame.
DATASET_PATH = 'dataset.csv'
data = pd.read_csv(DATASET_PATH)

In [79]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
all_texts = data['User Input']
all_labels = data['Polarity']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Seed torch before loading: the classification head is not part of the
# checkpoint and is randomly initialised, so an unseeded run is not repeatable.
torch.manual_seed(42)

model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Three output classes: positive, negative and neutral.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # A token added to the tokenizer has no row in the embedding matrix yet;
    # without resizing, its id would index past the end of the embeddings.
    model.resize_token_embeddings(len(tokenizer))

# Keep the model's notion of the padding id in agreement with the tokenizer so
# padding positions can be identified from input_ids later on.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [76]:
class TextDataset(Dataset):
    """Map-style dataset that serves pre-tokenized examples as dictionaries.

    Parameters
    ----------
    input_ids :
        Indexable collection with one entry of token ids per example.
    labels : optional
        Indexable collection aligned position-by-position with ``input_ids``.
        When omitted, items carry no ``"labels"`` key.
    attention_mask : optional
        Indexable collection aligned with ``input_ids`` marking real tokens
        versus padding. When omitted, items carry no ``"attention_mask"`` key,
        which matches the behavior before this argument existed.
    """

    def __init__(self, input_ids, labels=None, attention_mask=None):
        self.input_ids = input_ids
        self.labels = labels
        self.attention_mask = attention_mask

    def __len__(self):
        """Return the number of examples, taken from ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of the available fields."""
        item = {"input_ids": self.input_ids[idx]}
        if self.attention_mask is not None:
            item["attention_mask"] = self.attention_mask[idx]
        if self.labels is not None:
            item["labels"] = self.labels[idx]
        return item

In [80]:
def preprocess_text(text):
    """Return ``text`` lower-cased with leading/trailing whitespace removed."""
    return text.lower().strip()

MAX_LENGTH = 512    # every example is padded/truncated to this many tokens
SUBSET_SIZE = 50    # number of leading rows kept from each split
LABEL_TO_ID = {"Positive": 0, "Negative": 1, "Neutral": 2}


def encode_texts(texts):
    """Tokenize each string in ``texts`` into one fixed-length row of token ids."""
    return torch.tensor(
        [
            tokenizer.encode(text, padding="max_length", truncation=True, max_length=MAX_LENGTH)
            for text in texts
        ]
    )


# Normalise both splits the same way; previously only the training texts were
# lower-cased and stripped, so the test inputs differed from what the model
# was trained on.
train_texts = train_texts.apply(preprocess_text)
test_texts = test_texts.apply(preprocess_text)

# Take the subset first so only the rows that are actually used get tokenized.
train_texts_input_ids = encode_texts(train_texts.iloc[:SUBSET_SIZE])
test_texts_input_ids = encode_texts(test_texts.iloc[:SUBSET_SIZE])

# Map polarity names to class ids and fail loudly on anything unexpected;
# Series.map would otherwise turn unknown names into NaN.
train_label_names = train_labels.iloc[:SUBSET_SIZE]
train_label_ids = train_label_names.map(LABEL_TO_ID)
if train_label_ids.isna().any():
    unknown_names = sorted(set(train_label_names[train_label_ids.isna()].astype(str)))
    raise ValueError(f"Polarity values missing from LABEL_TO_ID: {unknown_names}")

# One class id per example, shape (N,), which is the shape the classification
# model documents for `labels`.
# NOTE: this rebinds train_labels from a Series to a tensor, so the split cell
# has to be re-run before this cell can be executed a second time.
train_labels = torch.tensor(train_label_ids.astype("int64").tolist())




In [101]:
# Same polarity-to-id encoding that is applied to the training labels.
LABEL_TO_ID = {"Positive": 0, "Negative": 1, "Neutral": 2}

# test_labels is still the raw Series produced by the split: polarity strings,
# indexed by original row label, and longer than the encoded test subset.
# Align it positionally with test_texts_input_ids and convert it to class ids
# so that dataset[i] pairs the i-th encoded text with its own label.
test_label_names = test_labels.iloc[:len(test_texts_input_ids)]
test_label_ids = test_label_names.map(LABEL_TO_ID)
if test_label_ids.isna().any():
    unknown_names = sorted(set(test_label_names[test_label_ids.isna()].astype(str)))
    raise ValueError(f"Polarity values missing from LABEL_TO_ID: {unknown_names}")
test_label_ids = torch.tensor(test_label_ids.astype("int64").tolist())

train_dataset = TextDataset(train_texts_input_ids, train_labels)
test_dataset = TextDataset(test_texts_input_ids, test_label_ids)

batch_size = 8
# A dedicated seeded generator makes the training shuffle order repeatable.
shuffle_generator = torch.Generator().manual_seed(42)
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, generator=shuffle_generator
)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


<__main__.TextDataset object at 0x35e2b0c50>


In [None]:
def train_model(model, train_dataloader, num_epochs=3, learning_rate=2e-5):
    """Fine-tune ``model`` on ``train_dataloader`` using AdamW.

    Parameters
    ----------
    model :
        Module that is called with ``input_ids`` and ``labels`` keyword
        arguments (plus ``attention_mask`` when one is available) and returns
        an object exposing ``.loss``.
    train_dataloader :
        Iterable of dict batches holding ``"input_ids"`` and ``"labels"``
        tensors, and optionally ``"attention_mask"``.
    num_epochs : int
        Number of passes over ``train_dataloader``.
    learning_rate : float
        Learning rate handed to AdamW.

    Returns
    -------
    list of float
        Mean batch loss of each epoch, in order.

    Raises
    ------
    ValueError
        If an epoch yields no batches (there is no loss to average).
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.train()

    # Build the optimizer once the parameters live on their final device.
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Used to rebuild a padding mask when the batch does not carry one.
    pad_token_id = getattr(getattr(model, "config", None), "pad_token_id", None)

    epoch_losses = []
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0.0
        num_batches = 0
        progress_bar = tqdm(train_dataloader, desc="Training")
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            # Collapse (batch, 1) labels to (batch,), one class id per example.
            if labels.dim() > 1 and labels.size(-1) == 1:
                labels = labels.squeeze(-1)

            model_inputs = {"input_ids": input_ids, "labels": labels}
            attention_mask = batch.get("attention_mask")
            if attention_mask is not None:
                model_inputs["attention_mask"] = attention_mask.to(device)
            elif pad_token_id is not None:
                # Inputs padded to a fixed length would otherwise have every
                # pad position attended to as if it were real text.
                model_inputs["attention_mask"] = (input_ids != pad_token_id).long()

            optimizer.zero_grad()
            outputs = model(**model_inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            num_batches += 1
            progress_bar.set_postfix(loss=batch_loss)

        if num_batches == 0:
            raise ValueError("train_dataloader yielded no batches; nothing to train on.")

        avg_epoch_loss = epoch_loss / num_batches
        epoch_losses.append(avg_epoch_loss)
        print(f"Average Epoch Loss: {avg_epoch_loss:.4f}")

    print("Training completed.")
    return epoch_losses

# Keep the per-epoch averages so later cells can inspect or plot them.
epoch_losses = train_model(model, train_dataloader, num_epochs=3, learning_rate=2e-5)
