In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AlbertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable, sized collection of raw strings.
        labels: Indexable, sized collection of integer class ids, aligned
            position-by-position with ``texts``.
        tokenizer: Callable Hugging Face-style tokenizer. It is invoked as
            ``tokenizer(text, ...)`` and must return a mapping that contains
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every encoded sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail fast: a length mismatch would otherwise surface as an IndexError
        # mid-epoch, or silently ignore the surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of long tensors.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        """
        # Call the tokenizer directly rather than via the deprecated
        # `encode_plus`. Token type ids are not requested because they are
        # never returned from this method.
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Settings for this cell, named once so the split and both loaders stay in sync.
DATA_PATH = 'updated_dataset.csv'
VAL_FRACTION = 0.1        # share of rows held out for validation
SPLIT_SEED = 42           # makes the train/validation split reproducible
LOADER_BATCH_SIZE = 16    # batch size of the two DataLoaders below

# Load the dataset and drop incomplete rows. A new frame is bound instead of
# mutating in place; `.copy()` gives `data` its own storage so the label column
# can be overwritten below.
# NOTE(review): dropna() discards rows with a missing value in ANY column,
# although only 'text' and 'label' are read below -- confirm that is intended.
data = pd.read_csv(DATA_PATH).dropna().copy()

# Encode labels as integer class ids; `label_encoder.classes_` keeps the
# original label values for decoding predictions later.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Tokenization
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
max_len = 128  # every example is padded/truncated to this many tokens

# Train/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['text'],
    data['label'],
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)
assert len(train_texts) + len(val_texts) == len(data), "split lost or duplicated rows"

# Wrap each split in a dataset; plain lists are passed so items are looked up
# by position rather than by the DataFrame index.
train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_len)
val_dataset = TextDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, max_len)

# DataLoaders for manual training/evaluation loops.
# NOTE(review): the Trainer in the next cell is given the datasets, not these
# loaders, so LOADER_BATCH_SIZE has no effect on that training run.
train_loader = DataLoader(train_dataset, batch_size=LOADER_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=LOADER_BATCH_SIZE)


In [None]:
import inspect

from transformers import AlbertForSequenceClassification, Trainer, TrainingArguments


def compute_accuracy(eval_pred):
    """Return ``{'accuracy': ...}`` for a Trainer evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, argmax is
            taken over the last axis) and ``label_ids`` (true class ids).
    """
    predicted = eval_pred.predictions.argmax(-1)
    return {'accuracy': (predicted == eval_pred.label_ids).astype(float).mean()}


# Use the GPU when one is visible, otherwise fall back to CPU so the cell still
# runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model: pretrained ALBERT encoder with a freshly initialised classification
# head sized to the number of encoded label classes.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
).to(device)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever keyword the installed version accepts.
_eval_strategy_kw = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=50,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{_eval_strategy_kw: "epoch"},  # evaluate on the validation set after every epoch
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_accuracy,
)

# Train the model
trainer.train()


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4328,0.473968,0.83969
2,0.3139,0.41253,0.857143
3,0.1883,0.410083,0.868132


TrainOutput(global_step=837, training_loss=0.4078730791985205, metrics={'train_runtime': 919.8021, 'train_samples_per_second': 45.408, 'train_steps_per_second': 0.91, 'total_flos': 249581362857984.0, 'train_loss': 0.4078730791985205, 'epoch': 3.0})