In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers torch torchvision torchaudio datasets tqdm scikit-learn imbalanced-learn


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import (
    XLNetTokenizer, XLNetForSequenceClassification, BertTokenizer, BertForMaskedLM,
    get_scheduler
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Dataset
df = pd.read_csv('/content/drive/MyDrive/research/dataset_new/Reviews.csv')

# Map Scores to Sentiment Labels
label_mapping = {1: "Very Poor", 2: "Poor", 3: "Neutral", 4: "Good", 5: "Very Good"}
df['label'] = df['Score'].map(label_mapping)

# Convert labels to numerical format
label_to_idx = {label: idx for idx, label in enumerate(label_mapping.values())}
df['label'] = df['label'].map(label_to_idx)

# Compute Class Weights for Imbalance Handling
class_weights = compute_class_weight('balanced', classes=np.unique(df['label']), y=df['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Tokenizer for XLNet
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

# Augment Minority Classes using BERT-MLM
mlm_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
mlm_model = BertForMaskedLM.from_pretrained("bert-base-uncased").to(device)

def augment_text_batch(texts, batch_size=16):
    """Applies BERT-based MLM augmentation in batches to prevent OOM."""
    augmented_texts = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Augmenting Texts"):
        batch_texts = texts[i : i + batch_size]
        inputs = mlm_tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

        with torch.no_grad():
            outputs = mlm_model(**inputs).logits

        mask_token_index = (inputs["input_ids"] == mlm_tokenizer.mask_token_id).nonzero(as_tuple=True)
        predictions = torch.argmax(outputs, dim=-1)

        for j in range(len(batch_texts)):
            masked_indices = (inputs["input_ids"][j] == mlm_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
            if len(masked_indices) > 0:
                inputs["input_ids"][j, masked_indices] = predictions[j, masked_indices]
            augmented_texts.append(mlm_tokenizer.decode(inputs["input_ids"][j], skip_special_tokens=True))

        # Free GPU memory after each batch
        torch.cuda.empty_cache()

    return augmented_texts


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

In [None]:

# Apply Augmentation to Minority Classes
minority_classes = [1, 3, 4]  # "Poor", "Neutral", "Good"
augmented_data = []
for class_label in minority_classes:
    class_samples = df[df['label'] == class_label].sample(n=1000, random_state=42)
    aug_texts = augment_text_batch(class_samples['Text'].tolist(), batch_size=16)
    aug_df = pd.DataFrame({"Text": aug_texts, "label": class_label})
    augmented_data.append(aug_df)

df = pd.concat([df] + augmented_data)

# Split Data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Text'], df['label'], test_size=0.3, stratify=df['label'])

# Custom Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], padding="max_length", truncation=True, max_length=self.max_len, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Create Dataset & Dataloader
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

# Reduce batch size to avoid OOM
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


Augmenting Texts: 100%|██████████| 63/63 [00:11<00:00,  5.53it/s]
Augmenting Texts: 100%|██████████| 63/63 [00:10<00:00,  5.78it/s]
Augmenting Texts: 100%|██████████| 63/63 [00:10<00:00,  6.24it/s]


In [None]:

# Define XLNet Model
# Define XLNet Model
class XLNetSentiment(nn.Module):
    def __init__(self):
        super(XLNetSentiment, self).__init__()
        self.xlnet = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=5)
        # Remove the line below as it's causing the error
        # self.xlnet.gradient_checkpointing_enable()  # Enable gradient checkpointing

    def forward(self, input_ids, attention_mask, labels=None):
        return self.xlnet(input_ids, attention_mask=attention_mask, labels=labels)
# Initialize Model
model = XLNetSentiment().to(device)

# Focal Loss Function
class FocalLoss(nn.Module):
    def __init__(self, alpha=class_weights, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        ce_loss = F.cross_entropy(logits, targets, weight=self.alpha, reduction="none")
        pt = torch.exp(-ce_loss)
        return ((1 - pt) ** self.gamma * ce_loss).mean()

# Optimizer & Loss
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = FocalLoss()
scaler = torch.cuda.amp.GradScaler()  # Mixed Precision Training

# Training Loop
def train_model(model, train_loader, val_loader, optimizer, loss_fn, epochs=1):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

        for batch in progress_bar:
            optimizer.zero_grad()
            batch = {key: val.to(device) for key, val in batch.items()}

            with torch.cuda.amp.autocast():
                outputs = model(batch["input_ids"], batch["attention_mask"], batch["label"])
                loss = loss_fn(outputs.logits, batch["label"])

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))

# Train Model
train_model(model, train_loader, val_loader, optimizer, loss_fn)

# Evaluate Model
model.eval()
preds, labels = [], []
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating"):
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(batch["input_ids"], batch["attention_mask"])
        preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        labels.extend(batch["label"].cpu().numpy())

# Print Metrics
print("Classification Report:\n", classification_report(labels, preds))
print("Accuracy:", accuracy_score(labels, preds))


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()  # Mixed Precision Training
  with torch.cuda.amp.autocast():
Epoch 1: 100%|██████████| 25002/25002 [1:30:56<00:00,  4.58it/s, loss=0.652]
Evaluating: 100%|██████████| 10715/10715 [31:39<00:00,  5.64it/s]


Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.75      0.74     15680
           1       0.49      0.54      0.51      9231
           2       0.49      0.59      0.54     12792
           3       0.19      0.83      0.31     24497
           4       0.97      0.22      0.36    109237

    accuracy                           0.40    171437
   macro avg       0.57      0.59      0.49    171437
weighted avg       0.78      0.40      0.41    171437

Accuracy: 0.4033201700916372
