<a href="https://colab.research.google.com/github/martiny76/Files-PdfConverter/blob/main/AL_RL_PoC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
ENV = 'COLAB'  #@param ["COLAB", "CONDA"]
SAMPLING_STRATEGY = "random"  #@param ["uncertainty", "margin", "random"]
DATASET_NAME = "SetFit/subj" #@param ["ag_news", "SetFit/subj"]
MODEL_NAME = 'bert-base-uncased' #@param ['bert-base-uncased']
NUM_CLASSES = {
    "ag_news": 4,
    "SetFit/subj": 2
}
BATCH_SIZE_TRAIN = 1
BATCH_SIZE = 64

In [2]:
if ENV == 'COLAB':
  !pip install datasets --quiet

Data part

In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Load dataset
dataset = load_dataset(DATASET_NAME)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Process dataset
class AGNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def encode_data(tokenizer, texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
    return AGNewsDataset(encodings, labels)

# Test set is pre-defined
test_texts, test_labels = dataset['train']['text'], dataset['train']['label']

# Split train data into training and dev sets
train_texts, dev_texts, train_labels, dev_labels = train_test_split(
    dataset['train']['text'], dataset['train']['label'], test_size=0.2
)

# Encode the data
train_dataset = encode_data(tokenizer, train_texts, train_labels)
dev_dataset = encode_data(tokenizer, dev_texts, dev_labels)
test_dataset = encode_data(tokenizer, test_texts, test_labels)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


BERT part

In [4]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = bert_model  # Pre-loaded BERT model
        self.classifier = nn.Sequential(
            nn.Linear(768, 512),  # BERT base outputs 768 features from CLS token
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        # Get the outputs from BERT model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # We use the output associated with the CLS token
        cls_output = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, 768)

        # Pass CLS token embeddings through the classifier to get final logits
        logits = self.classifier(cls_output)

        return logits


In [5]:
from transformers import BertModel

# Load pre-trained BERT
bert_model = BertModel.from_pretrained(MODEL_NAME)

# Initialize the classifier
num_classes = NUM_CLASSES[DATASET_NAME]  # Assuming 4 classes for AG News dataset
model = BertClassifier(bert_model, num_classes)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [6]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

def evaluate_model(model, data_loader, device):
    """
    Evaluate the model on a given dataset.

    Parameters:
        model (nn.Module): The model to be evaluated.
        data_loader (DataLoader): DataLoader for the dataset to evaluate on.
        device (torch.device): Device to perform computation (GPU/CPU).

    Returns:
        float: Accuracy of the model on the provided dataset.
        float: Precision, Recall, and F1-Score of the model.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predictions = torch.max(outputs, dim=1)

            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='macro')

    return accuracy, precision, recall, f1

def evaluate_on_datasets(model, dev_loader, test_loader, device):
    """
    Evaluate the model on development and test datasets.

    Parameters:
        model (nn.Module): The model to be evaluated.
        dev_loader (DataLoader): DataLoader for the development dataset.
        test_loader (DataLoader): DataLoader for the test dataset.
        device (torch.device): Device to perform computation (GPU/CPU).

    Returns:
        dict: A dictionary containing accuracy, precision, recall, and F1-score for both datasets.
    """
    print("Evaluating on development dataset:")
    dev_accuracy, dev_precision, dev_recall, dev_f1 = evaluate_model(model, dev_loader, device)
    print(f"Development Set - Accuracy: {dev_accuracy:.4f}, Precision: {dev_precision:.4f}, Recall: {dev_recall:.4f}, F1-Score: {dev_f1:.4f}")

    print("Evaluating on test dataset:")
    test_accuracy, test_precision, test_recall, test_f1 = evaluate_model(model, test_loader, device)
    print(f"Test Set - Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1-Score: {test_f1:.4f}")

    return {
        'dev': {
            'accuracy': dev_accuracy,
            'precision': dev_precision,
            'recall': dev_recall,
            'f1': dev_f1
        },
        'test': {
            'accuracy': test_accuracy,
            'precision': test_precision,
            'recall': test_recall,
            'f1': test_f1
        }
    }


In [7]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

def train_model(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    loss_fn = nn.CrossEntropyLoss()
    best_performance = {
        'dev': {'accuracy': 0},
        'test': {'accuracy': 0}
    }

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Zero the gradients on each iteration
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)

        # Compute loss between predictions and labels
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()


        total_loss += loss.item()

    #
    eval_results = evaluate_on_datasets(model, dev_loader, test_loader, device)
    print(12 * ' = ')

    if eval_results['dev']['accuracy'] > best_performance['dev']['accuracy']:
        best_performance['dev'] = eval_results['dev']
        best_performance['test'] = eval_results['test']



    print("Average loss:", total_loss / len(data_loader))
    print(best_performance)
    return best_performance



In [8]:
# Example training call
# train_model(model, train_loader, optimizer)


Active Learning part

In [9]:
import torch
import numpy as np

def sample_data(logits, k, used_indices, total_samples, method='uncertainty'):
    """
    Selects indices of k most uncertain samples based on the specified sampling method,
    excluding already used indices.

    Parameters:
        logits (torch.Tensor): Logits from the model for the unlabeled data.
        k (int): Number of samples to query.
        used_indices (set): Set of indices that have already been used.
        total_samples (int): Total number of samples in the dataset.
        method (str): Sampling method ('uncertainty', 'random', 'margin').

    Returns:
        torch.Tensor: Indices of the selected samples.
    """
    valid_indices = torch.tensor([i for i in range(total_samples) if i not in used_indices], dtype=torch.long)
    logits = logits[valid_indices]

    if method == 'random':
        indices = torch.randperm(logits.size(0))[:k]
    elif method == 'margin':
        probs = torch.softmax(logits, dim=1)
        top2_probs = torch.topk(probs, 2, dim=1).values
        margins = top2_probs[:, 0] - top2_probs[:, 1]
        indices = torch.topk(margins, k, largest=False).indices
    else:  # Default to uncertainty sampling
        probs = torch.softmax(logits, dim=1)
        log_probs = torch.log(probs + 1e-5)  # Adding a small constant to avoid log(0)
        entropy = -(probs * log_probs).sum(dim=1)
        indices = torch.topk(entropy, k).indices

    return valid_indices[indices]


def select_action(model, data_loader, dqn, device, query_budget, used_indices, sampling_method='uncertainty'):
    """
    Select actions for each sample in the dataset using the DQN, considering a query budget.
    """
    selected_indices = []
    all_rewards = []
    model.eval()
    dqn.eval()

    with torch.no_grad():
        all_logits = []
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            all_logits.append(outputs.logits)
        all_logits = torch.cat(all_logits, dim=0)

        query_indices = sample_data(all_logits, min(query_budget, len(all_logits) - len(used_indices)), used_indices, len(all_logits), method=sampling_method)

        # Update used_indices with new selections
        used_indices.update(query_indices.tolist())

        # Process selected samples
        for idx in query_indices:
            if query_budget <= 0:
                break
            selected_indices.append(idx)
            cls_embeddings = all_logits[idx].unsqueeze(0)  # Extract the CLS token embeddings for this index
            action_probs = dqn(cls_embeddings)
            action = torch.argmax(action_probs, dim=1).item()
            if action == 1:
                reward = calculate_reward(cls_embeddings, model)
                all_rewards.append(reward)
                query_budget -= 1

    return selected_indices, all_rewards, query_budget, used_indices



In [10]:
import torch
import numpy as np

def uncertainty_sampling(logits, k):
    """Selects the indices of k most uncertain samples based on entropy of predictions."""
    probs = torch.softmax(logits, dim=1)
    log_probs = torch.log(probs + 1e-5)  # Adding a small constant to avoid log(0)
    entropy = -(probs * log_probs).sum(dim=1)
    return torch.topk(entropy, k).indices


In [11]:
if not False:
  from torch.utils.data import DataLoader, Subset, random_split
  from transformers import BertTokenizer, BertModel

  # Assume data_loader for the entire dataset and model are already defined
  num_iterations = 10
  k = 10  # Number of samples to query each iteration

  # Split data into initial labeled and unlabeled sets
  initial_labeled_size = 100
  labeled_dataset, unlabeled_dataset = random_split(train_dataset, [initial_labeled_size, len(train_dataset) - initial_labeled_size])

  labeled_loader = DataLoader(labeled_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
  unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=False)

  for iteration in range(num_iterations):
      # Train model on the current labeled dataset
      train_model(model, labeled_loader, optimizer)

      # Make predictions on the unlabeled dataset to find the most uncertain samples
      model.eval()
      all_logits = []
      with torch.no_grad():
          for batch in unlabeled_loader:
              input_ids = batch['input_ids'].to(device)
              attention_mask = batch['attention_mask'].to(device)
              logits = model(input_ids, attention_mask)
              all_logits.append(logits)
      all_logits = torch.cat(all_logits, dim=0)

      # Initialization of used indices
      used_indices = set()

      # Uncertainty sampling
      query_indices = sample_data(all_logits, k, used_indices, len(all_logits), method=SAMPLING_STRATEGY)

      # Update labeled and unlabeled sets: simulate the labeling process
      newly_labeled = Subset(unlabeled_dataset, query_indices)
      labeled_dataset = torch.utils.data.ConcatDataset([labeled_dataset, newly_labeled])
      unlabeled_dataset = Subset(unlabeled_dataset, [i for i in range(len(unlabeled_dataset)) if i not in query_indices])

      # Update DataLoaders
      labeled_loader = DataLoader(labeled_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
      unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=False)

      print(f"Iteration {iteration + 1}: Model trained on an additional {k} samples")
      print(f"Using {( len(labeled_loader) / (len(labeled_loader) + len(unlabeled_loader)) ) * 100} % of data.")


Evaluating on development dataset:
Development Set - Accuracy: 0.9094, Precision: 0.9099, Recall: 0.9108, F1-Score: 0.9094
Evaluating on test dataset:
Test Set - Accuracy: 0.9129, Precision: 0.9137, Recall: 0.9131, F1-Score: 0.9129
 =  =  =  =  =  =  =  =  =  =  =  = 
Average loss: 0.42332098495215176
{'dev': {'accuracy': 0.909375, 'precision': 0.9099271026751943, 'recall': 0.9107778016191606, 'f1': 0.9093528693528694}, 'test': {'accuracy': 0.912875, 'precision': 0.9137206678254006, 'recall': 0.9130701770889281, 'f1': 0.9128523402467819}}
Iteration 1: Model trained on an additional 10 samples
Using 1.7187500000000002 % of data.
Evaluating on development dataset:
Development Set - Accuracy: 0.9175, Precision: 0.9268, Recall: 0.9141, F1-Score: 0.9164
Evaluating on test dataset:
Test Set - Accuracy: 0.9119, Precision: 0.9207, Recall: 0.9113, F1-Score: 0.9113
 =  =  =  =  =  =  =  =  =  =  =  = 
Average loss: 0.1324831052937291
{'dev': {'accuracy': 0.9175, 'precision': 0.9267792320567637, 

Active Learning + Reinforcement Learning

In [12]:
class DQN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(DQN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)  # Binary output for each sample
        )

    def forward(self, x):
        return self.net(x)



In [13]:
def select_action(model, data_loader, dqn, device, query_budget=5):
    """
    Select actions for each sample in the dataset using the DQN, considering a query budget.

    Parameters:
        model (nn.Module): The BERT-based classification model.
        data_loader (DataLoader): DataLoader for the unlabeled dataset.
        dqn (DQN): The trained DQN model to decide on actions.
        device (torch.device): Device to perform computation (GPU/CPU).
        query_budget (int): Remaining budget for querying labels.

    Returns:
        list: Indices of samples selected for labeling.
        list: Rewards accumulated for each selected sample.
        int: Updated query budget after selecting actions.
    """
    selected_indices = []
    all_rewards = []
    model.eval()
    dqn.eval()

    with torch.no_grad():
        for idx, batch in enumerate(data_loader):
            if query_budget <= 0:  # Stop querying if the budget is exhausted
                break

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Extract the CLS token embeddings

            # DQN takes the state (CLS embeddings) and outputs action probabilities
            action_probs = dqn(cls_embeddings)
            action = torch.argmax(action_probs, dim=1).item()  # Choosing the action with the highest probability

            # Append the index of the sample if the action is to label it
            if action == 1:  # Action 1 implies query the label
                if query_budget > 0:
                    selected_indices.append(idx)
                    reward = calculate_reward(cls_embeddings, model)
                    all_rewards.append(reward)
                    query_budget -= 1  # Decrement the budget

    return selected_indices, all_rewards

def calculate_reward(cls_embeddings, model):
    """
    Calculate the reward for labeling a sample based on model performance improvement and query cost.

    Parameters:
        cls_embeddings (torch.Tensor): CLS token embeddings from BERT model.
        model (nn.Module): The BERT-based classification model.

    Returns:
        float: Computed reward for the action taken.
    """
    performance_improvement = simulate_model_improvement(cls_embeddings, model)
    query_cost = 0.1  # Assuming a fixed cost for querying
    return performance_improvement - query_cost

def simulate_model_improvement(cls_embeddings, model):
    """
    Simulate an improvement in model performance to calculate the reward.
    This is a placeholder function and should be implemented based on actual model evaluations.

    Returns:
        float: Simulated improvement value.
    """
    return 0.05  # Example fixed improvement for demonstration


In [14]:
import torch
from torch.utils.data import DataLoader, Subset, random_split
import torch.optim as optim
import torch.nn as nn
from collections import deque
import numpy as np
batch_size = 1
# Assuming data_loader for the entire dataset and model are already defined
num_iterations = 10
k = 10  # Number of samples to query each iteration

# Split data into initial labeled and unlabeled sets
initial_labeled_size = 100
labeled_dataset, unlabeled_dataset = random_split(train_dataset, [initial_labeled_size, len(train_dataset) - initial_labeled_size])

labeled_loader = DataLoader(labeled_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=False)

# Parameters
input_dim = 768  # Assuming using BERT's CLS token embedding as state
hidden_dim = 256

# Initialize DQN
dqn = DQN(input_dim, hidden_dim).to(device)
optimizer_dqn = optim.Adam(dqn.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Experience replay buffer
replay_buffer = deque(maxlen=10000)

# Active learning loop
for iteration in range(num_iterations):
    # Train model on the current labeled dataset
    train_model(model, labeled_loader, optimizer)

    # Evaluate model on the unlabeled dataset and select actions
    new_indices, rewards = select_action(bert_model, unlabeled_loader, dqn, device, query_budget=k)
    print(f"Iteration {iteration + 1}: Model trained on newly labeled samples. Total rewards: {sum(rewards)}")
    print(f"Using {( len(labeled_loader) / (len(labeled_loader) + len(unlabeled_loader)) ) * 100} % of data.")

    # Update labeled and unlabeled sets based on the actions taken
    if new_indices:
        newly_labeled = Subset(unlabeled_dataset, new_indices)
        remaining_indices = [i for i in range(len(unlabeled_dataset)) if i not in new_indices]
        unlabeled_dataset = Subset(unlabeled_dataset, remaining_indices)
        labeled_dataset = torch.utils.data.ConcatDataset([labeled_dataset, newly_labeled])

        # Update DataLoaders for the next iteration
        labeled_loader = DataLoader(labeled_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
        unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=False)

    # Optional: Update DQN based on the collected experience
    if len(replay_buffer) >= batch_size:
        train_dqn(dqn, replay_buffer, optimizer_dqn, batch_size)


Evaluating on development dataset:
Development Set - Accuracy: 0.8988, Precision: 0.9018, Recall: 0.9012, F1-Score: 0.8987
Evaluating on test dataset:
Test Set - Accuracy: 0.9051, Precision: 0.9088, Recall: 0.9055, F1-Score: 0.9050
 =  =  =  =  =  =  =  =  =  =  =  = 
Average loss: 0.2772319319087546
{'dev': {'accuracy': 0.89875, 'precision': 0.9017501595830608, 'recall': 0.9011964371576826, 'f1': 0.8987460447673736}, 'test': {'accuracy': 0.905125, 'precision': 0.9088075914910777, 'recall': 0.9055238509671064, 'f1': 0.9049662208285465}}
Iteration 1: Model trained on newly labeled samples. Total rewards: -0.49999999999999994
Using 1.5625 % of data.
Evaluating on development dataset:
Development Set - Accuracy: 0.9287, Precision: 0.9347, Recall: 0.9261, F1-Score: 0.9280
Evaluating on test dataset:
Test Set - Accuracy: 0.9280, Precision: 0.9329, Recall: 0.9276, F1-Score: 0.9277
 =  =  =  =  =  =  =  =  =  =  =  = 
Average loss: 0.020294637626714327
{'dev': {'accuracy': 0.92875, 'precision

In [15]:
cnt = 0
for i in labeled_dataset:
    cnt += 1

In [16]:
cnt

200