In [2]:
import logging
from dataclasses import dataclass
from pathlib import Path

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from adaptive_al_v2.strategies import FineTuneStrategy
from adaptive_al_v2.samplers import RandomSampler
from adaptive_al_v2.config import ExperimentConfig

from adaptive_al_v2 import ActiveLearning

from adaptive_al_v2.utils.data_loader import load_agnews
from adaptive_al_v2.utils.text_datasets import SimpleTextDataset
from adaptive_al_v2.utils.text_classifiers import SimpleTextClassifier # Dummy class for testing

## Setting up the model, optimizer, criterion, and scheduler

In [3]:
# Seed PyTorch before any weights are created so the classifier's random
# initialisation is repeatable across "Restart Kernel -> Run All". 42 is the
# same seed that is passed to ExperimentConfig and RandomSampler further down.
torch.manual_seed(42)

# Small classifier used to exercise the active-learning pipeline.
model = SimpleTextClassifier(
    hidden_dim=32,
    num_classes=4
)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the optimizer (and therefore the scheduler) is bound to the
# parameters of *this* `model` instance, while the experiment config is given
# `model_class` / `model_kwargs` rather than the instance. Confirm that the
# training strategy really optimises the model these parameters belong to.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Scale the learning rate by 0.1 after every 10 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

print(f"Using device: {device}")
print(f"Model: {model}")

Using device: cuda
Model: SimpleTextClassifier(
  (fc1): Linear(in_features=3, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=4, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


## Loading the data (TODO: consider abstracting this into a helper)

In [4]:
# Fetch the train / validation / test frames from the local `data` directory.
df_train, df_val, df_test = load_agnews(path='data')

# Re-index all three splits the same way (fresh 0..n-1 index, old index
# dropped) so none of them carries a leftover index from the split step.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Create datasets: each one wraps the `text` and `label` columns as plain lists.
train_dataset = SimpleTextDataset(
    texts=df_train['text'].tolist(),
    labels=df_train['label'].tolist()
)

val_dataset = SimpleTextDataset(
    texts=df_val['text'].tolist(),
    labels=df_val['label'].tolist()
)

test_dataset = SimpleTextDataset(
    texts=df_test['text'].tolist(),
    labels=df_test['label'].tolist()
)

## Creating the active learning configuration

In [5]:
# Training strategy handed to the experiment. It reuses the optimizer,
# criterion, scheduler and device created in the model-setup cell.
# NOTE(review): that optimizer was built from the parameters of the `model`
# instance created earlier, whereas the config below only receives
# `model_class` / `model_kwargs` - confirm both refer to the same weights.
fine_tune_strategy = FineTuneStrategy(
    optimizer=optimizer,
    criterion=criterion,
    scheduler=scheduler,
    device=device,
    epochs=3,
    batch_size=32,
)

# Sampler used to pick the next indices to label; seeded like the experiment.
random_sampler = RandomSampler(seed=42)

cfg = ExperimentConfig(
    seed=42,
    experiment_name="dummy test!!!",
    save_dir=Path("./experiments"),
    # Active learning settings
    initial_pool_size=100,
    batch_size=32,
    # Model configuration
    model_class=SimpleTextClassifier,
    model_kwargs={"hidden_dim": 32, "num_classes": 4},
    # Data splits built in the data-loading cell
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    # Strategy and sampler instances
    strategy=fine_tune_strategy,
    sampler=random_sampler,
)

In [6]:
# Echo the objects stored on the config so the wiring can be checked by eye.
for cfg_label, cfg_value in (
    ("Strategy", cfg.strategy),
    ("Sampler", cfg.sampler),
    ("Model class", cfg.model_class),
):
    print(f"{cfg_label}: {cfg_value}")

Strategy: <adaptive_al_v2.strategies.fine_tuning_strategy.FineTuneStrategy object at 0x0000021066A1F550>
Sampler: <adaptive_al_v2.samplers.random_sampler.RandomSampler object at 0x0000021064BF8910>
Model class: <class 'adaptive_al_v2.utils.text_classifiers.SimpleTextClassifier'>


In [7]:
# Build the active-learning driver from the experiment configuration.
# In the recorded run the pool reports 100 labeled samples right after this
# call, which matches `initial_pool_size=100` in the config.
al = ActiveLearning(cfg)

In [8]:
# Report how many examples each split exposed on the ActiveLearning object holds.
print("Dataset sizes:")
for split_label, split_dataset in (
    ("Train", al.train_dataset),
    ("Val", al.val_dataset),
    ("Test", al.test_dataset),
):
    print(f"{split_label}: {len(split_dataset)}")

Dataset sizes:
Train: 108000
Val: 12000
Test: 7600


In [9]:
# Snapshot the pool counts before any training round has been run.
# The recorded output shows the keys labeled_count / unlabeled_count / total_count.
pool_stats = al.pool.get_pool_stats()
print(f"Initial pool stats: {pool_stats}")

Initial pool stats: {'labeled_count': 100, 'unlabeled_count': 107900, 'total_count': 108000}


In [10]:
# Test dataset access: each item is indexed as (features, label).
sample = al.train_dataset[0]
print(f"Dataset sample: {sample[0][:50]}... | Label: {sample[1]}")

# Test pool subsets
labeled_subset = al.pool.get_labeled_subset()
unlabeled_subset = al.pool.get_unlabeled_subset()
print(f"Pool subsets - Labeled: {len(labeled_subset)}, Unlabeled: {len(unlabeled_subset)}")

# Test DataLoader (DataLoader is imported with the other imports in the first cell).
# A dedicated, seeded generator makes the displayed batch repeatable instead of
# depending on whatever state the global torch RNG happens to be in.
dataloader = DataLoader(
    labeled_subset,
    batch_size=8,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

# Only the first batch is inspected; an empty loader simply prints nothing.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    inputs, targets = first_batch
    print(f"Batch 1: {len(inputs)} inputs, targets shape: {targets.shape}")
    print(f"Sample input: {inputs[0][:30]}...")
    print(f"Sample targets: {targets[:3].tolist()}")

Dataset sample: tensor([283.,  54., 218.])... | Label: 1
Pool subsets - Labeled: 100, Unlabeled: 107900
Batch 1: 8 inputs, targets shape: torch.Size([8])
Sample input: tensor([185.,  35., 138.])...
Sample targets: [2, 2, 0]


In [11]:
# First round: fit on the initial labeled pool only, so no newly sampled
# indices are handed to the trainer.
round_stats = al.train_one_round(new_indices=None)
print("Initial training completed!")
print(f"Round stats: {round_stats}")

Initial training completed!
Round stats: {'training_time': 0.13252997398376465, 'avg_loss': 41.456451098124184, 'epochs': 3, 'total_samples': 100, 'new_samples': 0, 'f1_score': 0.7291267979503492, 'pool_stats': {'labeled_count': 100, 'unlabeled_count': 107900, 'total_count': 108000}}


In [12]:
# Ask the sampler for the next 20 indices to label.
new_indices = al.sample_next_batch(batch_size=20)
print("Sampling completed!")
print(f"Sampled {len(new_indices)} new indices: {new_indices[:5]}...")

# Sampling alone does not move anything into the labeled pool: in the recorded
# run the counts printed here were identical to the initial ones and only
# changed after train_one_round(new_indices=...) was called. The label
# therefore says "after sampling" rather than "updated".
updated_stats = al.pool.get_pool_stats()
print(f"Pool stats after sampling: {updated_stats}")



Sampling completed!
Sampled 20 new indices: [21441, 60650, 49788, 35420, 83965]...
Updated pool stats: {'labeled_count': 100, 'unlabeled_count': 107900, 'total_count': 108000}


In [13]:
# Second round: pass the freshly sampled indices to the trainer. In the
# recorded run this is the call after which labeled_count grew from 100 to 120.
# NOTE(review): the recorded F1 fell from ~0.73 to ~0.07 in this round while
# avg_loss barely moved - worth investigating before trusting these numbers.
round_stats = al.train_one_round(new_indices=new_indices)
print("Training with new samples completed!")
print(f"Round stats: {round_stats}")

Training with new samples completed!
Round stats: {'training_time': 0.016015291213989258, 'avg_loss': 41.140060106913246, 'epochs': 3, 'total_samples': 120, 'new_samples': 20, 'f1_score': 0.06618860521154724, 'pool_stats': {'labeled_count': 120, 'unlabeled_count': 107880, 'total_count': 108000}}


## Running the active learning rounds (TODO: consider abstracting this loop further)

In [14]:
num_rounds = 3
# Two rounds were already run in the cells above, so numbering resumes at 3.
first_round = 3

for round_num in range(num_rounds):
    # Continue from previous rounds
    print(f"\n--- Round {first_round + round_num}")

    # Draw the next 15 candidates from the unlabeled pool.
    new_indices = al.sample_next_batch(batch_size=15)
    sampled_count = len(new_indices)
    print(f"Sampled {sampled_count} new samples")

    # An empty draw is treated as the unlabeled pool being exhausted.
    if sampled_count == 0:
        print("No more unlabeled data!")
        break

    # Train on the enlarged labeled set and report the headline numbers.
    round_stats = al.train_one_round(new_indices=new_indices)
    print(f"F1 Score: {round_stats['f1_score']:.4f}")
    print(f"Training Time: {round_stats['training_time']:.2f}s")
    print(f"Pool Stats: {round_stats['pool_stats']}")

print("\nMultiple rounds completed successfully!")




--- Round 3
Sampled 15 new samples
F1 Score: 0.5390
Training Time: 0.02s
Pool Stats: {'labeled_count': 135, 'unlabeled_count': 107865, 'total_count': 108000}

--- Round 4
Sampled 15 new samples
F1 Score: 0.6274
Training Time: 0.02s
Pool Stats: {'labeled_count': 150, 'unlabeled_count': 107850, 'total_count': 108000}

--- Round 5
Sampled 15 new samples
F1 Score: 0.1146
Training Time: 0.02s
Pool Stats: {'labeled_count': 165, 'unlabeled_count': 107835, 'total_count': 108000}

Multiple rounds completed successfully!


In [15]:
# Collect the end-of-experiment summary and print its headline fields.
summary = al.get_experiment_summary()
print("Experiment summary generated!")

total_rounds = summary['total_rounds']
print(f"Total rounds: {total_rounds}")

final_f1 = summary['final_f1']
print(f"Final F1: {final_f1:.4f}")

final_pool_stats = summary['final_pool_stats']
print(f"Final pool stats: {final_pool_stats}")

Experiment summary generated!
Total rounds: 5
Final F1: 0.1146
Final pool stats: {'labeled_count': 165, 'unlabeled_count': 107835, 'total_count': 108000}


In [16]:
# Save the experiment via the ActiveLearning object.
# NOTE(review): presumably this writes under cfg.save_dir ("./experiments") - confirm.
al.save_experiment()