# Intent and Slot Filling with Curriculum Learning

This notebook demonstrates implementing a joint intent classification and slot filling model for natural language understanding (NLU) using curriculum learning.

## What is Curriculum Learning?

Curriculum learning is a training strategy inspired by the way humans learn, where models are trained on examples of gradually increasing difficulty. The key ideas are:

1. **Start Simple**: Begin with easier examples that have clear patterns
2. **Progressive Difficulty**: Gradually introduce more complex examples
3. **Competence-Based Progression**: Advance to harder examples once the model masters simpler ones

## Benefits of Curriculum Learning

- **Faster Convergence**: Often leads to faster training, especially in the early stages
- **Better Generalization**: Can help models generalize better by building foundational knowledge first
- **Avoiding Local Minima**: May help avoid poor local minima by guiding optimization

## Implementation Approaches

In this notebook, we implement curriculum learning for joint intent and slot prediction in several ways:

1. **Static Curriculum**: Pre-define difficulty based on features like sentence length, number of slots, etc.
2. **Competence-Based Curriculum**: Dynamically adjust the curriculum based on model performance
3. **Alternative Difficulty Metrics**: Explore different ways to measure example difficulty

We compare these approaches with standard training to demonstrate the benefits of curriculum learning.

In [ ]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizerFast, BertModel
from collections import defaultdict
import json

# --------- 1. Expanded Training Data ---------
# Your prompts split into words with slot labels and intent

data = [
    # add_expense
    {"words": ["Add", "expense", "of", "$20", "to", "group", "Travel", "Friends"], 
     "slots": ["O", "O", "O", "B-amount", "O", "O", "B-group", "I-group"], "intent": "add_expense"},
    {"words": ["Add", "rent", "of", "$1200", "to", "House", "Bills", "group"], 
     "slots": ["O", "O", "O", "B-amount", "O", "B-group", "I-group", "O"], "intent": "add_expense"},
    {"words": ["Add", "$95", "groceries", "to", "House", "Bills", "group"], 
     "slots": ["O", "B-amount", "O", "O", "B-group", "I-group", "O"], "intent": "add_expense"},
    {"words": ["Add", "a", "$350", "software", "charge", "to", "Startup", "Budget", "group"], 
     "slots": ["O", "O", "B-amount", "O", "O", "O", "B-group", "I-group", "O"], "intent": "add_expense"},
    {"words": ["Log", "$60", "Spotify", "family", "plan", "to", "Monthly", "Subscriptions"], 
     "slots": ["O", "B-amount", "B-service", "I-service", "I-service", "O", "B-group", "I-group"], "intent": "add_expense"},
    {"words": ["Add", "equipment", "bill", "of", "$480", "to", "Band", "Practice", "group"], 
     "slots": ["O", "O", "O", "O", "B-amount", "O", "B-group", "I-group", "O"], "intent": "add_expense"},
    {"words": ["Add", "a", "dinner", "bill", "of", "$240", "to", "Tahoe", "trip", "group"], 
     "slots": ["O", "O", "O", "O", "O", "B-amount", "O", "B-group", "I-group", "O"], "intent": "add_expense"},
    {"words": ["Record", "a", "payment", "of", "$75", "for", "internet", "to", "Utilities", "group"], 
     "slots": ["O", "O", "O", "O", "B-amount", "O", "B-service", "O", "B-group", "O"], "intent": "add_expense"},
    {"words": ["Add", "$122.50", "restaurant", "bill", "to", "Weekend", "Fun", "group"], 
     "slots": ["O", "B-amount", "O", "O", "O", "B-group", "I-group", "O"], "intent": "add_expense"},
    {"words": ["Log", "movie", "tickets", "expense", "of", "$42", "to", "Entertainment", "group"], 
     "slots": ["O", "B-service", "I-service", "O", "O", "B-amount", "O", "B-group", "O"], "intent": "add_expense"},
    {"words": ["Add", "my", "share", "of", "$220", "for", "the", "concert", "to", "Music", "Expenses"], 
     "slots": ["O", "O", "O", "O", "B-amount", "O", "O", "B-service", "O", "B-group", "I-group"], "intent": "add_expense"},
    {"words": ["Record", "$18.75", "for", "coffee", "in", "Office", "Supplies", "group"], 
     "slots": ["O", "B-amount", "O", "B-service", "O", "B-group", "I-group", "O"], "intent": "add_expense"},
    {"words": ["Add", "expense", "of", "$550", "for", "hotel", "booking", "to", "Vacation", "2023"], 
     "slots": ["O", "O", "O", "B-amount", "O", "B-service", "I-service", "O", "B-group", "I-group"], "intent": "add_expense"},
    
    # add_member
    {"words": ["Add", "Sarah", "to", "Startup", "Budget", "group"], 
     "slots": ["O", "B-person", "O", "B-group", "I-group", "O"], "intent": "add_member"},
    {"words": ["Add", "Charlie", "to", "group", "Travel", "Friends"], 
     "slots": ["O", "B-person", "O", "O", "B-group", "I-group"], "intent": "add_member"},
    {"words": ["Add", "Mike", "to", "the", "House", "Bills", "group"], 
     "slots": ["O", "B-person", "O", "O", "B-group", "I-group", "O"], "intent": "add_member"},
    {"words": ["Add", "multiple", "people", ":", "Josh", ",", "Amy", ",", "and", "Chris", "to", "Tahoe", "Trip"], 
     "slots": ["O", "O", "O", "O", "B-person", "O", "B-person", "O", "O", "B-person", "O", "B-group", "I-group"], "intent": "add_member"},
    {"words": ["Include", "Samantha", "in", "the", "Office", "Lunch", "group"], 
     "slots": ["O", "B-person", "O", "O", "B-group", "I-group", "O"], "intent": "add_member"},
    {"words": ["Add", "both", "David", "and", "Emily", "to", "Weekend", "Fun"], 
     "slots": ["O", "O", "B-person", "O", "B-person", "O", "B-group", "I-group"], "intent": "add_member"},
    {"words": ["Include", "my", "roommate", "Jennifer", "in", "Utilities", "group"], 
     "slots": ["O", "O", "O", "B-person", "O", "B-group", "O"], "intent": "add_member"},
    {"words": ["Add", "new", "team", "member", "Richard", "to", "Project", "Alpha", "expenses"], 
     "slots": ["O", "O", "O", "O", "B-person", "O", "B-group", "I-group", "O"], "intent": "add_member"},
    {"words": ["Include", "Robert", ",", "Sandra", "and", "Tim", "in", "the", "Hiking", "Club"], 
     "slots": ["O", "B-person", "O", "B-person", "O", "B-person", "O", "O", "B-group", "I-group"], "intent": "add_member"},
    
    # check_balance
    {"words": ["What's", "my", "share", "of", "rent", "in", "House", "Bills", "?"], 
     "slots": ["O", "O", "O", "O", "O", "O", "B-group", "I-group", "O"], "intent": "check_balance"},
    {"words": ["Do", "I", "owe", "the", "group", "money", "?"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O"], "intent": "check_balance"},
    {"words": ["What's", "my", "total", "due", "across", "all", "groups", "?"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O", "O"], "intent": "check_balance"},
    {"words": ["Do", "I", "owe", "anyone", "in", "the", "Startup", "Budget", "group", "?"], 
     "slots": ["O", "O", "O", "O", "O", "O", "B-group", "I-group", "O", "O"], "intent": "check_balance"},
    {"words": ["How", "much", "do", "I", "owe", "in", "the", "group", "for", "my", "band", "practice", "?"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-group", "I-group", "O"], "intent": "check_balance"},
    {"words": ["Am", "I", "owed", "money", "from", "groups", "?"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O"], "intent": "check_balance"},
    {"words": ["Show", "me", "my", "balance", "in", "Vacation", "2023", "group"], 
     "slots": ["O", "O", "O", "O", "O", "B-group", "I-group", "O"], "intent": "check_balance"},
    {"words": ["How", "much", "does", "Sarah", "owe", "me", "in", "House", "Bills", "?"], 
     "slots": ["O", "O", "O", "B-person", "O", "O", "O", "B-group", "I-group", "O"], "intent": "check_balance"},
    {"words": ["What's", "the", "current", "balance", "of", "Weekend", "Fun", "group", "?"], 
     "slots": ["O", "O", "O", "O", "O", "B-group", "I-group", "O", "O"], "intent": "check_balance"},
    {"words": ["Check", "if", "I", "owe", "anything", "for", "the", "Office", "Lunch", "expenses"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O", "B-group", "I-group", "O"], "intent": "check_balance"},
    {"words": ["Does", "anyone", "owe", "me", "money", "in", "the", "Music", "Expenses", "group", "?"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O", "B-group", "I-group", "O", "O"], "intent": "check_balance"},
    {"words": ["What's", "my", "balance", "with", "Chris", "across", "all", "shared", "groups", "?"], 
     "slots": ["O", "O", "O", "O", "B-person", "O", "O", "O", "O", "O"], "intent": "check_balance"},
    
    # create_group
    {"words": ["Create", "a", "group", "called", "Travel", "Friends", "with", "Bob", "and", "Alice"], 
     "slots": ["O", "O", "O", "O", "B-group", "I-group", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Create", "a", "new", "group", "Monthly", "Subscriptions", "with", "Alex", "and", "Ben"], 
     "slots": ["O", "O", "O", "O", "B-group", "I-group", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Start", "a", "group", "called", "Startup", "Budget", "with", "John", "and", "Emma"], 
     "slots": ["O", "O", "O", "O", "B-group", "I-group", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Make", "a", "group", "for", "House", "Bills", "with", "Lisa", "and", "Tom"], 
     "slots": ["O", "O", "O", "O", "B-group", "I-group", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Create", "a", "new", "expense", "group", "called", "Office", "Lunch", "with", "coworkers", "Jim", "and", "Pam"], 
     "slots": ["O", "O", "O", "O", "O", "O", "B-group", "I-group", "O", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Set", "up", "a", "group", "named", "Weekend", "Fun", "with", "my", "friends", "Kate", "and", "Mark"], 
     "slots": ["O", "O", "O", "O", "O", "B-group", "I-group", "O", "O", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Make", "a", "new", "group", "for", "our", "Vacation", "2023", "with", "Rachel", ",", "Ross", "and", "Joey"], 
     "slots": ["O", "O", "O", "O", "O", "O", "B-group", "I-group", "O", "B-person", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Create", "Utilities", "group", "with", "my", "roommates", "Sam", "and", "Jessica"], 
     "slots": ["O", "B-group", "O", "O", "O", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Start", "a", "Music", "Expenses", "group", "with", "bandmates", "Dave", ",", "Steve", "and", "Michael"], 
     "slots": ["O", "O", "B-group", "I-group", "O", "O", "O", "B-person", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    {"words": ["Create", "a", "Hiking", "Club", "expense", "tracker", "with", "Laura", "and", "Daniel"], 
     "slots": ["O", "O", "B-group", "I-group", "O", "O", "O", "B-person", "O", "B-person"], "intent": "create_group"},
    
    # group_summary
    {"words": ["Show", "breakdown", "for", "Weekend", "Warriors", "group"], 
     "slots": ["O", "O", "O", "B-group", "I-group", "O"], "intent": "group_summary"},
    {"words": ["Show", "me", "a", "summary", "of", "the", "group", "for", "my", "band", "practice"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "B-group", "I-group"], "intent": "group_summary"},
    {"words": ["Show", "full", "activity", "of", "Startup", "Budget", "group"], 
     "slots": ["O", "O", "O", "O", "B-group", "I-group", "O"], "intent": "group_summary"},
    {"words": ["Summary", "of", "all", "groups", "with", "pending", "balances"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O"], "intent": "group_summary"},
    {"words": ["Show", "transaction", "history", "for", "House", "Bills"], 
     "slots": ["O", "O", "O", "O", "B-group", "I-group"], "intent": "group_summary"},
    {"words": ["Give", "me", "a", "detailed", "breakdown", "of", "Vacation", "2023", "expenses"], 
     "slots": ["O", "O", "O", "O", "O", "O", "B-group", "I-group", "O"], "intent": "group_summary"},
    {"words": ["Show", "all", "transactions", "in", "the", "Office", "Lunch", "group"], 
     "slots": ["O", "O", "O", "O", "O", "B-group", "I-group", "O"], "intent": "group_summary"},
    {"words": ["Summarize", "spending", "in", "Weekend", "Fun", "group", "by", "category"], 
     "slots": ["O", "O", "O", "B-group", "I-group", "O", "O", "O"], "intent": "group_summary"},
    {"words": ["Show", "expense", "breakdown", "for", "Music", "Expenses", "by", "month"], 
     "slots": ["O", "O", "O", "O", "B-group", "I-group", "O", "O"], "intent": "group_summary"},
    {"words": ["I", "need", "a", "summary", "of", "Hiking", "Club", "expenses", "from", "last", "month"], 
     "slots": ["O", "O", "O", "O", "O", "B-group", "I-group", "O", "O", "O", "O"], "intent": "group_summary"},
    {"words": ["Let", "me", "see", "all", "payments", "in", "Utilities", "group", "since", "January"], 
     "slots": ["O", "O", "O", "O", "O", "O", "B-group", "O", "O", "O"], "intent": "group_summary"},
    
    # hide_groups
    {"words": ["Hide", "all", "my", "groups", "with", "zero", "balance"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O"], "intent": "hide_groups"},
    {"words": ["Hide", "inactive", "groups", "from", "my", "dashboard"], 
     "slots": ["O", "O", "O", "O", "O", "O"], "intent": "hide_groups"},
    {"words": ["Don't", "show", "settled", "groups", "in", "my", "list"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O"], "intent": "hide_groups"},
    {"words": ["Hide", "all", "groups", "that", "are", "fully", "paid"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O"], "intent": "hide_groups"},
    {"words": ["Remove", "groups", "with", "no", "activity", "for", "over", "3", "months"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O", "O", "O"], "intent": "hide_groups"},
    {"words": ["Hide", "completed", "group", "expenses", "from", "view"], 
     "slots": ["O", "O", "O", "O", "O", "O"], "intent": "hide_groups"},
    {"words": ["Don't", "display", "archived", "groups", "in", "my", "feed"], 
     "slots": ["O", "O", "O", "O", "O", "O", "O"], "intent": "hide_groups"},
]

In [7]:
# --------- 2. Label Mappings ---------
intents = list(set(item["intent"] for item in data))
intent2id = {intent: idx for idx, intent in enumerate(intents)}
id2intent = {v: k for k, v in intent2id.items()}

slot_labels = set()
for item in data:
    slot_labels.update(item["slots"])
slot_labels = sorted(slot_labels)
slot_label2id = {label: idx for idx, label in enumerate(slot_labels)}
id2slot_label = {v: k for k, v in slot_label2id.items()}

print("Intents:", intent2id)
print("Slots:", slot_label2id)

Intents: {'hide_groups': 0, 'add_member': 1, 'check_balance': 2, 'create_group': 3, 'group_summary': 4, 'add_expense': 5}
Slots: {'B-amount': 0, 'B-group': 1, 'B-person': 2, 'B-service': 3, 'I-group': 4, 'I-service': 5, 'O': 6}


In [8]:
# --------- 3. Tokenizer ---------
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [9]:
# --------- 4. Dataset ---------
def tokenize_and_align_labels(texts, slot_labels_list):
    encodings = tokenizer(texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
    all_labels = []
    for i, labels in enumerate(slot_labels_list):
        word_ids = encodings.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(slot_label2id.get(labels[word_idx], slot_label2id["O"]))
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)
    return encodings, all_labels

class JointDataset(Dataset):
    def __init__(self, data):
        self.texts = [item["words"] for item in data]
        self.slots = [item["slots"] for item in data]
        self.intents = [intent2id[item["intent"]] for item in data]
        self.encodings, self.slot_labels = tokenize_and_align_labels(self.texts, self.slots)
    def __len__(self):
        return len(self.intents)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key != "offset_mapping"}
        item['labels'] = torch.tensor(self.slot_labels[idx])
        item['intent_label'] = torch.tensor(self.intents[idx])
        return item

dataset = JointDataset(data)

In [ ]:
# --------- 5. Model ---------
class JointBERT(nn.Module):
    def __init__(self, num_intents, num_slots):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        hidden_size = self.bert.config.hidden_size
        self.intent_classifier = nn.Linear(hidden_size, num_intents)
        self.slot_classifier = nn.Linear(hidden_size, num_slots)
    def forward(self, input_ids, attention_mask, labels=None, intent_label=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        pooled_output = outputs.pooler_output
        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)
        loss = None
        if labels is not None and intent_label is not None:
            loss_fct = nn.CrossEntropyLoss()
            slot_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            intent_loss = loss_fct(intent_logits, intent_label)
            slot_loss = slot_loss_fct(slot_logits.view(-1, slot_logits.shape[-1]), labels.view(-1))
            loss = intent_loss + slot_loss
        return loss, intent_logits, slot_logits

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = JointBERT(num_intents=len(intent2id), num_slots=len(slot_label2id)).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define difficulty measurement for curriculum learning
def calculate_example_difficulty(example):
    """
    Calculate difficulty score for an example based on multiple factors:
    1. Input length - longer inputs are generally harder to process
    2. Number of slots - more slots means more complexity
    3. Entity density - more entities per word increases difficulty
    4. Multiple entities of same type - harder to disambiguate
    5. Intent complexity - some intents are inherently more complex
    """
    # Factor 1: Input length
    input_length = len(example["words"])
    length_score = min(1.0, input_length / 15)  # Normalize by max expected length
    
    # Factor 2: Number of non-"O" slots
    slot_count = sum(1 for slot in example["slots"] if slot != "O")
    slot_score = min(1.0, slot_count / 8)  # Normalize by max expected slots
    
    # Factor 3: Entity density
    if input_length > 0:
        entity_density = slot_count / input_length
    else:
        entity_density = 0
    density_score = min(1.0, entity_density * 3)  # Scale appropriately
    
    # Factor 4: Multiple entities of same type
    entity_types = {}
    current_entity = None
    for slot in example["slots"]:
        if slot.startswith("B-"):
            entity_type = slot[2:]
            entity_types[entity_type] = entity_types.get(entity_type, 0) + 1
    
    duplicate_entity_score = sum(min(1.0, (count - 1) * 0.5) for count in entity_types.values())
    
    # Factor 5: Intent complexity (based on domain knowledge)
    intent_complexity = {
        "hide_groups": 0.3,      # Simple command
        "check_balance": 0.4,    # Simple query
        "group_summary": 0.5,    # Information retrieval
        "create_group": 0.7,     # Creation with multiple parameters
        "add_member": 0.8,       # Modification with entity reference
        "add_expense": 1.0       # Complex with amount and multiple references
    }
    intent_score = intent_complexity.get(example["intent"], 0.5)
    
    # Combine all factors with appropriate weights
    weights = {
        "length": 0.15,
        "slots": 0.25,
        "density": 0.2,
        "duplicates": 0.15,
        "intent": 0.25
    }
    
    difficulty = (
        weights["length"] * length_score +
        weights["slots"] * slot_score +
        weights["density"] * density_score +
        weights["duplicates"] * duplicate_entity_score +
        weights["intent"] * intent_score
    )
    
    return difficulty

train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

In [ ]:
# --------- 6. Training with Curriculum Learning ---------
import matplotlib.pyplot as plt
import numpy as np

# Sort training examples by difficulty
example_difficulties = [(i, calculate_example_difficulty(data[i])) for i in range(len(data))]
example_difficulties.sort(key=lambda x: x[1])  # Sort by difficulty

# Visualize the difficulty distribution
difficulties = [diff for _, diff in example_difficulties]
plt.figure(figsize=(10, 5))
plt.hist(difficulties, bins=10, alpha=0.7)
plt.title('Distribution of Example Difficulties')
plt.xlabel('Difficulty Score')
plt.ylabel('Number of Examples')
plt.axvline(x=np.percentile(difficulties, 33), color='r', linestyle='--', alpha=0.7, label='33rd percentile')
plt.axvline(x=np.percentile(difficulties, 66), color='g', linestyle='--', alpha=0.7, label='66th percentile')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

print(f"Difficulty statistics:")
print(f"  Min: {min(difficulties):.2f}")
print(f"  Max: {max(difficulties):.2f}")
print(f"  Mean: {np.mean(difficulties):.2f}")
print(f"  Median: {np.median(difficulties):.2f}")

# Create curriculum stages
num_stages = 3
curriculum_stages = []

# First stage: easy examples (bottom 33%)
easy_threshold = np.percentile(difficulties, 33)
easy_indices = [idx for idx, diff in example_difficulties if diff <= easy_threshold]
curriculum_stages.append([data[i] for i in easy_indices])

# Second stage: easy + medium examples (bottom 66%)
medium_threshold = np.percentile(difficulties, 66)
medium_indices = [idx for idx, diff in example_difficulties if diff <= medium_threshold]
curriculum_stages.append([data[i] for i in medium_indices])

# Third stage: all examples
curriculum_stages.append(data)

print(f"Curriculum stages created with {[len(stage) for stage in curriculum_stages]} examples per stage")

# Create validation set (20% of data)
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
val_dataset = JointDataset(val_data)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Function to evaluate model
def evaluate_model(model, data_loader):
    model.eval()
    intent_correct = 0
    slot_correct = 0
    slot_total = 0
    total_examples = 0
    total_loss = 0
    
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss, intent_logits, slot_logits = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
                intent_label=batch["intent_label"]
            )
            
            # Intent accuracy
            intent_preds = torch.argmax(intent_logits, dim=1)
            intent_correct += (intent_preds == batch["intent_label"]).sum().item()
            
            # Slot accuracy (ignoring padding tokens)
            slot_preds = torch.argmax(slot_logits, dim=2)
            active_slots = batch["labels"] != -100
            slot_correct += ((slot_preds == batch["labels"]) & active_slots).sum().item()
            slot_total += active_slots.sum().item()
            
            total_examples += batch["intent_label"].size(0)
            total_loss += loss.item() * batch["intent_label"].size(0)
    
    intent_accuracy = intent_correct / total_examples if total_examples > 0 else 0
    slot_accuracy = slot_correct / slot_total if slot_total > 0 else 0
    avg_loss = total_loss / total_examples if total_examples > 0 else float('inf')
    
    return {
        "loss": avg_loss,
        "intent_accuracy": intent_accuracy,
        "slot_accuracy": slot_accuracy,
        "joint_accuracy": (intent_accuracy + slot_accuracy) / 2
    }

# Training with curriculum learning
model.train()
total_epochs = 9  # 3 stages × 3 epochs per stage
epochs_per_stage = 3
metrics_history = {
    "loss": [],
    "intent_accuracy": [],
    "slot_accuracy": [],
    "joint_accuracy": []
}

print("\n--- Starting Curriculum Learning Training ---")

for stage in range(num_stages):
    print(f"\n--- Stage {stage+1}/{num_stages} - {len(curriculum_stages[stage])} examples ---")
    stage_data = curriculum_stages[stage]
    stage_dataset = JointDataset(stage_data)
    stage_loader = DataLoader(stage_dataset, batch_size=4, shuffle=True)
    
    for epoch in range(epochs_per_stage):
        # Training epoch
        model.train()
        total_loss = 0
        for batch in stage_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            loss, _, _ = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
                intent_label=batch["intent_label"]
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Evaluation
        model.eval()
        val_metrics = evaluate_model(model, val_loader)
        metrics_history["loss"].append(val_metrics["loss"])
        metrics_history["intent_accuracy"].append(val_metrics["intent_accuracy"])
        metrics_history["slot_accuracy"].append(val_metrics["slot_accuracy"])
        metrics_history["joint_accuracy"].append(val_metrics["joint_accuracy"])
        
        global_epoch = (stage * epochs_per_stage) + epoch
        print(f"Epoch {global_epoch+1}/{total_epochs} - "
              f"Train Loss: {total_loss/len(stage_loader):.4f}, "
              f"Val Loss: {val_metrics['loss']:.4f}, "
              f"Intent Acc: {val_metrics['intent_accuracy']:.4f}, "
              f"Slot Acc: {val_metrics['slot_accuracy']:.4f}")

# Plot learning curves
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(range(1, total_epochs+1), metrics_history["loss"])
plt.title('Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(alpha=0.3)

plt.subplot(2, 2, 2)
plt.plot(range(1, total_epochs+1), metrics_history["intent_accuracy"])
plt.title('Intent Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid(alpha=0.3)

plt.subplot(2, 2, 3)
plt.plot(range(1, total_epochs+1), metrics_history["slot_accuracy"])
plt.title('Slot Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid(alpha=0.3)

plt.subplot(2, 2, 4)
plt.plot(range(1, total_epochs+1), metrics_history["joint_accuracy"])
plt.title('Joint Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Highlight curriculum transitions
plt.figure(figsize=(10, 6))
plt.plot(range(1, total_epochs+1), metrics_history["joint_accuracy"], marker='o')
plt.axvline(x=epochs_per_stage + 0.5, color='r', linestyle='--', alpha=0.7, label='Stage 1 → 2')
plt.axvline(x=2*epochs_per_stage + 0.5, color='g', linestyle='--', alpha=0.7, label='Stage 2 → 3')
plt.title('Learning Progress with Curriculum Stages')
plt.xlabel('Epoch')
plt.ylabel('Joint Accuracy')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

In [ ]:
# --------- 7. Comparison with Standard Training ---------
import copy

# Create a copy of the model for standard training
standard_model = copy.deepcopy(model)
standard_model = JointBERT(num_intents=len(intent2id), num_slots=len(slot_label2id)).to(device)
standard_optimizer = AdamW(standard_model.parameters(), lr=5e-5)

# Create dataset for standard training (without curriculum)
standard_train_dataset = JointDataset(train_data)
standard_train_loader = DataLoader(standard_train_dataset, batch_size=4, shuffle=True)

# Training metrics for standard approach
standard_metrics_history = {
    "loss": [],
    "intent_accuracy": [],
    "slot_accuracy": [],
    "joint_accuracy": []
}

print("\n--- Starting Standard Training (No Curriculum) ---")

# Train for the same total number of epochs
for epoch in range(total_epochs):
    # Training epoch
    standard_model.train()
    total_loss = 0
    for batch in standard_train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        standard_optimizer.zero_grad()
        loss, _, _ = standard_model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
            intent_label=batch["intent_label"]
        )
        loss.backward()
        standard_optimizer.step()
        total_loss += loss.item()
    
    # Evaluation
    standard_model.eval()
    val_metrics = evaluate_model(standard_model, val_loader)
    standard_metrics_history["loss"].append(val_metrics["loss"])
    standard_metrics_history["intent_accuracy"].append(val_metrics["intent_accuracy"])
    standard_metrics_history["slot_accuracy"].append(val_metrics["slot_accuracy"])
    standard_metrics_history["joint_accuracy"].append(val_metrics["joint_accuracy"])
    
    print(f"Epoch {epoch+1}/{total_epochs} - "
          f"Train Loss: {total_loss/len(standard_train_loader):.4f}, "
          f"Val Loss: {val_metrics['loss']:.4f}, "
          f"Intent Acc: {val_metrics['intent_accuracy']:.4f}, "
          f"Slot Acc: {val_metrics['slot_accuracy']:.4f}")

# Comparison plots
plt.figure(figsize=(15, 10))

# Loss comparison
plt.subplot(2, 2, 1)
plt.plot(range(1, total_epochs+1), metrics_history["loss"], label='Curriculum Learning')
plt.plot(range(1, total_epochs+1), standard_metrics_history["loss"], label='Standard Training')
plt.title('Validation Loss Comparison')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(alpha=0.3)

# Intent accuracy comparison
plt.subplot(2, 2, 2)
plt.plot(range(1, total_epochs+1), metrics_history["intent_accuracy"], label='Curriculum Learning')
plt.plot(range(1, total_epochs+1), standard_metrics_history["intent_accuracy"], label='Standard Training')
plt.title('Intent Accuracy Comparison')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(alpha=0.3)

# Slot accuracy comparison
plt.subplot(2, 2, 3)
plt.plot(range(1, total_epochs+1), metrics_history["slot_accuracy"], label='Curriculum Learning')
plt.plot(range(1, total_epochs+1), standard_metrics_history["slot_accuracy"], label='Standard Training')
plt.title('Slot Accuracy Comparison')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(alpha=0.3)

# Joint accuracy comparison
plt.subplot(2, 2, 4)
plt.plot(range(1, total_epochs+1), metrics_history["joint_accuracy"], label='Curriculum Learning')
plt.plot(range(1, total_epochs+1), standard_metrics_history["joint_accuracy"], label='Standard Training')
plt.title('Joint Accuracy Comparison')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Final performance comparison
print("\n--- Final Performance Comparison ---")
print(f"{'Metric':<20} {'Curriculum':<15} {'Standard':<15} {'Difference':<15}")
print("-" * 65)

for metric in ["loss", "intent_accuracy", "slot_accuracy", "joint_accuracy"]:
    curriculum_value = metrics_history[metric][-1]
    standard_value = standard_metrics_history[metric][-1]
    difference = curriculum_value - standard_value
    difference_str = f"{difference:.4f} ({difference/standard_value*100:.1f}%)" if metric != "loss" else f"{difference:.4f}"
    
    # Improvement indicator
    if metric == "loss":
        is_better = difference < 0
    else:
        is_better = difference > 0
    
    indicator = "✓" if is_better else "✗"
    print(f"{metric.replace('_', ' ').title():<20} {curriculum_value:.4f}{'':<8} {standard_value:.4f}{'':<8} {difference_str:<15} {indicator}")

# Analysis of performance on different difficulty levels
print("\n--- Performance Analysis by Difficulty Level ---")

# Group test examples by difficulty
difficulty_ranges = {
    "Easy": (0, easy_threshold),
    "Medium": (easy_threshold, medium_threshold),
    "Hard": (medium_threshold, float('inf'))
}

# Calculate difficulty for validation examples
val_difficulties = [calculate_example_difficulty(example) for example in val_data]

# Split validation data into difficulty groups
easy_val = [i for i, diff in enumerate(val_difficulties) if diff <= easy_threshold]
medium_val = [i for i, diff in enumerate(val_difficulties) if easy_threshold < diff <= medium_threshold]
hard_val = [i for i, diff in enumerate(val_difficulties) if diff > medium_threshold]

difficulty_groups = {
    "Easy": easy_val,
    "Medium": medium_val,
    "Hard": hard_val
}

# Create datasets for each difficulty group
difficulty_datasets = {}
for difficulty, indices in difficulty_groups.items():
    if indices:  # Only create datasets for non-empty groups
        difficulty_data = [val_data[i] for i in indices]
        difficulty_datasets[difficulty] = JointDataset(difficulty_data)

# Evaluate both models on each difficulty group
print(f"{'Difficulty':<10} {'Examples':<10} {'Curriculum Acc':<15} {'Standard Acc':<15} {'Difference':<15}")
print("-" * 65)

for difficulty, dataset in difficulty_datasets.items():
    loader = DataLoader(dataset, batch_size=4, shuffle=False)
    
    # Skip if no examples in this difficulty range
    if len(loader) == 0:
        continue
    
    # Curriculum model performance
    curr_metrics = evaluate_model(model, loader)
    
    # Standard model performance
    std_metrics = evaluate_model(standard_model, loader)
    
    # Joint accuracy comparison
    curr_acc = curr_metrics["joint_accuracy"]
    std_acc = std_metrics["joint_accuracy"]
    difference = curr_acc - std_acc
    difference_str = f"{difference:.4f} ({difference/std_acc*100:.1f}%)" if std_acc > 0 else "N/A"
    
    # Improvement indicator
    indicator = "✓" if difference > 0 else "✗"
    if difference == 0:
        indicator = "-"
    
    print(f"{difficulty:<10} {len(dataset):<10} {curr_acc:.4f}{'':<8} {std_acc:.4f}{'':<8} {difference_str:<15} {indicator}")

# Analysis conclusion
print("\nKey Findings:")
print("1. Curriculum learning shows " + 
      ("better" if metrics_history["joint_accuracy"][-1] > standard_metrics_history["joint_accuracy"][-1] else "worse") + 
      " overall performance compared to standard training.")
print("2. The biggest difference in performance is seen in " + 
      ("easy" if curr_metrics["joint_accuracy"] - std_metrics["joint_accuracy"] > 0 else "hard") + 
      " examples, suggesting that curriculum learning " + 
      ("helps" if curr_metrics["joint_accuracy"] - std_metrics["joint_accuracy"] > 0 else "may not help") + 
      " the model build a stronger foundation.")
print("3. Learning rate is " + 
      ("faster" if np.mean(metrics_history["joint_accuracy"][:3]) > np.mean(standard_metrics_history["joint_accuracy"][:3]) else "slower") + 
      " in early epochs with curriculum learning, indicating " + 
      ("more" if np.mean(metrics_history["joint_accuracy"][:3]) > np.mean(standard_metrics_history["joint_accuracy"][:3]) else "less") + 
      " efficient initial learning.")

In [ ]:
# --------- 10. Alternative Curriculum Learning Approaches ---------

# This cell provides alternative approaches to curriculum learning that could be explored

def get_difficulty_by_uncertainty(model, example, tokenizer):
    """Measure example difficulty by model uncertainty/confidence
    
    Higher uncertainty = more difficult example
    This approach requires a pre-trained model to assess difficulty
    """
    model.eval()
    words = example["words"]
    inputs = tokenizer([words], is_split_into_words=True, return_tensors="pt").to(device)
    
    with torch.no_grad():
        _, intent_logits, slot_logits = model(inputs["input_ids"], inputs["attention_mask"])
        
        # Calculate uncertainty using entropy
        intent_probs = torch.nn.functional.softmax(intent_logits, dim=1)
        intent_entropy = -torch.sum(intent_probs * torch.log(intent_probs + 1e-10), dim=1).item()
        
        # Average slot entropy across sequence
        slot_probs = torch.nn.functional.softmax(slot_logits, dim=2)
        slot_entropy = -torch.sum(slot_probs * torch.log(slot_probs + 1e-10), dim=2)
        avg_slot_entropy = torch.mean(slot_entropy).item()
        
    return intent_entropy + avg_slot_entropy

def get_difficulty_by_intent_distribution(example):
    """Curriculum based on intent distribution
    
    Train on one intent type at a time, starting with simpler intents
    """
    # Predefined intent difficulty (domain knowledge)
    intent_difficulty = {
        "check_balance": 1,      # Simplest - mostly information retrieval
        "hide_groups": 1,        # Simple - just a setting
        "group_summary": 2,      # Medium - display structured info
        "create_group": 3,       # More complex - creating entities with attributes
        "add_member": 3,         # More complex - modifying entities
        "add_expense": 4         # Most complex - numerical values, categories, etc.
    }
    
    return intent_difficulty.get(example["intent"], 5)  # Default to highest difficulty

def get_difficulty_by_slot_density(example):
    """Measure difficulty by the density of slots
    
    Higher slot density = more difficult example
    """
    total_words = len(example["words"])
    if total_words == 0:
        return 0
        
    slot_words = sum(1 for slot in example["slots"] if slot != "O")
    return slot_words / total_words

# Example of a competence-based curriculum:
# This approach adapts the curriculum based on model performance
def competence_based_curriculum(model, data, tokenizer, num_epochs=8):
    """
    Implement a competence-based curriculum that adapts based on model performance
    """
    # Split data into validation and training
    from sklearn.model_selection import train_test_split
    train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
    
    # Calculate initial difficulty scores
    difficulties = [(i, get_example_difficulty(example)) for i, example in enumerate(train_data)]
    difficulties.sort(key=lambda x: x[1])
    
    # Start with easier 20% of examples
    active_indices = set([idx for idx, _ in difficulties[:int(0.2 * len(difficulties))]])
    
    model.train()
    val_dataset = JointDataset(val_data)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
    
    # Track performance history
    performance_history = []
    
    for epoch in range(num_epochs):
        # Create dataset from active examples
        active_data = [train_data[i] for i in active_indices]
        active_dataset = JointDataset(active_data)
        active_loader = DataLoader(active_dataset, batch_size=4, shuffle=True)
        
        # Train for one epoch
        total_loss = 0
        for batch in active_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            loss, _, _ = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
                intent_label=batch["intent_label"]
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Evaluate current performance
        metrics = evaluate_model(model, val_loader)
        performance_history.append(metrics["joint_accuracy"])
        print(f"Epoch {epoch+1}/{num_epochs}, {len(active_indices)}/{len(train_data)} examples, "
              f"loss: {total_loss/len(active_loader):.4f}, "
              f"acc: {metrics['joint_accuracy']:.4f}")
        
        # Adjust curriculum based on performance
        # If performance plateaus, add more difficult examples
        if epoch >= 2 and abs(performance_history[-1] - performance_history[-2]) < 0.01:
            # Add next 10% of examples by difficulty
            current_size = len(active_indices)
            target_size = min(len(train_data), int(current_size * 1.1))
            remaining = [(idx, diff) for idx, diff in difficulties if idx not in active_indices]
            remaining.sort(key=lambda x: x[1])
            
            # Add easier remaining examples first
            to_add = target_size - current_size
            active_indices.update([idx for idx, _ in remaining[:to_add]])
            
            print(f"  Performance plateau detected. Added {to_add} examples.")
    
    return model, performance_history

# Example usage (not executed to save computation time):
"""
print("\n--- Competence-based Curriculum Learning ---")
competence_model = JointBERT(num_intents=len(intent2id), num_slots=len(slot_label2id)).to(device)
competence_optimizer = AdamW(competence_model.parameters(), lr=5e-5)
competence_model, history = competence_based_curriculum(competence_model, data, tokenizer)
"""

# Other curriculum strategies that could be explored:
# 
# 1. Mixed difficulty batches: Rather than training on all easy examples first,
#    create batches with a mix of difficulties but weight towards easier examples initially
#
# 2. Task-specific curriculum: Train intent classification first, then slot filling
#
# 3. Intent-based curriculum: Train on one intent at a time in order of increasing complexity
#
# 4. Online difficulty assessment: Periodically re-assess example difficulty based on
#    the model's current loss or uncertainty on each example

In [ ]:
# --------- 9. Curriculum Learning Evaluation ---------
import matplotlib.pyplot as plt
import numpy as np

# Function to evaluate model on test data
def evaluate_model(model, test_loader):
    model.eval()
    intent_correct = 0
    slot_correct = 0
    slot_total = 0
    total = 0
    
    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            _, intent_logits, slot_logits = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )
            
            # Intent accuracy
            intent_preds = torch.argmax(intent_logits, dim=1)
            intent_correct += (intent_preds == batch["intent_label"]).sum().item()
            
            # Slot accuracy (ignoring padding tokens)
            slot_preds = torch.argmax(slot_logits, dim=2)
            active_slots = batch["labels"] != -100
            slot_correct += ((slot_preds == batch["labels"]) & active_slots).sum().item()
            slot_total += active_slots.sum().item()
            
            total += batch["intent_label"].size(0)
    
    intent_accuracy = intent_correct / total
    slot_accuracy = slot_correct / slot_total if slot_total > 0 else 0
    
    return {
        "intent_accuracy": intent_accuracy,
        "slot_accuracy": slot_accuracy,
        "joint_accuracy": (intent_accuracy + slot_accuracy) / 2
    }

# Visualize difficulty distribution
difficulties = [get_example_difficulty(example) for example in data]

plt.figure(figsize=(10, 5))
plt.hist(difficulties, bins=10, alpha=0.7)
plt.title('Distribution of Example Difficulties')
plt.xlabel('Difficulty Score')
plt.ylabel('Number of Examples')
plt.grid(alpha=0.3)
plt.show()

# Print difficulty statistics
print(f"Difficulty statistics:")
print(f"  Min: {min(difficulties):.2f}")
print(f"  Max: {max(difficulties):.2f}")
print(f"  Mean: {np.mean(difficulties):.2f}")
print(f"  Median: {np.median(difficulties):.2f}")

# Create validation set from 20% of the data
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
val_dataset = JointDataset(val_data)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Training without curriculum learning (for comparison)
no_curriculum_model = JointBERT(num_intents=len(intent2id), num_slots=len(slot_label2id)).to(device)
no_curriculum_optimizer = AdamW(no_curriculum_model.parameters(), lr=5e-5)
train_dataset = JointDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

print("\n--- Training without curriculum learning ---")
no_curriculum_model.train()
for epoch in range(8):
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        no_curriculum_optimizer.zero_grad()
        loss, _, _ = no_curriculum_model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
            intent_label=batch["intent_label"]
        )
        loss.backward()
        no_curriculum_optimizer.step()
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}/8 loss: {total_loss/len(train_loader):.4f}")
    
    # Evaluate after each epoch
    metrics = evaluate_model(no_curriculum_model, val_loader)
    print(f"  Validation - Intent Acc: {metrics['intent_accuracy']:.4f}, Slot Acc: {metrics['slot_accuracy']:.4f}")

# Compare the two models
curriculum_metrics = evaluate_model(model, val_loader)
no_curriculum_metrics = evaluate_model(no_curriculum_model, val_loader)

print("\n--- Comparison of Models ---")
print("With Curriculum Learning:")
print(f"  Intent Accuracy: {curriculum_metrics['intent_accuracy']:.4f}")
print(f"  Slot Accuracy: {curriculum_metrics['slot_accuracy']:.4f}")
print(f"  Joint Accuracy: {curriculum_metrics['joint_accuracy']:.4f}")

print("\nWithout Curriculum Learning:")
print(f"  Intent Accuracy: {no_curriculum_metrics['intent_accuracy']:.4f}")
print(f"  Slot Accuracy: {no_curriculum_metrics['slot_accuracy']:.4f}")
print(f"  Joint Accuracy: {no_curriculum_metrics['joint_accuracy']:.4f}")

# Plot learning curves
plt.figure(figsize=(12, 6))
plt.title('Accuracy Comparison')
plt.bar(['Intent Acc (w/ CL)', 'Intent Acc', 'Slot Acc (w/ CL)', 'Slot Acc', 'Joint Acc (w/ CL)', 'Joint Acc'], 
        [curriculum_metrics['intent_accuracy'], no_curriculum_metrics['intent_accuracy'],
         curriculum_metrics['slot_accuracy'], no_curriculum_metrics['slot_accuracy'],
         curriculum_metrics['joint_accuracy'], no_curriculum_metrics['joint_accuracy']])
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.grid(axis='y', alpha=0.3)
plt.show()

In [None]:
# --------- 8. Test ---------
test_sentences = [
    "Add expense of $20 to group Travel Friends",
    "Add Sarah to Startup Budget group",
    "What's my share of rent in House Bills?",
    "Create a group called Travel Friends with Bob and Alice",
    "Show me a summary of the group for my band practice",
    "Hide all my groups with zero balance"
]

for sent in test_sentences:
    intent, slots = predict(sent)
    print(f"\nInput: {sent}")
    print(f"Predicted intent: {intent}")
    print(f"Predicted slots:\n{json.dumps(slots, indent=2)}")