# Deep Knowledge Tracing (DKT) - Training Notebook

This notebook demonstrates:
1. Creating synthetic student interaction data
2. Training a DKT LSTM model
3. Evaluating model performance (AUC, Brier score)
4. Comparing with Beta-Bernoulli baseline
5. Saving the trained model for deployment
6. Generating personalized STEM learning paths from student attempts


In [None]:
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch

# The project packages live one directory up; extend the path before importing them.
sys.path.append('..')

from models.dkt import DKT, DKTPredictor, create_synthetic_data, train_dkt
from models.beta_kt import BetaKT
from evaluation import KTEvaluator

# Report the runtime environment: the installed PyTorch version and whether
# a CUDA device is visible to it.
detected_device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {detected_device}")


## 1. Generate Synthetic Data

We'll create synthetic student interaction sequences with varying abilities and learning effects.


In [None]:
# Dataset configuration (tune these to change the size of the experiment)
N_STUDENTS = 200
N_QUESTIONS = 25
SEED = 42

# Generate one interaction record per synthetic student.
# seq_len_range presumably bounds the number of attempts per student -- confirm
# against create_synthetic_data.
generation_kwargs = dict(
    n_students=N_STUDENTS,
    n_questions=N_QUESTIONS,
    seq_len_range=(15, 60),
    seed=SEED,
)
data = create_synthetic_data(**generation_kwargs)

# Quick sanity look at the first student's sequence
first_attempts = data[0]['attempts']
print(f"Generated {len(data)} student sequences")
print(f"Example sequence length: {len(first_attempts)}")
print(f"Example first 5 attempts: {first_attempts[:5]}")


## 2. Train DKT Model


In [None]:
# Split data reproducibly.
# A seeded permutation of indices replaces np.random.shuffle(data) so that
#   (a) the train/test split is identical on every Restart & Run All, and
#   (b) `data` is never reordered in place, which keeps this cell idempotent
#       when it is re-run.
TRAIN_FRACTION = 0.7  # 140 of the 200 synthetic students with the defaults above
split_rng = np.random.default_rng(SEED)
shuffled_indices = split_rng.permutation(len(data))
n_train = round(TRAIN_FRACTION * len(data))

train_data = [data[i] for i in shuffled_indices[:n_train]]
test_data = [data[i] for i in shuffled_indices[n_train:]]
assert len(train_data) + len(test_data) == len(data), "split lost or duplicated students"

# Seed PyTorch as well so that weight initialisation (and any torch-side
# shuffling inside train_dkt) is repeatable between runs.
torch.manual_seed(SEED)

# Train the DKT model on the training students; train_dkt receives save_path
# for writing the checkpoint.
model = train_dkt(
    data=train_data,
    n_questions=N_QUESTIONS,
    epochs=20,
    batch_size=32,
    lr=0.001,
    device='cpu',
    save_path='../models/dkt_model.pt'
)

print("Training complete! Model saved.")


## 3. Evaluate and Compare

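First, evaluate the DKT model on the held-out students; the Beta-Bernoulli baseline is evaluated in Section 4 and the two models are compared side by side in Section 5. For each test student, every attempt after the first five is predicted from the attempts that precede it, so the first five attempts serve only as warm-up history and are not scored.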


In [None]:
# Load the checkpoint written by the training cell and score it on the test students
predictor = DKTPredictor(model_path='../models/dkt_model.pt', device='cpu')

y_true = []
y_pred = []

# Walk through each test sequence: every attempt from index 5 onward is
# predicted from all attempts that came before it (the first 5 are warm-up only).
for student in test_data:
    attempts = student['attempts']
    for step in range(5, len(attempts)):
        target = attempts[step]
        p_correct = predictor.predict_next_question(attempts[:step], target['question_id'])
        y_true.append(int(target['correct']))
        y_pred.append(p_correct)

# Metrics
dkt_metrics = KTEvaluator.evaluate_model(y_true, y_pred)
for display_name, metric_key in (('AUC', 'auc'), ('Brier', 'brier_score'), ('Accuracy', 'accuracy')):
    print(f"DKT {display_name}: {dkt_metrics[metric_key]:.4f}")


## 4. Baseline Comparison - Beta-Bernoulli KT

Now let's compare with the simpler Beta-Bernoulli baseline model.


In [None]:
# Beta-Bernoulli baseline, constructed with alpha=1.0 and beta=1.0
beta_kt = BetaKT(alpha=1.0, beta=1.0)

# Beta-KT works on concepts rather than question ids.
# Simplification: every question is treated as its own concept.
concept_map = {qid: f"concept_{qid}" for qid in range(N_QUESTIONS)}

y_true_beta = []
y_pred_beta = []

# Same protocol as the DKT evaluation: score every attempt from index 5 onward,
# using only the attempts that precede it.
for student in test_data:
    attempts = student['attempts']
    for step in range(5, len(attempts)):
        target = attempts[step]

        # Re-express the history in the {'concept', 'correct'} format Beta-KT expects
        history_with_concepts = []
        for past in attempts[:step]:
            history_with_concepts.append(
                {'concept': concept_map[past['question_id']], 'correct': past['correct']}
            )

        # Mastery estimate for the concept behind the next question;
        # fall back to 0.5 when the concept has not been seen yet.
        mastery = beta_kt.predict_mastery(history_with_concepts)
        target_concept = concept_map[target['question_id']]
        p_mastered = mastery.get(target_concept, 0.5)

        y_true_beta.append(int(target['correct']))
        y_pred_beta.append(p_mastered)

# Evaluate Beta-KT
beta_metrics = KTEvaluator.evaluate_model(y_true_beta, y_pred_beta)
for display_name, metric_key in (('AUC', 'auc'), ('Brier', 'brier_score'), ('Accuracy', 'accuracy')):
    print(f"Beta-KT {display_name}: {beta_metrics[metric_key]:.4f}")


## 5. Model Comparison Summary


In [None]:
# Compare models side by side: a text table, two bar charts, and a short summary.
metric_rows = [
    ('AUC', 'auc'),
    ('Brier Score', 'brier_score'),
    ('Accuracy', 'accuracy'),
]

print("=" * 60)
print("MODEL COMPARISON")
print("=" * 60)
print(f"{'Metric':<20} {'DKT':<20} {'Beta-KT':<20}")
print("-" * 60)
for row_label, metric_key in metric_rows:
    print(f"{row_label:<20} {dkt_metrics[metric_key]:<20.4f} {beta_metrics[metric_key]:<20.4f}")
print("=" * 60)

# Visualize comparison
models = ['DKT', 'Beta-KT']
bar_colors = ['#3498db', '#e74c3c']
auc_scores = [dkt_metrics['auc'], beta_metrics['auc']]
brier_scores = [dkt_metrics['brier_score'], beta_metrics['brier_score']]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, values, y-label, title, y-limits, vertical offset of the value label)
panels = [
    (axes[0], auc_scores, 'AUC Score', 'AUC Comparison', [0, 1], 0.02),
    (axes[1], brier_scores, 'Brier Score', 'Brier Score Comparison (Lower is Better)',
     [0, max(brier_scores) * 1.2], 0.01),
]
for ax, scores, y_label, title, y_limits, label_offset in panels:
    ax.bar(models, scores, color=bar_colors)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.set_ylim(y_limits)
    # Print the numeric value just above each bar
    for i, v in enumerate(scores):
        ax.text(i, v + label_offset, f"{v:.4f}", ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Summarise the outcome. Positive gains mean DKT is better: higher AUC, lower
# Brier score. The wording follows the sign so the summary never claims an
# improvement that the numbers do not show.
auc_gain = dkt_metrics['auc'] - beta_metrics['auc']
brier_gain = beta_metrics['brier_score'] - dkt_metrics['brier_score']

print("\nKey Insights:")
if auc_gain >= 0:
    print(f"- DKT improves AUC by {auc_gain:.4f} over Beta-KT")
else:
    print(f"- DKT trails Beta-KT in AUC by {-auc_gain:.4f}")
if brier_gain >= 0:
    print(f"- DKT improves Brier score by {brier_gain:.4f} over Beta-KT")
else:
    print(f"- DKT has a worse (higher) Brier score than Beta-KT by {-brier_gain:.4f}")
print(f"- DKT model has been saved to ../models/dkt_model.pt")


## 6. STEM Learning Path Generation

Now let's use our trained models to generate personalized STEM learning pathways!


In [None]:
# Set up the STEM learning-path generator and report what its knowledge graph contains
from stem_path_generator import STEMPathGenerator, LearningStyle

generator = STEMPathGenerator()

concept_count = len(generator.knowledge_graph)
print(f"📚 Knowledge Graph loaded with {concept_count} concepts")

# Collect the distinct subjects covered by the concepts in the graph
available_subjects = {concept.subject for concept in generator.knowledge_graph.values()}
print(f"Subjects available: {available_subjects}")


In [None]:
### Example 1: Beginner Programmer - Visual Learner


In [None]:
# User A: Beginner with some exposure to variables
user_a_attempts = [
    {"concept": "variables", "correct": True},
    {"concept": "variables", "correct": True},
    {"concept": "variables", "correct": False},  # Still learning
    {"concept": "control_structures", "correct": False},  # Struggling
]

path_a = generator.generate_path(
    user_id="user_a_beginner",
    subject="programming",
    user_attempts=user_a_attempts,
    learning_style=LearningStyle.VISUAL,
    learning_goal="Master Python basics"
)

# Header with the path-level summary
banner = "=" * 80
print(banner)
print(f"🎯 Learning Path for {path_a.user_id}")
print(f"📊 Overall Mastery: {path_a.overall_mastery:.1%}")
print(f"⏱️  Estimated Time: {path_a.metadata['estimated_total_hours']:.1f} hours")
print(banner)

# Icon shown next to each node, keyed by the node's status string
status_emoji = {"completed": "✅", "in_progress": "🔄", "not_started": "⭕", "locked": "🔒"}

# Detail the first 5 concepts on the path, numbering them from 1
position = 0
for node in path_a.nodes[:5]:
    position += 1
    icon = status_emoji[node.status]
    prerequisites_text = ', '.join(node.concept.prerequisites) or 'None'

    print(f"\n{position}. {icon} {node.concept.name}")
    print(f"   Current Mastery: {node.current_mastery:.1%} → Target: {node.target_mastery:.1%}")
    print(f"   Status: {node.status.upper()}")
    print(f"   Prerequisites: {prerequisites_text}")
    print(f"   Recommended Resources:")
    # Only the top 2 recommended resources are listed
    for res in node.recommended_resources[:2]:
        print(f"      • {res.title} ({res.type}, {res.duration_minutes}min, difficulty: {res.difficulty:.1%})")


In [None]:
# User B: Intermediate learner, good at fundamentals but needs work on advanced topics
strong_fundamentals = [
    {"concept": "variables", "correct": True},
    {"concept": "variables", "correct": True},
    {"concept": "control_structures", "correct": True},
    {"concept": "control_structures", "correct": True},
    {"concept": "functions", "correct": True},
    {"concept": "functions", "correct": False},  # Some challenges
]
advanced_struggles = [
    {"concept": "arrays", "correct": False},
    {"concept": "arrays", "correct": False},
]
user_b_attempts = strong_fundamentals + advanced_struggles

path_b = generator.generate_path(
    user_id="user_b_intermediate",
    subject="programming",
    user_attempts=user_b_attempts,
    learning_style=LearningStyle.KINESTHETIC,  # Prefers hands-on labs and projects
    learning_goal="Prepare for data structures course"
)

# Header with the path-level summary
banner = "=" * 80
print(banner)
print(f"🎯 Learning Path for {path_b.user_id}")
print(f"📊 Overall Mastery: {path_b.overall_mastery:.1%}")
print(f"⏱️  Estimated Time: {path_b.metadata['estimated_total_hours']:.1f} hours")
print(f"🎨 Learning Style: {path_b.metadata['learning_style']}")
print(banner)

# Only concepts the learner has started or finished are listed
HANDS_ON_TYPES = ['lab', 'project', 'interactive']
for node in path_b.nodes:
    if node.status not in ["in_progress", "completed"]:
        continue

    status_emoji = {"completed": "✅", "in_progress": "🔄"}
    print(f"\n{status_emoji[node.status]} {node.concept.name}")
    print(f"   Mastery: {node.current_mastery:.1%} (Target: {node.target_mastery:.1%})")

    # Pick out the lab/project/interactive resources for this kinesthetic learner
    hands_on = []
    for resource in node.recommended_resources:
        if resource.type in HANDS_ON_TYPES:
            hands_on.append(resource)

    if hands_on:
        print(f"   🛠️ Hands-on Resources:")
        for res in hands_on[:2]:
            print(f"      • {res.title} ({res.type}, {res.duration_minutes}min)")


In [None]:
# User C: Math student preparing for calculus
user_c_attempts = [
    {"concept": "algebra_basics", "correct": True},
    {"concept": "algebra_basics", "correct": True},
    {"concept": "linear_equations", "correct": True},
    {"concept": "linear_equations", "correct": False},
]

path_c = generator.generate_path(
    user_id="user_c_math",
    subject="math",
    user_attempts=user_c_attempts,
    learning_style=LearningStyle.VISUAL,
    learning_goal="Prepare for AP Calculus"
)

# Header with the path-level summary
banner = "=" * 80
print(banner)
print(f"📐 Mathematics Learning Path for {path_c.user_id}")
print(f"📊 Overall Mastery: {path_c.overall_mastery:.1%}")
print(f"⏱️  Estimated Time: {path_c.metadata['estimated_total_hours']:.1f} hours")
print(banner)

# Icon shown next to each node, keyed by the node's status string
status_emoji = {"completed": "✅", "in_progress": "🔄", "not_started": "⭕", "locked": "🔒"}

for position, node in enumerate(path_c.nodes, start=1):
    print(f"\n{position}. {status_emoji[node.status]} {node.concept.name}")
    print(f"   Mastery: {node.current_mastery:.1%} → {node.target_mastery:.1%}")

    # Concepts without prerequisites get no prerequisite line
    if not node.concept.prerequisites:
        continue

    # Show the prerequisite chain with each prerequisite's current mastery;
    # prerequisites that are not part of this path are left out.
    prereq_status = []
    for prereq_id in node.concept.prerequisites:
        prereq_node = None
        for candidate in path_c.nodes:
            if candidate.concept.id == prereq_id:
                prereq_node = candidate
                break
        if prereq_node:
            prereq_status.append(f"{prereq_node.concept.name} ({prereq_node.current_mastery:.0%})")
    print(f"   Prerequisites: {' → '.join(prereq_status)}")


In [None]:
### Example 4: Adaptive Path Updates

# Simulating how the path adapts as the user makes progress


In [None]:
# Start with User A's original path
print("📌 Original Path for User A:")
print(f"Overall Mastery: {path_a.overall_mastery:.1%}\n")

for node in path_a.nodes[:3]:
    print(f"  {node.concept.name}: {node.current_mastery:.1%} ({node.status})")

# Snapshot the per-concept mastery BEFORE the update call.
# Whether update_path_with_new_attempts returns a fresh path or modifies path_a
# in place is not visible from this notebook; taking the snapshot first makes
# the deltas below correct in both cases. A snapshot taken afterwards would
# report zero change everywhere if the update is in place.
original_mastery = {node.concept.id: node.current_mastery for node in path_a.nodes}

# Simulate new learning attempts (user practiced control_structures)
new_attempts = [
    {"concept": "control_structures", "correct": True},
    {"concept": "control_structures", "correct": True},
    {"concept": "control_structures", "correct": True},
    {"concept": "control_structures", "correct": False},  # Still one mistake
]

print("\n🔄 User completed practice on Control Structures...")

# Update the path
updated_path = generator.update_path_with_new_attempts(path_a, new_attempts)

print("\n📈 Updated Path:")
print(f"Overall Mastery: {updated_path.overall_mastery:.1%}\n")

# Report the first three concepts and, where mastery increased, by how much.
# Concepts missing from the snapshot are compared against 0.0.
for node in updated_path.nodes[:3]:
    old_mastery = original_mastery.get(node.concept.id, 0.0)
    mastery_change = node.current_mastery - old_mastery

    change_indicator = "📈" if mastery_change > 0 else "➡️"
    print(f"  {change_indicator} {node.concept.name}: {node.current_mastery:.1%} ({node.status})", end="")
    if mastery_change > 0:
        print(f" [+{mastery_change:.1%}]")
    else:
        print()

# Get next recommended concept (nothing is printed if the generator returns a falsy value)
next_concept = generator.get_next_recommended_concept(updated_path)
if next_concept:
    print(f"\n🎯 Next Recommended: {next_concept.name}")
    print(f"   This concept will help you progress toward your goal!")


In [None]:
### Visualization: Mastery Progression Across Concepts


# Create visualization comparing different learner profiles:
# one panel per learner, current vs. target mastery for the first 6 concepts.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (path, panel title, bar colour for the "current mastery" series)
paths_to_viz = [
    (path_a, "Beginner (Visual)", "skyblue"),
    (path_b, "Intermediate (Kinesthetic)", "lightcoral"),
    (path_c, "Math Student", "lightgreen")
]

for ax, (path, title, color) in zip(axes, paths_to_viz):
    shown_nodes = path.nodes[:6]  # First 6 concepts

    concepts = [node.concept.name[:20] for node in shown_nodes]  # names truncated to 20 chars for tick labels
    # Mastery values are multiplied by 100 to plot them as percentages
    current_mastery = [node.current_mastery * 100 for node in shown_nodes]
    target_mastery = [node.target_mastery * 100 for node in shown_nodes]

    x = np.arange(len(concepts))
    width = 0.35

    # Paired bars: current mastery on the left, target on the right of each tick
    ax.bar(x - width/2, current_mastery, width, label='Current', color=color, alpha=0.8)
    ax.bar(x + width/2, target_mastery, width, label='Target', color='gray', alpha=0.4)
    ax.axhline(y=70, color='green', linestyle='--', alpha=0.3, label='Typical Target')

    ax.set_ylabel('Mastery %')
    ax.set_title(f'{title}\nOverall: {path.overall_mastery*100:.1f}%')
    ax.set_xticks(x)
    ax.set_xticklabels(concepts, rotation=45, ha='right', fontsize=8)
    ax.set_ylim(0, 100)
    ax.grid(axis='y', alpha=0.3)
    # The legend is built from the artists that exist when legend() is called,
    # so it must come after axhline for 'Typical Target' to be listed.
    ax.legend()

plt.tight_layout()
plt.savefig('stem_mastery_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n📊 Visualization saved as 'stem_mastery_comparison.png'")


# Export path in API-ready JSON format
path_json = path_a.to_dict()

# Preview: same top-level fields as the full export, but only the first 2 concepts
print("📄 Sample API Response (first 2 concepts):\n")
sample_response = {
    "path_id": path_json["path_id"],
    "user_id": path_json["user_id"],
    "subject": path_json["subject"],
    "overall_mastery": path_json["overall_mastery"],
    "concepts": path_json["concepts"][:2],  # Show first 2
    "metadata": path_json["metadata"]
}

print(json.dumps(sample_response, indent=2))

# Save full path. The output directory is created on demand so the cell also
# works on a fresh checkout where '../output' does not exist yet.
output_file = Path('../output/example_stem_path.json')
output_file.parent.mkdir(parents=True, exist_ok=True)
with output_file.open('w', encoding='utf-8') as f:
    json.dump(path_json, f, indent=2)

print("\n✅ Full learning path saved to '../output/example_stem_path.json'")
