## 1. Setup & Imports

In [1]:
import os
import sys

# Location of the project checkout that provides the `mlops` package.
# The original machine-specific path stays as the default; set the
# AKULEARN_DOCS_ROOT environment variable to run the notebook elsewhere.
PROJECT_ROOT = os.environ.get('AKULEARN_DOCS_ROOT', '/Users/hp/Documents/Akulearn_docs')

# Keep the checkout first on sys.path, but do not stack another copy each
# time this cell is re-run.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import json

# Import our content generator
from mlops.exam_content_generator import (
    ExamContentOrchestrator,
    GenerationRequest,
    ExamBoard,
    Difficulty,
)

print("✓ All imports successful!")

  from .autonotebook import tqdm as notebook_tqdm


✓ All imports successful!


## 2. Generate Content for Different Exam Boards

In [6]:
# Mock data for demo (avoiding API calls that require authentication)
import random
from datetime import datetime

# Create mock questions data
def create_mock_questions(exam_board, subject, topic, difficulty, count, seed=None):
    """Create mock question records for demo purposes.

    Args:
        exam_board: Label stored under 'exam_board' in every record.
        subject: Subject key used to look up question templates.
        topic: Topic key used to look up question templates within the subject.
        difficulty: Label stored under 'difficulty' in every record.
        count: Number of records to create.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same seed always yields the same records. When omitted,
            the module-level ``random`` generator is used.

    Returns:
        A list of ``count`` dicts with the keys 'exam_board', 'subject',
        'topic', 'difficulty', 'question_text', 'quality_score' and
        'relevance_score'. Unknown subject/topic pairs fall back to the
        placeholder text "Sample question".
    """
    # Sample question templates, keyed by subject and then by topic
    templates = {
        "mathematics": {
            "algebra": [
                "Solve: 2x + 3 = 11",
                "Simplify: (x + 2)(x - 3)",
                "Factor: x² - 5x + 6",
            ],
            "geometry": [
                "Find the area of a circle with radius 5cm",
                "Calculate the volume of a cube with side 3cm",
            ]
        },
        "biology": {
            "photosynthesis": [
                "What is the primary function of chlorophyll?",
                "Name the three main stages of photosynthesis",
                "What is the light-dependent reaction?",
            ]
        },
        "chemistry": {
            "periodic_table": [
                "What is the atomic number of Oxygen?",
                "Which element has the symbol 'Au'?",
                "What is the valence of Carbon?",
            ]
        }
    }

    # The candidate texts do not change between iterations, so look them up once.
    candidates = templates.get(subject, {}).get(topic, ["Sample question"])
    rng = random.Random(seed) if seed is not None else random

    questions = []
    for _ in range(count):
        # Draw order (text, quality, relevance) is kept stable so a seeded run
        # is reproducible.
        q_text = rng.choice(candidates)
        questions.append({
            'exam_board': exam_board,
            'subject': subject,
            'topic': topic,
            'difficulty': difficulty,
            # Previously the chosen text was discarded; keep it on the record.
            'question_text': q_text,
            'quality_score': rng.uniform(0.75, 0.98),
            'relevance_score': rng.uniform(0.80, 0.99),
        })

    return questions

# WAEC demo batch: 15 medium-difficulty algebra questions (mock data)
print("\n📝 Generating WAEC Mathematics Content (Mock Data)...")
waec_questions = create_mock_questions('WAEC', 'mathematics', 'algebra', 'medium', 15)

# Both keys point at the same list, so the two counts below always match.
waec_result = dict(generated=waec_questions, validated=waec_questions)

print(f"✓ Generated {len(waec_result['generated'])} questions")
print(f"✓ {len(waec_result['validated'])} questions passed validation")


INFO:mlops.exam_content_generator:Initializing QuestionGeneratorAgent with model: gpt2
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/facebook/bart-large-cnn/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Trying to resume download...
Error while downloading from https://huggingface.co/facebook/bart-large-cnn/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume 

KeyboardInterrupt: 

In [None]:
# NECO demo batch: 12 easy photosynthesis questions (mock data)
neco_questions = create_mock_questions('NECO', 'biology', 'photosynthesis', 'easy', 12)

# Both keys point at the same list, so the two counts below always match.
neco_result = dict(generated=neco_questions, validated=neco_questions)

print("\n📝 Generating NECO Biology Content (Mock Data)...")
print(f"✓ Generated {len(neco_result['generated'])} questions")
print(f"✓ {len(neco_result['validated'])} questions passed validation")



📝 Generating NECO Biology Content...


NameError: name 'orchestrator' is not defined

In [None]:
# JAMB demo batch: 18 hard periodic-table questions (mock data)
jamb_questions = create_mock_questions('JAMB', 'chemistry', 'periodic_table', 'hard', 18)

# Both keys point at the same list, so the two counts below always match.
jamb_result = dict(generated=jamb_questions, validated=jamb_questions)

print("\n📝 Generating JAMB Chemistry Content (Mock Data)...")
print(f"✓ Generated {len(jamb_result['generated'])} questions")
print(f"✓ {len(jamb_result['validated'])} questions passed validation")


## 3. Analyze Generation Statistics

In [None]:
# Pool the validated questions of all three exam boards into one list
all_questions = [
    *waec_result['validated'],
    *neco_result['validated'],
    *jamb_result['validated'],
]

# One flat record per question, plus the mean of its two scores
data = [
    {
        'exam_board': q['exam_board'],
        'subject': q['subject'],
        'topic': q['topic'],
        'difficulty': q['difficulty'],
        'quality_score': q['quality_score'],
        'relevance_score': q['relevance_score'],
        'avg_score': (q['quality_score'] + q['relevance_score']) / 2,
    }
    for q in all_questions
]

df = pd.DataFrame(data)

print("\n📊 CONTENT GENERATION STATISTICS")
print("=" * 50)
print(f"Total Questions Generated: {len(df)}")

# Same heading-then-counts report for each categorical column
for heading, column in (
    ("Exam Board", 'exam_board'),
    ("Subject", 'subject'),
    ("Difficulty", 'difficulty'),
):
    print(f"\nBy {heading}:")
    print(df[column].value_counts())

print("\nQuality Metrics:")
print(f"  Avg Quality Score:   {df['quality_score'].mean():.3f}")
print(f"  Avg Relevance Score: {df['relevance_score'].mean():.3f}")
print(f"  Avg Overall Score:   {df['avg_score'].mean():.3f}")


## 4. Visualizations

In [7]:
# Chart 1: Distribution by Exam Board
# Name both columns explicitly instead of renaming 'count'/'exam_board':
# the column names produced by value_counts().reset_index() differ between
# pandas 1.x and 2.x, so the rename only worked on pandas >= 2.0.
board_counts = (
    df['exam_board']
    .value_counts()
    .rename_axis('Exam Board')
    .reset_index(name='Number of Questions')
)

fig1 = px.bar(
    board_counts,
    x='Exam Board',
    y='Number of Questions',
    color='Exam Board',
    title='Questions Generated by Exam Board',
    color_discrete_map={'WAEC': '#1f77b4', 'NECO': '#ff7f0e', 'JAMB': '#2ca02c'}
)
# Bars are already labelled on the x-axis, so the legend is redundant.
fig1.update_layout(showlegend=False, height=400)
fig1.show()

NameError: name 'df' is not defined

In [8]:
# Chart 2: Difficulty Distribution
difficulty_order = ['easy', 'medium', 'hard']
# reindex keeps all three levels present (count 0) even if one is missing
difficulty_data = df['difficulty'].value_counts().reindex(difficulty_order, fill_value=0)

# color_discrete_map is keyed by the values of the `color` argument, so the
# chart needs color='difficulty' for the custom palette to take effect.
difficulty_df = difficulty_data.rename_axis('difficulty').reset_index(name='count')

fig2 = px.pie(
    difficulty_df,
    values='count',
    names='difficulty',
    color='difficulty',
    title='Question Difficulty Distribution',
    color_discrete_map={'easy': '#90EE90', 'medium': '#FFD700', 'hard': '#FF6B6B'}
)
fig2.update_layout(height=400)
fig2.show()

NameError: name 'df' is not defined

In [None]:
# Chart 3: Quality Metrics by Exam Board
# Mean of each score per exam board, with the board as a regular column
quality_by_board = (
    df.groupby('exam_board')[['quality_score', 'relevance_score']]
    .mean()
    .reset_index()
)

fig3 = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Average Quality Score", "Average Relevance Score"),
)

# One bar trace per score, placed left to right in the subplot grid
panel_specs = [
    ('quality_score', 'Quality', '#1f77b4'),
    ('relevance_score', 'Relevance', '#ff7f0e'),
]
for panel_col, (score_column, trace_name, bar_color) in enumerate(panel_specs, start=1):
    fig3.add_trace(
        go.Bar(
            x=quality_by_board['exam_board'],
            y=quality_by_board[score_column],
            name=trace_name,
            marker_color=bar_color,
        ),
        row=1,
        col=panel_col,
    )

fig3.update_layout(title_text="Quality Metrics by Exam Board", height=400, showlegend=False)
# Fix both y-axes to the same 0-1 range
fig3.update_yaxes(range=[0, 1])
fig3.show()

In [None]:
# Chart 4: Subject Distribution
# Name both columns explicitly instead of renaming 'count'/'subject':
# the column names produced by value_counts().reset_index() differ between
# pandas 1.x and 2.x, so the rename only worked on pandas >= 2.0.
subject_counts = (
    df['subject']
    .value_counts()
    .rename_axis('Subject')
    .reset_index(name='Count')
)

fig4 = px.bar(
    subject_counts,
    x='Subject',
    y='Count',
    title='Questions by Subject',
    color='Subject'
)
# Bars are already labelled on the x-axis, so the legend is redundant.
fig4.update_layout(height=400, showlegend=False)
fig4.show()

In [None]:
# Chart 5: Scatter plot - Quality vs Relevance
# One marker per question: colored by exam board, sized by its average score
board_palette = {'WAEC': '#1f77b4', 'NECO': '#ff7f0e', 'JAMB': '#2ca02c'}
axis_labels = {'quality_score': 'Quality Score', 'relevance_score': 'Relevance Score'}

fig5 = px.scatter(
    df,
    x='quality_score',
    y='relevance_score',
    color='exam_board',
    size='avg_score',
    hover_data=['difficulty'],
    title='Quality vs Relevance Score Analysis',
    labels=axis_labels,
    color_discrete_map=board_palette,
)
fig5.update_layout(height=500)
fig5.show()

## 5. Export Generated Content

In [None]:
# Export all questions to JSON
import os

output_path = 'runs/exam_content_batch.json'
# Create the output folder on first run; a no-op when it already exists
os.makedirs('runs', exist_ok=True)

with open(output_path, 'w') as f:
    json.dump(list(all_questions), f, indent=2)

print(f"\n✓ Exported {len(all_questions)} questions to: {output_path}")

# Preview the first three records
print("\n📌 Sample Questions from Dataset:")
text_fields = (
    ("Exam Board", 'exam_board'),
    ("Subject", 'subject'),
    ("Topic", 'topic'),
    ("Difficulty", 'difficulty'),
)
score_fields = (
    ("Quality Score", 'quality_score'),
    ("Relevance Score", 'relevance_score'),
)
for i, sample_question in enumerate(all_questions[:3], 1):
    print(f"\n  Sample {i}:")
    for label, key in text_fields:
        print(f"    {label}: {sample_question[key]}")
    for label, key in score_fields:
        # Scores are shown to three decimal places
        print(f"    {label}: {sample_question[key]:.3f}")


## 6. Next Steps: Integration with Google Tools

### NotebookLM Integration
- Upload textbooks/study materials to NotebookLM
- Generate audio study guides for selected topics
- Embed the audio URLs in the quiz app

### Google AI Studio Integration
- Test improved prompts for question generation
- Refine question templates
- A/B test explanation styles

### Hugging Face Hub
- Fine-tune models on exam-specific data
- Use better-performing models for generation
- Deploy via Hugging Face Inference API

In [None]:
# Closing recap: overall totals, mean scores, and per-board question counts
divider = "=" * 60

print("\n🎯 SUMMARY")
print(divider)
print(f"Total questions generated: {len(all_questions)}")
print(f"Average quality score: {df['quality_score'].mean():.3f}/1.0")
print(f"Average relevance score: {df['relevance_score'].mean():.3f}/1.0")

print("\nBy exam board:")
for board_label, board_result in (
    ("WAEC", waec_result),
    ("NECO", neco_result),
    ("JAMB", jamb_result),
):
    print(f"  {board_label}:  {len(board_result['validated'])} questions")

print("\nNext: Train models, integrate with Google tools, scale to production")
print(divider)