<a href="https://colab.research.google.com/github/ogutiann/EnhancedPota/blob/main/EduPathAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Sankofa Pathways: AI-Blockchain Adaptive Learning System
Core Implementation for Soroti University Case Study
"""
import sys
from collections import Counter
import pandas as pd
import numpy as np
import transformers
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN, KMeans  # Added for density-based clustering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score  # Added cluster validation
from sentence_transformers import SentenceTransformer, datasets
from scipy.stats import laplace
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
import json
from datetime import datetime
from tqdm import tqdm
import os
import random
import psutil
# Add to top imports
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from torch.utils.data import DataLoader
import nltk


# Add these to your top-level imports
try:
    from gensim.corpora import Dictionary
    from gensim.models import CoherenceModel
    GENSIM_AVAILABLE = True
except ImportError:
    GENSIM_AVAILABLE = False
    print("Gensim not installed. Topic coherence metrics disabled.")


try:
    from umap import UMAP
    UMAP_AVAILABLE = True
except ImportError:
    UMAP_AVAILABLE = False
    print("UMAP not installed. Using PCA instead.")
    from sklearn.decomposition import PCA

# Add to top imports
try:
    from hdbscan import HDBSCAN
    HDBSCAN_AVAILABLE = True
except ImportError:
    HDBSCAN_AVAILABLE = False
    print("HDBSCAN not installed. Using KMeans for clustering.")

# =====================
# 0. CONFIGURATION
# =====================
# Per the original author's note: setting PRODUCTION_MODE to True prevents
# model evaluation. To enable evaluation, keep PRODUCTION_MODE False and
# REPRODUCIBILITY_MODE True.
PRODUCTION_MODE = False
# When True, the global NumPy and `random` generators are seeded below.
REPRODUCIBILITY_MODE = True
FINE_TUNE_BERT = True  # Global flag to enable/disable fine-tuning
IN_COLAB = 'google.colab' in sys.modules  # Detect Colab environment

if REPRODUCIBILITY_MODE:
    np.random.seed(42)
    random.seed(42)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this assignment only affects child processes started afterwards, not
    # string hashing in the current process.
    os.environ['PYTHONHASHSEED'] = '42'

# Fixed input file names (the loading code is outside this section).
DATASET_FILENAME = "soroti_engineering_dataset.csv"
TEMPLATES_FILENAME = "assessment_templates.csv"

import logging
# Configure the root logger: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# =====================
# 1. DATA PREPROCESSING
# =====================

class DataPreprocessor:
    """
    Handles data ingestion, anonymization, and normalization.

    Student identifiers are replaced by SHA-256 digests and scores are
    perturbed with Laplace noise (ε-differential privacy, ε=0.85 by default,
    per Uganda's Data Protection Act) before per-course standardization.
    """

    # Columns that ``preprocess`` reads from the raw frame.
    REQUIRED_COLUMNS = ('student_id', 'course', 'score')

    def __init__(self, epsilon=0.85):
        """
        Args:
            epsilon: Privacy budget; the Laplace scale is ``sensitivity / epsilon``.
        """
        self.epsilon = epsilon
        self.imputer = SimpleImputer(strategy='median')
        self.scaler = StandardScaler()
        # raw student id -> SHA-256 hex digest, filled by ``preprocess``.
        self.id_mapping = {}
        # Generator used in reproducibility mode; created lazily and reset at
        # the start of every ``preprocess`` run.
        self._rng = None

    @staticmethod
    def _hash_id(student_id):
        """Return the SHA-256 hex digest of ``str(student_id)``."""
        return hashlib.sha256(str(student_id).encode()).hexdigest()

    def validate_data(self, df):
        """
        Ensure data quality before processing.

        Missing values in numeric columns are replaced by the column median.
        Non-numeric columns and columns without any observed value are left
        untouched (the median imputer cannot handle them).

        Args:
            df: Input DataFrame.

        Returns:
            The input frame if nothing is missing, otherwise an imputed copy.
            The result is always a DataFrame.
        """
        missing_total = int(df.isnull().sum().sum())
        if missing_total == 0:
            return df

        print(f"Warning: Found {missing_total} missing values. Imputing...")
        df = df.copy()
        numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                        if df[col].notna().any()]
        if numeric_cols:
            df[numeric_cols] = self.imputer.fit_transform(df[numeric_cols])
        return df

    def anonymize_ids(self, student_ids):
        """Apply SHA-256 hashing to student identifiers; returns a list of hex digests."""
        return [self._hash_id(student_id) for student_id in student_ids]

    def add_laplace_noise(self, scores, sensitivity=12):
        """
        Inject Laplace noise for differential privacy.

        In reproducibility mode the noise comes from one seeded generator that
        is shared across calls, so results are repeatable while successive
        calls (e.g. one per course) still receive independent draws.

        Args:
            scores: Array-like of numeric scores.
            sensitivity: Sensitivity of the score query; scale is
                ``sensitivity / epsilon``.

        Returns:
            ``scores`` plus one Laplace sample per element.
        """
        scale = sensitivity / self.epsilon

        if REPRODUCIBILITY_MODE:
            if self._rng is None:
                self._rng = np.random.default_rng(42)
            noise = self._rng.laplace(scale=scale, size=len(scores))
        else:
            noise = np.random.laplace(scale=scale, size=len(scores))

        return scores + noise

    def normalize_scores(self, scores):
        """Standardize scores: centre on the median, divide by the standard deviation (NaN-aware)."""
        # The small constant avoids division by zero for constant scores.
        return (scores - np.nanmedian(scores)) / (np.nanstd(scores) + 1e-8)

    def preprocess(self, raw_data):
        """
        Full preprocessing pipeline:
        1. Create ID mapping before anonymization
        2. Anonymize student IDs (``hashed_id`` column)
        3. Add Laplace noise to scores, per course
        4. Normalize the noisy scores by course (``z_score`` column)

        Args:
            raw_data: DataFrame with ``student_id``, ``course`` and ``score`` columns.

        Returns:
            A processed copy; ``raw_data`` itself is not modified.

        Raises:
            KeyError: If a required column is missing.
        """
        missing = [col for col in self.REQUIRED_COLUMNS if col not in raw_data.columns]
        if missing:
            raise KeyError(f"raw_data is missing required column(s): {missing}")

        # Work on a copy to avoid mutating the caller's frame.
        data = raw_data.copy()

        # Create ID mapping BEFORE anonymization.
        self.id_mapping = {student_id: self._hash_id(student_id)
                           for student_id in data['student_id'].unique()}
        data['hashed_id'] = data['student_id'].map(self.id_mapping)

        # Noisy scores are floats; cast up front so integer score columns do
        # not hit pandas' incompatible-dtype assignment path.
        data['score'] = data['score'].astype(float)
        data['z_score'] = np.nan

        # Restart the seeded generator so every preprocess() run is repeatable.
        self._rng = None

        for course in data['course'].dropna().unique():
            mask = data['course'] == course
            noisy = self.add_laplace_noise(data.loc[mask, 'score'].to_numpy())
            data.loc[mask, 'score'] = noisy
            data.loc[mask, 'z_score'] = self.normalize_scores(noisy)

        return data


# ========================
# 2. HYBRID TOPIC MODELING
# ========================

class HybridTopicModel:
    """
    Principled integration of contextual embeddings (BERT) and probabilistic modeling (LDA)
    using topic alignment and semantic coherence measures.

    Typical call order: ``train_lda`` -> ``compute_topic_embeddings`` ->
    ``get_document_topic_distribution`` / ``get_dominant_topic``.
    """

    # Domain filler words removed in addition to sklearn's English stop words.
    ACADEMIC_STOP_WORDS = ('student', 'professor', 'university', 'chapter', 'section',
                           'example', 'problem', 'solution', 'study', 'learn')

    def __init__(self, lambda_weight=0.65, n_topics=5, fine_tune_steps=100):
        """
        Args:
            lambda_weight: Weight of the embedding side when mixing with LDA.
            n_topics: Number of LDA topics and KMeans clusters.
            fine_tune_steps: Used to derive the epoch count in ``fine_tune_bert``.
        """
        self.bert_model = self.load_bert_model()
        self.lda_model = None
        self.vectorizer = None
        self.lambda_weight = lambda_weight
        self.n_topics = n_topics
        self.topic_embeddings = None
        self.fine_tune_steps = fine_tune_steps
        self.word_embeddings = None

    def load_bert_model(self):
        """Load the sentence-transformer model; fall back to random dummy embeddings on failure."""
        try:
            print("Loading model: sentence-transformers/all-MiniLM-L6-v2")
            return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error loading BERT model: {str(e)}. Using dummy embeddings.")
            return self.DummyEmbedder()

    class DummyEmbedder:
        """Stand-in encoder that returns random vectors when the real model cannot be loaded."""

        def __init__(self, dim=384):
            self.dim = dim

        def encode(self, texts):
            """Return a ``(dim,)`` vector for a string or a ``(n, dim)`` array for a sequence."""
            if isinstance(texts, str):
                return np.random.randn(self.dim)
            # A 2-D array (not a list of vectors) so callers can slice and
            # mask it like real encoder output.
            return np.random.randn(len(list(texts)), self.dim)

    def fine_tune_bert(self, documents):
        """
        Fine-tune BERT with educational texts using paired examples.

        Skipped when fine-tuning is disabled, when not running in Colab, when
        there are no documents, or when the loaded model cannot be trained
        (the dummy embedder has no ``fit``).
        """
        if not FINE_TUNE_BERT or not IN_COLAB:
            print("Fine-tuning disabled or not in Colab. Skipping.")
            return
        if isinstance(documents, np.ndarray):
            documents = documents.tolist()
        # list() also makes the emptiness test safe for pandas Series.
        documents = [] if documents is None else list(documents)
        if not documents:
            print("Warning: No documents for fine-tuning BERT. Skipping.")
            return
        if not hasattr(self.bert_model, 'fit'):
            print("Warning: Embedding model does not support fine-tuning. Skipping.")
            return

        # Anchor and positive are the same document (label 1.0 = positive pair).
        examples = [InputExample(texts=[doc, doc], label=1.0) for doc in documents]
        train_dataloader = DataLoader(examples, shuffle=True, batch_size=8)

        train_loss = losses.CosineSimilarityLoss(self.bert_model)
        self.bert_model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=self.fine_tune_steps // len(documents) + 1,
            warmup_steps=10,
            output_path="fine_tuned_bert_model",
            use_amp=False
        )

    def _build_vectorizer(self):
        """Create the bag-of-words vectorizer with English plus academic stop words."""
        # Stop words must go through the constructor: assigning ``stop_words_``
        # on the instance has no effect on which terms are removed.
        english = CountVectorizer(stop_words='english').get_stop_words()
        stop_words = sorted(set(english) | set(self.ACADEMIC_STOP_WORDS))
        return CountVectorizer(max_df=0.85, min_df=3, stop_words=stop_words,
                               ngram_range=(1, 2), max_features=1000)

    def train_lda(self, documents):
        """
        Fit the vectorizer and an LDA model on ``documents``.

        With gensim available, a small grid over the document-topic and
        topic-word priors is searched and the model with the best c_v
        coherence is kept; otherwise a default LDA is fitted. Also stores the
        top-10 terms of every topic, flattened, in ``self.topic_terms``.
        """
        if documents is None or len(documents) == 0:
            print("Warning: No documents for LDA training. Using default model.")
            self.lda_model = LatentDirichletAllocation(n_components=self.n_topics, random_state=42)
            return

        self.vectorizer = self._build_vectorizer()
        dtm = self.vectorizer.fit_transform(documents)
        feature_names = self.vectorizer.get_feature_names_out()

        if GENSIM_AVAILABLE:
            # Tokenize with the vectorizer's own analyzer so that every topic
            # term (lower-cased, possibly a bigram) exists in the dictionary.
            analyzer = self.vectorizer.build_analyzer()
            texts = [analyzer(doc) for doc in documents]
            dictionary = Dictionary(texts)

            best_coherence = float('-inf')
            best_lda = None
            first_lda = None
            for alpha in [0.1, 0.5, 1.0]:
                for eta in [0.01, 0.1]:
                    lda = LatentDirichletAllocation(
                        n_components=self.n_topics,
                        learning_method='online',
                        learning_offset=10.,
                        random_state=42,
                        max_iter=15,
                        n_jobs=1,
                        doc_topic_prior=alpha,
                        topic_word_prior=eta
                    )
                    lda.fit(dtm)
                    if first_lda is None:
                        first_lda = lda

                    # Top-10 terms per topic, most probable first.
                    topics = [[str(feature_names[i]) for i in topic.argsort()[:-11:-1]]
                              for topic in lda.components_]
                    try:
                        coherence = CoherenceModel(
                            topics=topics,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v'
                        ).get_coherence()
                    except Exception as e:
                        print(f"Coherence computation failed (alpha={alpha}, eta={eta}): {str(e)}")
                        continue
                    if coherence > best_coherence:
                        best_coherence = coherence
                        best_lda = lda
            # If no coherence could be computed (errors or NaN), still keep a
            # fitted model instead of leaving ``lda_model`` unset.
            self.lda_model = best_lda if best_lda is not None else first_lda
        else:
            print("Gensim not available. Using default LDA model.")
            self.lda_model = LatentDirichletAllocation(
                n_components=self.n_topics,
                random_state=42,
                max_iter=15,
                n_jobs=1
            )
            self.lda_model.fit(dtm)

        # Flat list: top-10 terms of topic 0, then topic 1, and so on.
        top_indices = self.lda_model.components_.argsort(axis=1)[:, ::-1][:, :10]
        self.topic_terms = [feature_names[i] for row in top_indices for i in row]

    def compute_topic_embeddings(self, documents):
        """
        Build one embedding per topic index.

        Documents are clustered with KMeans on their embeddings; topic ``i``
        blends the mean embedding of cluster ``i`` (weight ``lambda_weight``)
        with the mean embedding of the top-15 words of LDA topic ``i``.
        Requires ``train_lda`` to have been run.
        """
        doc_embeddings = np.asarray(self.bert_model.encode(documents))

        kmeans = KMeans(n_clusters=self.n_topics, random_state=42)
        doc_clusters = kmeans.fit_predict(doc_embeddings)

        # Row-normalize LDA components into topic-word distributions.
        components = self.lda_model.components_
        topic_word_dist = components / components.sum(axis=1)[:, np.newaxis]
        feature_names = self.vectorizer.get_feature_names_out()

        topic_embeddings = []
        for i in range(self.n_topics):
            top_words = [str(feature_names[idx])
                         for idx in topic_word_dist[i].argsort()[::-1][:15]]

            # Reuse the embeddings computed above instead of re-encoding.
            members = doc_embeddings[doc_clusters == i]
            cluster_avg = members.mean(axis=0) if len(members) else kmeans.cluster_centers_[i]

            word_avg = np.mean(self.bert_model.encode(top_words), axis=0)

            topic_embeddings.append(self.lambda_weight * cluster_avg +
                                    (1 - self.lambda_weight) * word_avg)

        self.topic_embeddings = np.array(topic_embeddings)

    def get_document_topic_distribution(self, documents):
        """
        Return a ``(n_documents, n_topics)`` matrix of topic probabilities.

        Cosine similarity to the topic embeddings and the LDA document-topic
        distribution are mixed with ``lambda_weight`` and passed through a
        softmax.
        """
        doc_embeddings = self.bert_model.encode(documents)
        sim_matrix = cosine_similarity(doc_embeddings, self.topic_embeddings)

        dtm = self.vectorizer.transform(documents)
        lda_dist = self.lda_model.transform(dtm)

        hybrid_dist = (self.lambda_weight * sim_matrix +
                       (1 - self.lambda_weight) * lda_dist)

        # Softmax; subtracting the row maximum leaves the result unchanged
        # and keeps the exponentials well-scaled.
        exp_dist = np.exp(hybrid_dist - hybrid_dist.max(axis=1, keepdims=True))
        return exp_dist / exp_dist.sum(axis=1, keepdims=True)

    def get_dominant_topic(self, documents):
        """Return the index of the most probable topic for every document."""
        hybrid_dist = self.get_document_topic_distribution(documents)
        return np.argmax(hybrid_dist, axis=1)

# ========================
# 2. Enhanced HYBRID TOPIC MODELING
# ========================

class EnhancedHybridTopicModel:
    """
    Paper-inspired hybrid model: dimensionality reduction (UMAP or PCA),
    document clustering (HDBSCAN or KMeans), coherence-selected LDA and
    MMR-diversified cluster terms.

    Call ``compute_topic_embeddings`` before
    ``get_document_topic_distribution`` / ``get_dominant_topic``.
    """

    # Domain filler words removed in addition to sklearn's English stop words.
    ACADEMIC_STOP_WORDS = ('student', 'professor', 'university', 'chapter', 'section',
                           'example', 'problem', 'solution', 'study', 'learn')

    def __init__(self, lambda_weight=0.85, n_topics=5, fine_tune_steps=200):
        """
        Args:
            lambda_weight: Stored for callers. NOTE(review): the mixing
                weights used by this class are currently fixed constants.
            n_topics: Cluster count for KMeans; overwritten when HDBSCAN
                determines the number of clusters.
            fine_tune_steps: Used to derive the epoch count in ``fine_tune_bert``.
        """
        self.bert_model = self.load_bert_model()
        self.lda_model = None
        self.vectorizer = None
        self.lambda_weight = lambda_weight
        self.n_topics = n_topics
        self.topic_embeddings = None
        self.fine_tune_steps = fine_tune_steps
        self.cluster_model = None
        self.clusters = None
        self.reducer = None
        self.global_lda_model = None
        self.global_vectorizer = None
        self.cluster_lda_models = {}
        self.cluster_vectorizers = {}
        # Dynamic topic range
        self.min_topics = max(3, n_topics - 2)
        self.max_topics = n_topics + 3

        # Coherence optimizer threshold
        self.coherence_threshold = 0.6

    def load_bert_model(self):
        """Load the sentence-transformer model; fall back to random dummy embeddings on failure."""
        try:
            print("Loading model: sentence-transformers/all-MiniLM-L6-v2")
            return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error loading BERT model: {str(e)}. Using dummy embeddings.")
            return self.DummyEmbedder()

    class DummyEmbedder:
        """Stand-in encoder that returns random vectors when the real model cannot be loaded."""

        def __init__(self, dim=384):
            self.dim = dim

        def encode(self, texts):
            """Return a ``(dim,)`` vector for a string or a ``(n, dim)`` array for a sequence."""
            if isinstance(texts, str):
                return np.random.randn(self.dim)
            # A 2-D array so boolean-mask indexing works as with real output.
            return np.random.randn(len(list(texts)), self.dim)

    def fine_tune_bert(self, documents):
        """
        Fine-tune BERT with educational texts using paired examples.

        Skipped when fine-tuning is disabled, when not running in Colab, when
        there are no documents, or when the loaded model cannot be trained
        (the dummy embedder has no ``fit``).
        """
        if not FINE_TUNE_BERT or not IN_COLAB:
            print("Fine-tuning disabled or not in Colab. Skipping.")
            return
        if isinstance(documents, np.ndarray):
            documents = documents.tolist()
        # list() also makes the emptiness test safe for pandas Series.
        documents = [] if documents is None else list(documents)
        if not documents:
            print("Warning: No documents for fine-tuning BERT. Skipping.")
            return
        if not hasattr(self.bert_model, 'fit'):
            print("Warning: Embedding model does not support fine-tuning. Skipping.")
            return

        # Anchor and positive are the same document (label 1.0 = positive pair).
        examples = [InputExample(texts=[doc, doc], label=1.0) for doc in documents]
        train_dataloader = DataLoader(examples, shuffle=True, batch_size=8)

        train_loss = losses.CosineSimilarityLoss(self.bert_model)
        self.bert_model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=self.fine_tune_steps // len(documents) + 1,
            warmup_steps=10,
            output_path="fine_tuned_bert_model",
            use_amp=False
        )

    def _build_vectorizer(self):
        """Create the global bag-of-words vectorizer with English plus academic stop words."""
        # Stop words must go through the constructor: assigning ``stop_words_``
        # on the instance has no effect on which terms are removed.
        english = CountVectorizer(stop_words='english').get_stop_words()
        stop_words = sorted(set(english) | set(self.ACADEMIC_STOP_WORDS))
        return CountVectorizer(max_df=0.85, min_df=3, stop_words=stop_words,
                               ngram_range=(1, 2), max_features=1000)

    def reduce_dimensions(self, embeddings):
        """
        Apply UMAP (or PCA when UMAP is unavailable) and return the reduced matrix.

        The target dimensionality is 50, lowered only when the corpus is too
        small for the reducer to produce that many components.
        """
        embeddings = np.asarray(embeddings)
        n_samples, n_features = embeddings.shape
        if UMAP_AVAILABLE:
            n_components = max(2, min(50, n_samples - 2))
            n_neighbors = max(2, min(15, n_samples - 1))
            self.reducer = UMAP(n_components=n_components, random_state=42,
                                n_neighbors=n_neighbors, min_dist=0.1)
        else:
            print("Using PCA for dimensionality reduction")
            # PCA cannot return more components than min(n_samples, n_features).
            n_components = max(1, min(50, n_samples, n_features))
            self.reducer = PCA(n_components=n_components, random_state=42)
        return self.reducer.fit_transform(embeddings)

    def cluster_documents(self, reduced_embeddings):
        """Cluster reduced embeddings with KMeans and return the labels."""
        self.cluster_model = KMeans(n_clusters=self.n_topics, random_state=42)
        return self.cluster_model.fit_predict(reduced_embeddings)

    def train_global_lda(self, documents):
        """Train the global LDA model (fallback for small clusters) and return it."""
        self.global_vectorizer = self._build_vectorizer()
        dtm = self.global_vectorizer.fit_transform(documents)

        self.global_lda_model = LatentDirichletAllocation(
            n_components=self.n_topics,
            random_state=42,
            max_iter=15
        )
        self.global_lda_model.fit(dtm)
        return self.global_lda_model

    def train_lda_per_cluster(self, documents, clusters):
        """
        Train a single-topic LDA model for each cluster id in ``range(n_topics)``.

        Clusters with fewer than 10 documents, or whose vocabulary is empty
        after pruning, reuse the global model and vectorizer.
        """
        for cluster_id in range(self.n_topics):
            cluster_docs = [doc for i, doc in enumerate(documents) if clusters[i] == cluster_id]

            if len(cluster_docs) < 10:  # Minimum documents threshold
                print(f"Cluster {cluster_id} has too few documents. Using global model.")
                self.cluster_lda_models[cluster_id] = self.global_lda_model
                self.cluster_vectorizers[cluster_id] = self.global_vectorizer
                continue

            vectorizer = CountVectorizer(max_df=0.85, min_df=2, stop_words='english',
                                         ngram_range=(1, 2), max_features=500)
            try:
                dtm = vectorizer.fit_transform(cluster_docs)
            except ValueError as e:
                # Raised by sklearn when no terms survive min_df/max_df pruning.
                print(f"Cluster {cluster_id} vectorization failed: {str(e)}. Using global model.")
                self.cluster_lda_models[cluster_id] = self.global_lda_model
                self.cluster_vectorizers[cluster_id] = self.global_vectorizer
                continue

            lda = LatentDirichletAllocation(
                n_components=1,  # Each cluster gets one primary topic
                learning_method='online',
                random_state=42,
                max_iter=10
            )
            lda.fit(dtm)

            self.cluster_lda_models[cluster_id] = lda
            self.cluster_vectorizers[cluster_id] = vectorizer

    def compute_topic_embeddings(self, documents):
        """
        Build one topic embedding per document cluster.

        Steps: embed and reduce the documents, cluster them, select a global
        LDA model by coherence, then for every non-noise cluster blend its
        centroid (0.85) with the mean embedding of MMR-selected terms (0.15).

        Returns:
            Array of shape ``(n_clusters, embedding_dim)``; also stored in
            ``self.topic_embeddings``.
        """
        bert_embeddings = np.asarray(self.bert_model.encode(documents))
        reduced_embeddings = self.reduce_dimensions(bert_embeddings)

        self._assign_clusters(reduced_embeddings)
        self._select_global_lda(documents)

        topic_embeddings = []
        for cluster_id in np.unique(self.clusters):
            if cluster_id == -1:
                continue  # Skip HDBSCAN noise points

            cluster_mask = (self.clusters == cluster_id)
            cluster_docs = [doc for doc, keep in zip(documents, cluster_mask) if keep]

            cluster_embeddings = bert_embeddings[cluster_mask]
            if len(cluster_embeddings) > 0:
                centroid = np.mean(cluster_embeddings, axis=0)
            else:
                centroid = np.mean(bert_embeddings, axis=0)

            if cluster_docs:
                try:
                    candidate_terms = self.get_candidate_terms(cluster_docs)
                    selected_terms = self.mmr_diversification(
                        centroid,
                        candidate_terms,
                        self.bert_model,
                        diversity_factor=0.7,
                        top_n=15
                    )
                except Exception as e:
                    print(f"Term selection failed for cluster {cluster_id}: {str(e)}")
                    selected_terms = self._get_global_topic_words(cluster_id)
            else:
                selected_terms = self._get_global_topic_words(cluster_id)

            # Fall back to the centroid alone when no terms can be embedded.
            word_avg = centroid
            if selected_terms:
                try:
                    word_avg = np.mean(self.bert_model.encode(selected_terms), axis=0)
                except Exception as e:
                    print(f"Term encoding failed for cluster {cluster_id}: {str(e)}")

            topic_embeddings.append(0.85 * centroid + 0.15 * word_avg)

        self.topic_embeddings = np.array(topic_embeddings)
        return self.topic_embeddings

    def _assign_clusters(self, reduced_embeddings):
        """Cluster with HDBSCAN when available and usable, otherwise with KMeans."""
        if HDBSCAN_AVAILABLE:
            try:
                # prediction_data=True is required for soft membership of new points.
                cluster_model = HDBSCAN(min_cluster_size=10, gen_min_span_tree=True,
                                        prediction_data=True)
                clusters = cluster_model.fit_predict(reduced_embeddings)
                n_clusters = len(np.unique(clusters)) - (1 if -1 in clusters else 0)
                if n_clusters > 0:
                    self.n_topics = max(3, min(n_clusters, 10))
                    self.cluster_model = cluster_model
                    self.clusters = clusters
                    print(f"HDBSCAN found {n_clusters} clusters")
                    return
                # Everything was labelled noise: no topic could be built.
                print("HDBSCAN found no clusters. Using KMeans.")
            except Exception as e:
                print(f"HDBSCAN failed: {str(e)}. Using KMeans.")
        self._cluster_with_kmeans(reduced_embeddings)

    def _select_global_lda(self, documents):
        """Fit LDA over a range of topic counts and keep the most coherent model."""
        min_topics = max(2, self.n_topics - 2)
        max_topics = min(self.n_topics + 3, 15)
        topic_range = range(min_topics, max_topics + 1)

        # The vectorizer does not depend on the topic count: fit it once.
        vectorizer = self._build_vectorizer()
        dtm = vectorizer.fit_transform(documents)

        best_coherence = float('-inf')
        best_lda = None
        optimal_topics = self.n_topics

        print(f"Optimizing LDA topic count in range {list(topic_range)}...")
        for n in topic_range:
            lda = LatentDirichletAllocation(
                n_components=n,
                learning_method='online',
                random_state=42,
                max_iter=10
            )
            lda.fit(dtm)

            coherence = self.calculate_coherence(lda, vectorizer, documents)
            print(f"  Topics={n} | Coherence={coherence:.3f}")

            if coherence > best_coherence:
                best_coherence = coherence
                best_lda = lda
                optimal_topics = n

        self.global_lda_model = best_lda
        self.global_vectorizer = vectorizer
        print(f"Selected LDA model with {optimal_topics} topics (Coherence={best_coherence:.3f})")

    def _cluster_with_kmeans(self, reduced_embeddings):
        """Cluster using KMeans as fallback"""
        self.cluster_model = KMeans(n_clusters=self.n_topics, random_state=42)
        self.clusters = self.cluster_model.fit_predict(reduced_embeddings)
        print(f"Using KMeans with {self.n_topics} clusters")

    def _get_global_topic_words(self, cluster_id):
        """Return the top-20 words of the global LDA topic matching ``cluster_id`` (clamped to the last topic)."""
        components = self.global_lda_model.components_
        cluster_idx = min(cluster_id, components.shape[0] - 1)
        top_indices = components[cluster_idx].argsort()[::-1][:20]
        feature_names = self.global_vectorizer.get_feature_names_out()
        return [feature_names[i] for i in top_indices]

    def get_document_topic_distribution(self, documents):
        """
        Return a ``(n_documents, n_topic_embeddings)`` probability matrix.

        Cosine similarity to the topic embeddings (0.75) is mixed with soft
        cluster membership (0.25) and passed through a softmax with
        temperature 0.7. If the membership matrix does not line up with the
        topic embeddings, only the similarity is used.
        """
        doc_embeddings = np.asarray(self.bert_model.encode(documents))
        semantic_sim = cosine_similarity(doc_embeddings, self.topic_embeddings)

        reduced_embeddings = self.reducer.transform(doc_embeddings)
        cluster_probs = np.asarray(self._get_cluster_probabilities(reduced_embeddings))

        if cluster_probs.shape == semantic_sim.shape:
            hybrid_dist = 0.75 * semantic_sim + 0.25 * cluster_probs
        else:
            print("Cluster probabilities do not match topic embeddings. Using semantic similarity only.")
            hybrid_dist = semantic_sim

        # Temperature-scaled softmax; subtracting the row maximum leaves the
        # result unchanged and keeps the exponentials well-scaled.
        temperature = 0.7
        shifted = hybrid_dist - hybrid_dist.max(axis=1, keepdims=True)
        scaled_dist = np.exp(shifted / temperature)
        return scaled_dist / np.sum(scaled_dist, axis=1, keepdims=True)

    def get_dominant_topic(self, documents):
        """Return the index of the most probable topic for every document."""
        hybrid_dist = self.get_document_topic_distribution(documents)
        return np.argmax(hybrid_dist, axis=1)

    def calculate_coherence(self, lda_model, vectorizer, documents):
        """
        Calculate a UMass-style coherence from document co-occurrence counts.

        For the top-10 terms of each topic and every pair where ``w_j`` ranks
        above ``w_i``, the score is ``log((D(w_j, w_i) + 1) / D(w_j))`` with
        ``D`` counting documents. Pairs that never co-occur are skipped.
        Returns the mean over topics of the per-topic mean pair score, or 0
        when no pair is valid.
        """
        # Binary document-term incidence, column-oriented for column slicing.
        incidence = (vectorizer.transform(documents) > 0).astype(np.int64).tocsc()

        topic_scores = []
        for topic_row in lda_model.components_:
            top_indices = topic_row.argsort()[::-1][:10]
            sub = incidence[:, top_indices]
            # cooc[a, b] = number of documents containing both terms a and b;
            # the diagonal holds the document frequency of each term.
            cooc = (sub.T @ sub).toarray()
            doc_freq = cooc.diagonal()

            pair_scores = [
                np.log((cooc[j, i] + 1.0) / doc_freq[j])
                for i in range(1, len(top_indices))
                for j in range(i)
                if doc_freq[j] > 0 and cooc[j, i] > 0
            ]
            if pair_scores:
                topic_scores.append(sum(pair_scores) / len(pair_scores))

        return float(sum(topic_scores) / len(topic_scores)) if topic_scores else 0

    def get_candidate_terms(self, cluster_docs, top_n=100):
        """Return up to ``top_n`` terms of the cluster ranked by summed TF-IDF; ``[]`` if vectorization fails."""
        vectorizer = TfidfVectorizer(max_features=top_n, stop_words='english')
        try:
            tfidf = vectorizer.fit_transform(cluster_docs)
        except ValueError:
            # e.g. empty vocabulary after stop-word removal
            return []
        feature_names = vectorizer.get_feature_names_out()
        word_scores = tfidf.sum(axis=0).A1
        ranked = word_scores.argsort()[::-1][:top_n]
        return [feature_names[i] for i in ranked]

    def mmr_diversification(self, centroid, terms, bert_model, diversity_factor=0.7, top_n=15):
        """
        Maximal Marginal Relevance for diverse term selection.

        Starts with the term most similar to ``centroid`` and repeatedly adds
        the term maximizing ``diversity_factor * relevance -
        (1 - diversity_factor) * max similarity to already selected terms``.

        Returns:
            Selected terms in selection order (at least one when ``terms`` is
            non-empty, at most ``min(top_n, len(terms))`` otherwise).
        """
        if not terms:
            return []

        term_embeddings = np.asarray(bert_model.encode(terms))
        relevance = cosine_similarity([centroid], term_embeddings)[0]
        # Term-term similarities computed once instead of per candidate.
        term_sim = cosine_similarity(term_embeddings)

        selected = [int(np.argmax(relevance))]
        target = min(top_n, len(terms))
        while len(selected) < target:
            redundancy = term_sim[:, selected].max(axis=1)
            mmr = diversity_factor * relevance - (1 - diversity_factor) * redundancy
            mmr[selected] = -np.inf  # never pick a term twice
            selected.append(int(np.argmax(mmr)))

        return [terms[i] for i in selected]

    def _expected_topic_count(self):
        """Number of columns a membership matrix needs to match the topic embeddings."""
        if self.topic_embeddings is not None and len(self.topic_embeddings) > 0:
            return len(self.topic_embeddings)
        return self.n_topics

    def _get_cluster_probabilities(self, embeddings):
        """
        Soft cluster membership for ``embeddings`` (reduced space).

        Tries, in order: ``predict_proba``; HDBSCAN soft membership; inverse
        distance from ``transform`` (KMeans); one-hot labels from ``predict``;
        and finally a uniform distribution.
        """
        model = self.cluster_model

        # 1. Models with predict_proba method
        if hasattr(model, 'predict_proba'):
            try:
                return model.predict_proba(embeddings)
            except Exception as e:
                print(f"Predict_proba failed: {str(e)}. Using fallback.")

        # 2. HDBSCAN: soft membership is a module-level function, not a method.
        if HDBSCAN_AVAILABLE and isinstance(model, HDBSCAN):
            try:
                import hdbscan
                return hdbscan.membership_vector(model, np.asarray(embeddings))
            except Exception as e:
                print(f"HDBSCAN membership_vector failed: {str(e)}")

        # 3. Distance-based soft assignment (KMeans)
        if hasattr(model, 'transform'):
            try:
                distances = model.transform(embeddings)
                return 1 / (1 + distances)
            except Exception as e:
                print(f"Distance transform failed: {str(e)}")

        # 4. One-hot encoding from hard labels
        n_columns = self._expected_topic_count()
        try:
            labels = np.asarray(model.predict(embeddings))
            # Wide enough for the largest label, not just the labels seen.
            width = max(n_columns, int(labels.max()) + 1) if len(labels) else n_columns
            probs = np.zeros((len(embeddings), width))
            probs[np.arange(len(embeddings)), labels] = 1
            return probs
        except Exception as e:
            print(f"One-hot encoding failed: {str(e)}")
            # Uniform distribution as last resort
            return np.ones((len(embeddings), n_columns)) / n_columns

# ========================
# 2. TOPIC MODEL EVALUATION
# ========================

class TopicModelEvaluator:
    """Evaluates multiple topic models using coherence and diversity metrics"""

    def __init__(self, documents, n_topics=5):
        """Store the corpus, tokenize it and set up the evaluation helpers.

        Args:
            documents: Collection of raw document strings to evaluate on.
            n_topics: Number of topics the compared models should produce.
        """
        self.documents, self.n_topics = documents, n_topics
        self.tokenized_docs = self._tokenize_documents(documents)

        # The gensim dictionary is only built when gensim could be imported.
        if GENSIM_AVAILABLE:
            self.dictionary = self._create_dictionary()
        else:
            self.dictionary = None

        # Plotting helper used by visualize_comparison.
        self.visualizer = ResultVisualizer()

    def _tokenize_documents(self, documents):
        """Tokenize documents using nltk or fallback"""
        try:
            from nltk.tokenize import word_tokenize
            return [word_tokenize(doc.lower()) for doc in documents]
        except ImportError:
            return [doc.lower().split() for doc in documents]

    def _calculate_topic_quality(self, topics):
        """Robust topic quality calculation with comprehensive error handling"""
        # Initialize default metrics
        metrics = {
            "coherence": 0.5,
            "distinctiveness": 0.0,
            "relevance": 0.0
        }

        # Validate topics input
        if not topics or not isinstance(topics, list) or len(topics) == 0:
            print("Warning: Invalid topics format. Returning default metrics.")
            return metrics

        # 1. Topic coherence calculation
        metrics["coherence"] = self._calculate_coherence(topics)

        # 2. Topic distinctiveness
        try:
            unique_words = set()
            total_words = 0
            for topic in topics:
                if not topic:  # Skip empty topics
                    continue
                for word in topic:
                    if word:  # Skip empty strings
                        unique_words.add(word)
                        total_words += 1
            metrics["distinctiveness"] = len(unique_words) / total_words if total_words > 0 else 0.0
        except Exception as e:
            print(f"Distinctiveness calculation failed: {str(e)}")
            metrics["distinctiveness"] = 0.0

        # 3. Term relevance (IDF-weighted)
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer

            # Handle empty documents
            if not self.documents or len(self.documents) == 0:
                print("Warning: No documents for relevance calculation.")
                return metrics

            vectorizer = TfidfVectorizer()
            try:
                tfidf = vectorizer.fit_transform(self.documents)
                idf = vectorizer.idf_
                vocab = vectorizer.get_feature_names_out()
            except ValueError:
                # Fallback for small documents
                print("Using fallback TF-IDF for small document set")
                vectorizer = TfidfVectorizer(min_df=1)
                tfidf = vectorizer.fit_transform(self.documents)
                idf = vectorizer.idf_
                vocab = vectorizer.get_feature_names_out()

            relevance_scores = []
            for topic in topics:
                if not topic:  # Skip empty topics
                    continue

                topic_score = 0.0
                valid_terms = 0

                for term in topic:
                    if term and term in vocab:  # Check for non-empty term
                        try:
                            idx = np.where(vocab == term)[0][0]
                            topic_score += idf[idx]
                            valid_terms += 1
                        except IndexError:
                            pass

                if valid_terms > 0:
                    relevance_scores.append(topic_score / valid_terms)

            metrics["relevance"] = np.mean(relevance_scores) if relevance_scores else 0.0
        except Exception as e:
            print(f"Relevance calculation failed: {str(e)}")
            metrics["relevance"] = 0.0

        return metrics

    def _create_dictionary(self):
        """Build a filtered gensim ``Dictionary`` from the tokenized corpus.

        Returns:
            The dictionary after ``filter_extremes(no_below=5, no_above=0.5)``,
            or ``None`` when gensim is not available.
        """
        if GENSIM_AVAILABLE:
            vocab = Dictionary(self.tokenized_docs)
            vocab.filter_extremes(no_below=5, no_above=0.5)
            return vocab
        return None

    def _calculate_coherence(self, topics):
        """Reliable coherence calculation with multiple fallbacks"""
        # Validate input
        if not topics or any(len(t) == 0 for t in topics):
            return 0.5

        # Attempt c_v coherence
        try:
            dictionary = Dictionary(self.tokenized_docs)
            dictionary.filter_extremes(no_below=3, no_above=0.8)
            coherence_model = CoherenceModel(
                topics=topics,
                texts=self.tokenized_docs,
                dictionary=dictionary,
                coherence='c_v'
            )
            return max(0, min(1.0, coherence_model.get_coherence()))
        except Exception as e:
            print(f"c_v coherence failed: {str(e)}")

        # Attempt u_mass coherence
        try:
            corpus = [dictionary.doc2bow(doc) for doc in self.tokenized_docs]
            coherence_model = CoherenceModel(
                topics=topics,
                corpus=corpus,
                dictionary=dictionary,
                coherence='u_mass'
            )
            u_mass = coherence_model.get_coherence()
            # Convert to 0-1 scale (approximate)
            return min(1.0, max(0, (u_mass + 10) / 20))
        except Exception as e:
            print(f"u_mass coherence failed: {str(e)}")

        # Fallback to simple metric
        return self._simple_topic_coherence(topics)

    def _simple_topic_coherence(self, topics):
        """Fallback coherence metric based on PMI"""
        from itertools import combinations
        from collections import defaultdict

        # Create document frequency map
        doc_freq = defaultdict(int)
        cooc_freq = defaultdict(int)

        for doc in self.tokenized_docs:
            unique_words = set(doc)
            for word in unique_words:
                doc_freq[word] += 1
            for w1, w2 in combinations(unique_words, 2):
                cooc_freq[(w1, w2)] += 1

        # Calculate average pairwise PMI
        topic_coherence = []
        for topic in topics:
            topic = [word for word in topic if word in doc_freq]
            if len(topic) < 2:
                continue

            pmi_scores = []
            for (w1, w2) in combinations(topic, 2):
                if (w1, w2) in cooc_freq:
                    p_w1w2 = cooc_freq[(w1, w2)] / len(self.tokenized_docs)
                    p_w1 = doc_freq[w1] / len(self.tokenized_docs)
                    p_w2 = doc_freq[w2] / len(self.tokenized_docs)
                    pmi = np.log(p_w1w2 / (p_w1 * p_w2))
                    pmi_scores.append(pmi)

            if pmi_scores:
                topic_coherence.append(np.mean(pmi_scores))

        # Normalize to 0-1 scale
        if topic_coherence:
            max_pmi = max(topic_coherence)
            return min(1.0, max(0, np.mean(topic_coherence) / max_pmi if max_pmi > 0 else 0))
        return 0.5

    def _calculate_diversity(self, topics):
        """Calculate topic diversity metric"""
        unique_words = set()
        total_words = 0
        for topic in topics:
            for word in topic:
                unique_words.add(word)
                total_words += 1
        return len(unique_words) / total_words if total_words > 0 else 0

    def evaluate_hybrid_model(self, hybrid_model):
        """Extract enriched topic term lists from a hybrid topic model.

        Each topic starts from the top LDA terms (the global LDA when the
        model has a ``global_vectorizer``, otherwise ``vectorizer`` /
        ``lda_model``). The first seven of them are kept and extended with up
        to three frequent words taken from the five documents most similar to
        the topic embedding.

        Args:
            hybrid_model: Object exposing ``n_topics``, ``topic_embeddings``,
                ``bert_model.encode`` and the LDA / vectorizer attributes
                named above.

        Returns:
            A list with one list of terms per topic (7 to 10 terms when the
            LDA provides at least seven).
        """
        n_topics = hybrid_model.n_topics

        # Get topic terms based on model type
        if getattr(hybrid_model, 'global_vectorizer', None) is not None:
            # Enhanced hybrid model
            vectorizer, lda = hybrid_model.global_vectorizer, hybrid_model.global_lda_model
        else:
            # Original hybrid model
            vectorizer, lda = hybrid_model.vectorizer, hybrid_model.lda_model

        feature_names = vectorizer.get_feature_names_out()
        top_indices = lda.components_.argsort(axis=1)[:, ::-1][:, :10]
        # Keep the terms grouped per topic; slicing a flattened list in steps
        # of 10 misaligns topics when the vocabulary has fewer than 10 terms.
        topic_terms = [[feature_names[j] for j in row] for row in top_indices]

        # The corpus embeddings do not depend on the topic: encode them once
        # instead of once per topic.
        doc_embeddings = hybrid_model.bert_model.encode(self.documents)

        enhanced_topics = []
        for i in range(n_topics):
            terms = topic_terms[i] if i < len(topic_terms) else []

            # Find the documents closest to the topic embedding
            topic_embedding = hybrid_model.topic_embeddings[i]
            doc_sims = cosine_similarity([topic_embedding], doc_embeddings)[0]
            top_doc_idx = np.argsort(doc_sims)[-5:]  # Top 5 documents
            top_docs = [self.documents[idx] for idx in top_doc_idx]

            # Candidate words: purely alphabetic and longer than 3 characters
            candidates = [w for doc in top_docs for w in doc.split()
                          if len(w) > 3 and w.isalpha()]

            # Add frequent words that are not already topic terms. No padding
            # with empty strings: those are not terms and distort the metrics.
            new_terms = [word for word, _ in Counter(candidates).most_common(3)
                         if word not in terms]

            enhanced_topics.append(terms[:7] + new_terms)

        return enhanced_topics

    def evaluate_enhanced_hybrid_model(self, hybrid_model):
        """Evaluation specifically for the paper-inspired hybrid model"""
        topics = []
        n_topics = hybrid_model.n_topics

        # Get top words from global LDA
        feature_names = hybrid_model.global_vectorizer.get_feature_names_out()

        for i in range(n_topics):
            top_indices = hybrid_model.global_lda_model.components_[i].argsort()[::-1][:10]
            topics.append([feature_names[idx] for idx in top_indices])

        return topics

    def evaluate_bert_model(self, documents):
        """Derive topics by clustering sentence embeddings with KMeans.

        Documents are embedded with ``all-MiniLM-L6-v2`` and grouped into
        ``self.n_topics`` clusters; each cluster is described by the ten
        terms with the highest summed TF-IDF weight over its documents.

        Args:
            documents: Indexable collection of document strings.

        Returns:
            A list with one term list per cluster (empty for a cluster that
            received no documents).
        """
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        doc_vectors = encoder.encode(documents)
        assignments = KMeans(n_clusters=self.n_topics, random_state=42).fit_predict(doc_vectors)

        tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
        tfidf.fit(documents)
        vocabulary = tfidf.get_feature_names_out()

        def describe(cluster_id):
            members = [documents[pos] for pos, label in enumerate(assignments)
                       if label == cluster_id]
            if not members:
                return []
            # Sum TF-IDF weights over the cluster and keep the ten largest.
            word_scores = tfidf.transform(members).sum(axis=0).A1
            return [vocabulary[idx] for idx in word_scores.argsort()[-10:][::-1]]

        return [describe(cluster_id) for cluster_id in range(self.n_topics)]

    def train_global_lda(self, documents):
        """Train the corpus-wide LDA model used as a fallback.

        Fits ``self.global_vectorizer`` (uni- and bigram counts, English plus
        academic stop words removed) and ``self.global_lda_model`` with
        ``self.n_topics`` components.

        Args:
            documents: Iterable of document strings.

        Returns:
            The fitted ``LatentDirichletAllocation`` model.
        """
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

        academic_stop_words = ['student', 'professor', 'university', 'chapter', 'section',
                               'example', 'problem', 'solution', 'study', 'learn']
        # The custom words must be passed through the `stop_words` parameter.
        # Assigning the fitted attribute `stop_words_` has no effect on
        # tokenization, so the academic words were never filtered.
        stop_words = sorted(ENGLISH_STOP_WORDS.union(academic_stop_words))
        self.global_vectorizer = CountVectorizer(max_df=0.85, min_df=3, stop_words=stop_words,
                                                 ngram_range=(1, 2), max_features=1000)
        dtm = self.global_vectorizer.fit_transform(documents)

        self.global_lda_model = LatentDirichletAllocation(
            n_components=self.n_topics,
            random_state=42,
            max_iter=15
        )
        self.global_lda_model.fit(dtm)
        return self.global_lda_model

    def train_lda_per_cluster(self, documents, clusters):
        """Train separate LDA models for each cluster"""
        for cluster_id in range(self.n_topics):
            cluster_docs = [doc for i, doc in enumerate(documents) if clusters[i] == cluster_id]

            if len(cluster_docs) < 10:  # Minimum documents threshold
                print(f"Cluster {cluster_id} has too few documents. Using global model.")
                self.cluster_lda_models[cluster_id] = self.global_lda_model
                self.cluster_vectorizers[cluster_id] = self.global_vectorizer
                continue

            # Cluster-specific vectorizer
            vectorizer = CountVectorizer(max_df=0.85, min_df=2, stop_words='english',
                                         ngram_range=(1, 2), max_features=500)
            dtm = vectorizer.fit_transform(cluster_docs)

            # Train LDA
            lda = LatentDirichletAllocation(
                n_components=1,  # Each cluster gets one primary topic
                learning_method='online',
                random_state=42,
                max_iter=10
            )
            lda.fit(dtm)

            self.cluster_lda_models[cluster_id] = lda
            self.cluster_vectorizers[cluster_id] = vectorizer


    def train_lda(self, documents):
        """Fit ``self.vectorizer`` and ``self.lda_model`` on ``documents``.

        With gensim available, a small grid over the document-topic and
        topic-word priors is searched and the model with the best ``c_v``
        coherence is kept; otherwise a single default model is fitted.
        ``self.topic_terms`` receives the flattened top-10 terms of every
        topic.

        Args:
            documents: Sized collection of document strings. When it is
                ``None`` or empty, an unfitted model is stored and nothing
                else is set.
        """
        # len() instead of truthiness so NumPy arrays are handled
        if documents is None or len(documents) == 0:
            print("Warning: No documents for LDA training. Using default model.")
            self.lda_model = LatentDirichletAllocation(n_components=self.n_topics, random_state=42)
            return

        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

        academic_stop_words = ['student', 'professor', 'university', 'chapter', 'section', 'example', 'problem', 'solution', 'study', 'learn']
        # Custom stop words only take effect through the `stop_words`
        # parameter; assigning `stop_words_` after construction is a no-op.
        stop_words = sorted(ENGLISH_STOP_WORDS.union(academic_stop_words))
        self.vectorizer = CountVectorizer(max_df=0.85, min_df=3, stop_words=stop_words, ngram_range=(1, 2), max_features=1000)
        dtm = self.vectorizer.fit_transform(documents)
        feature_names = self.vectorizer.get_feature_names_out()

        if GENSIM_AVAILABLE:
            # Tokenize with the vectorizer's own analyzer so the coherence
            # texts contain the same lower-cased uni/bigram tokens as the
            # topics. Texts and dictionary are identical for every grid cell,
            # so they are built once.
            analyzer = self.vectorizer.build_analyzer()
            texts = [analyzer(doc) for doc in documents]
            dictionary = Dictionary(texts)

            self.lda_model = None
            best_coherence = -np.inf
            for alpha in [0.1, 0.5, 1.0]:
                for eta in [0.01, 0.1]:
                    lda = LatentDirichletAllocation(
                        n_components=self.n_topics,
                        learning_method='online',
                        learning_offset=10.,
                        random_state=42,
                        max_iter=15,
                        n_jobs=-1,
                        doc_topic_prior=alpha,
                        topic_word_prior=eta
                    )
                    lda.fit(dtm)
                    # Extract topics manually for CoherenceModel
                    topics = [[feature_names[i] for i in topic.argsort()[:-11:-1]] for topic in lda.components_]
                    try:
                        coherence = CoherenceModel(
                            topics=topics,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v'
                        ).get_coherence()
                    except Exception as e:
                        print(f"Coherence failed for alpha={alpha}, eta={eta}: {str(e)}")
                        coherence = np.nan
                    if np.isnan(coherence):
                        # NaN never compares greater; rank it below any score.
                        coherence = -np.inf
                    # Always keep the first fitted model so that a grid with
                    # no usable coherence still leaves a model behind.
                    if self.lda_model is None or coherence > best_coherence:
                        best_coherence = coherence
                        self.lda_model = lda
        else:
            print("Gensim not available. Using default LDA model.")
            self.lda_model = LatentDirichletAllocation(
                n_components=self.n_topics,
                random_state=42,
                max_iter=15,
                n_jobs=-1
            )
            self.lda_model.fit(dtm)

        # Flattened top-10 terms, topic after topic
        top_indices = self.lda_model.components_.argsort(axis=1)[:, ::-1][:, :10]
        self.topic_terms = [feature_names[i] for row in top_indices for i in row]

    def compute_topic_embeddings(self, documents):
        """Build one embedding per topic from cluster and LDA information.

        Documents are encoded with ``self.bert_model``, reduced with
        ``self.reduce_dimensions`` and clustered with
        ``self.cluster_documents``. For every cluster id the mean document
        embedding (corpus mean for an empty cluster) is blended with the mean
        embedding of the cluster's top-10 LDA terms:
        ``lambda_weight * centroid + (1 - lambda_weight) * term_mean``.
        The result is stored in ``self.topic_embeddings`` as an array with
        one row per topic.

        Args:
            documents: Collection of document strings.
        """
        # Get BERT embeddings and reduce dimensions
        bert_embeddings = self.bert_model.encode(documents)
        reduced_embeddings = self.reduce_dimensions(bert_embeddings)

        # Cluster documents; an array is needed for the boolean masks below
        clusters = np.asarray(self.cluster_documents(reduced_embeddings))

        # Train global LDA as fallback
        self.train_global_lda(documents)

        # Train cluster-specific LDA models
        self.train_lda_per_cluster(documents, clusters)

        # Create unified topic embeddings
        topic_embeddings = []
        for cluster_id in range(self.n_topics):
            # Get cluster centroid
            cluster_mask = (clusters == cluster_id)
            if cluster_mask.any():
                cluster_center = np.mean(bert_embeddings[cluster_mask], axis=0)
            else:
                cluster_center = np.mean(bert_embeddings, axis=0)

            lda = self.cluster_lda_models[cluster_id]
            vectorizer = self.cluster_vectorizers[cluster_id]
            feature_names = vectorizer.get_feature_names_out()

            # A cluster-specific model has a single topic (row 0). A cluster
            # that fell back to the shared global model must read its own
            # row, otherwise every such cluster gets global topic 0.
            if lda is self.global_lda_model:
                topic_row = lda.components_[cluster_id]
            else:
                topic_row = lda.components_[0]
            top_words = [feature_names[i] for i in topic_row.argsort()[::-1][:10]]

            # Encode top words
            word_embeddings = self.bert_model.encode(top_words)
            word_avg = np.mean(word_embeddings, axis=0)

            # Combine embeddings
            combined_embedding = (self.lambda_weight * cluster_center +
                                  (1 - self.lambda_weight) * word_avg)
            topic_embeddings.append(combined_embedding)

        self.topic_embeddings = np.array(topic_embeddings)

    def evaluate_lda_model(self, documents):
        """Train the LDA baseline and return its topics.

        Args:
            documents: Collection of document strings passed to ``train_lda``.

        Returns:
            A list with the top-10 terms of every LDA topic, or an empty list
            when ``train_lda`` left an unfitted model behind (which it does
            for an empty corpus).
        """
        # Train LDA using existing method
        self.train_lda(documents)

        # An unfitted model has no components_ (and the vectorizer may not
        # exist either); report "no topics" instead of crashing.
        components = getattr(self.lda_model, 'components_', None)
        if components is None:
            print("Warning: LDA model is not fitted. Returning no topics.")
            return []

        # Extract topics from trained model
        feature_names = self.vectorizer.get_feature_names_out()
        return [[feature_names[i] for i in row.argsort()[::-1][:10]] for row in components]

    def run_evaluation(self, hybrid_model):
        """Compare the hybrid model against the LDA-only and BERT-only baselines.

        Args:
            hybrid_model: The hybrid topic model to evaluate; instances of
                ``EnhancedHybridTopicModel`` use the enhanced extraction.

        Returns:
            Dict keyed by ``"Hybrid (Ours)"``, ``"LDA Only"`` and
            ``"BERT Only"``; each value holds the model's ``"topics"`` and
            the ``"metrics"`` from ``_calculate_topic_quality``.
        """
        # Pick the topic extractor matching the hybrid model flavour.
        if isinstance(hybrid_model, EnhancedHybridTopicModel):
            extract_topics = self.evaluate_enhanced_hybrid_model
        else:
            extract_topics = self.evaluate_hybrid_model

        topics_by_model = (
            ("Hybrid (Ours)", extract_topics(hybrid_model)),
            ("LDA Only", self.evaluate_lda_model(self.documents)),
            ("BERT Only", self.evaluate_bert_model(self.documents)),
        )

        return {
            label: {"topics": topics, "metrics": self._calculate_topic_quality(topics)}
            for label, topics in topics_by_model
        }

    def select_best_model(self, results):
        """Robust model selection with comprehensive error handling"""
        best_model = None
        best_score = -1
        best_topics = None

        # Default weights
        weights = {
            "coherence": 0.5,
            "distinctiveness": 0.3,
            "relevance": 0.2
        }

        for name, data in results.items():
            # Skip models with missing data
            if data is None or "metrics" not in data or data["metrics"] is None:
                print(f"Warning: Missing metrics for {name}. Skipping.")
                continue

            metrics = data["metrics"]
            if metrics is None:
                print(f"Warning: Metrics are None for {name}. Skipping.")
                continue

            # Calculate score with fallbacks
            try:
                score = 0
                for metric, weight in weights.items():
                    value = metrics.get(metric, 0)
                    score += weight * value
            except Exception as e:
                print(f"Score calculation failed for {name}: {str(e)}")
                continue

            if score > best_score:
                best_score = score
                best_model = name
                best_topics = data.get("topics", [])

        if best_model is None:
            print("Error: No valid models found. Using first model as fallback.")
            first_model = next(iter(results.keys()))
            return first_model, results[first_model].get("topics", [])

        print(f"Selected best model: {best_model} (Score: {best_score:.3f})")
        return best_model, best_topics

    def visualize_results(self, results, filename="topic_model_comparison.png"):
        """Save a grouped bar chart of the three quality metrics per model.

        Args:
            results: Mapping of model name to a dict containing ``"metrics"``.
                Entries that are ``None`` or lack metrics are skipped with a
                warning.
            filename: Path the figure is written to.

        Returns:
            The matplotlib figure (already closed), or ``None`` when
            ``results`` is invalid or contains no usable model.
        """
        import matplotlib.pyplot as plt
        import numpy as np

        # Validate input results
        if not results or not isinstance(results, dict):
            print("Error: Invalid results format. Cannot visualize.")
            return

        metrics = ["coherence", "distinctiveness", "relevance"]
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

        # Collect the metric values of every usable model
        valid_models = []
        metric_values = {model_name: {} for model_name in results}
        for model_name, model_data in results.items():
            if model_data is None:
                print(f"Warning: Missing data for {model_name}. Skipping.")
                continue
            if "metrics" not in model_data:
                print(f"Warning: Missing metrics for {model_name}. Skipping.")
                continue
            model_metrics = model_data["metrics"]
            if model_metrics is None:
                print(f"Warning: Metrics are None for {model_name}. Skipping.")
                continue

            valid_models.append(model_name)
            # Missing metrics are plotted as 0
            for metric in metrics:
                if model_metrics:
                    metric_values[model_name][metric] = model_metrics.get(metric, 0)
                else:
                    metric_values[model_name][metric] = 0

        if not valid_models:
            print("Error: No valid models with metrics to visualize.")
            return

        # One group of bars per model, one bar per metric
        fig, ax = plt.subplots(figsize=(12, 6))
        bar_width = 0.25
        positions = np.arange(len(valid_models))

        for offset, (metric, color) in enumerate(zip(metrics, colors)):
            heights = [metric_values[m].get(metric, 0) for m in valid_models]
            ax.bar(positions + offset * bar_width, heights, bar_width,
                   label=metric.capitalize(), color=color)

        ax.set_xlabel('Models')
        ax.set_ylabel('Scores')
        ax.set_title('Topic Model Quality Metrics')
        # Centre the tick under the middle bar of each group
        ax.set_xticks(positions + bar_width)
        ax.set_xticklabels(valid_models)
        ax.legend()
        ax.grid(True, axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close()

        return fig

    def comprehensive_evaluation(self, models_dict):
        """Evaluate several models and produce a comparison table and report.

        Args:
            models_dict: Mapping ``{'Model Name': model_instance}``. The names
                ``"LDA Only"`` and ``"BERT Only"`` select the built-in
                baselines; any other entry is evaluated as a hybrid model.

        Returns:
            ``(comparison_df, report)``: a DataFrame sorted by
            ``"Overall Score"`` (descending) and the textual report from
            ``generate_comparison_report``. Comparison plots are written as a
            side effect through ``visualize_comparison``.
        """
        results = {}
        comparison_rows = []

        for model_name, model in models_dict.items():
            print(f"\nEvaluating {model_name}...")

            # Get topics
            if model_name == "LDA Only":
                topics = self.evaluate_lda_model(self.documents)
            elif model_name == "BERT Only":
                topics = self.evaluate_bert_model(self.documents)
            elif isinstance(model, EnhancedHybridTopicModel):
                topics = self.evaluate_enhanced_hybrid_model(model)
            else:
                topics = self.evaluate_hybrid_model(model)

            # Calculate metrics
            metrics = self._calculate_topic_quality(topics)
            results[model_name] = {"topics": topics, "metrics": metrics}

            # Weighted overall score: 50% coherence, 30% distinctiveness, 20% relevance
            overall = 0.5 * metrics["coherence"] + 0.3 * metrics["distinctiveness"] + 0.2 * metrics[
                "relevance"]
            comparison_rows.append({
                "Model": model_name,
                "Coherence": metrics["coherence"],
                "Distinctiveness": metrics["distinctiveness"],
                "Relevance": metrics["relevance"],
                "Overall Score": overall
            })

            # Print the first five terms of the first three topics
            print(f"{model_name} Topics (Sample):")
            for number, topic in enumerate(topics[:3], start=1):
                print(f"  Topic {number}: {', '.join(topic[:5])}...")

        # Create comparison dataframe, best model first
        comparison_df = pd.DataFrame(comparison_rows).sort_values("Overall Score", ascending=False)

        # Visualize comparison
        self.visualize_comparison(comparison_df)

        # Generate detailed report
        report = self.generate_comparison_report(comparison_df, results)

        return comparison_df, report

    def visualize_comparison(self, comparison_df):
        """Write the comparison plots for an evaluated set of models.

        Saves ``model_metrics_comparison.png`` (one bar chart per metric) and
        ``topic_quality_scatter.png`` (coherence vs. distinctiveness), and
        delegates a radar chart to ``self.visualizer.plot_radar_chart``.

        Args:
            comparison_df: DataFrame with the columns ``Model``,
                ``Coherence``, ``Distinctiveness``, ``Relevance`` and
                ``Overall Score``.
        """
        # Metrics comparison: 2x2 grid, each panel sorted by its own metric
        plt.figure(figsize=(14, 8))
        panel_metrics = ("Coherence", "Distinctiveness", "Relevance", "Overall Score")
        for position, metric in enumerate(panel_metrics, start=1):
            plt.subplot(2, 2, position)
            ranked = comparison_df.sort_values(metric, ascending=False)
            sns.barplot(x="Model", y=metric, data=ranked, palette="viridis")
            plt.title(f"{metric} Comparison")
            plt.xticks(rotation=15)
            plt.tight_layout()

        plt.savefig("model_metrics_comparison.png", dpi=300)
        plt.close()

        # Radar chart
        self.visualizer.plot_radar_chart(comparison_df)

        # Topic quality scatter plot
        plt.figure(figsize=(10, 6))
        scatter_options = dict(
            x="Coherence",
            y="Distinctiveness",
            size="Overall Score",
            hue="Model",
            data=comparison_df,
            s=200,
            alpha=0.8,
        )
        sns.scatterplot(**scatter_options)
        plt.title("Topic Quality Comparison")
        plt.grid(True, alpha=0.3)
        plt.savefig("topic_quality_scatter.png", dpi=300)
        plt.close()

    def generate_comparison_report(self, comparison_df, results):
        """Generate detailed textual report of model comparison"""
        report_lines = [
            "=" * 70,
            "TOPIC MODELING PERFORMANCE COMPARISON REPORT",
            "=" * 70,
            f"Evaluated on {len(self.documents)} documents",
            f"Number of topics: {self.n_topics}",
            "-" * 70,
            "Overall Ranking:"
        ]

        # Ranking
        for i, row in comparison_df.iterrows():
            report_lines.append(f"{i + 1}. {row['Model']}: {row['Overall Score']:.3f}")

        # Detailed comparison
        report_lines.extend([
            "\n" + "-" * 70,
            "Detailed Metrics:",
            "{:<20} {:<12} {:<15} {:<12} {:<12}".format(
                "Model", "Coherence", "Distinctiveness", "Relevance", "Overall"
            )
        ])

        for _, row in comparison_df.iterrows():
            report_lines.append("{:<20} {:<12.3f} {:<15.3f} {:<12.3f} {:<12.3f}".format(
                row["Model"], row["Coherence"], row["Distinctiveness"],
                row["Relevance"], row["Overall Score"]
            ))

        # Performance insights
        best_model = comparison_df.iloc[0]["Model"]
        report_lines.extend([
            "\n" + "-" * 70,
            "Performance Insights:",
            f"- Best performing model: {best_model}",
            f"- Coherence range: {comparison_df['Coherence'].min():.3f} - {comparison_df['Coherence'].max():.3f}",
            f"- Distinctiveness range: {comparison_df['Distinctiveness'].min():.3f} - {comparison_df['Distinctiveness'].max():.3f}",
            f"- Relevance range: {comparison_df['Relevance'].min():.3f} - {comparison_df['Relevance'].max():.3f}",
            "-" * 70
        ])

        # Recommendations
        report_lines.extend([
            "\nRecommendations:",
            f"- For coherence-focused applications: Use {comparison_df.sort_values('Coherence', ascending=False).iloc[0]['Model']}",
            f"- For diverse topics: Use {comparison_df.sort_values('Distinctiveness', ascending=False).iloc[0]['Model']}",
            f"- For relevant terms: Use {comparison_df.sort_values('Relevance', ascending=False).iloc[0]['Model']}",
            f"- Overall best model: {best_model}",
            "=" * 70
        ])

        return "\n".join(report_lines)


# =============================
# 3. DYNAMIC STUDENT CLUSTERING
# =============================

class StudentClusterer:
    """Gaussian-mixture clusterer that picks the cluster count by BIC.

    Attributes set by the instance:
        max_clusters: Largest number of mixture components to try.
        optimal_k: Selected number of components (``None`` until optimised).
        gmm: The fitted ``GaussianMixture`` (``None`` until ``fit_gmm`` ran).
    """

    def __init__(self, max_clusters=10):
        self.max_clusters = max_clusters
        self.optimal_k = None
        self.gmm = None

    def validate_features(self, data):
        """Return ``data`` without rows that contain missing values.

        pandas objects are filtered with ``dropna``; floating-point NumPy
        arrays lose every row containing NaN; anything else is returned
        unchanged.
        """
        if hasattr(data, "dropna"):
            return data.dropna()
        if isinstance(data, np.ndarray) and data.size and np.issubdtype(data.dtype, np.floating):
            keep = ~np.isnan(data.reshape(len(data), -1)).any(axis=1)
            return data[keep]
        return data

    def optimize_cluster_count(self, data):
        """Choose the number of components that minimises the BIC.

        Args:
            data: Feature matrix; rows with missing values are dropped first.

        Returns:
            ``(optimal_k, bic_scores)`` where ``bic_scores[i]`` belongs to
            ``k = i + 1``.

        Raises:
            ValueError: If no sample (or no candidate cluster count) remains.
        """
        data = self.validate_features(data)

        # A mixture cannot have more components than samples, so cap the
        # sweep instead of letting GaussianMixture.fit raise halfway through.
        max_k = min(self.max_clusters, len(data))
        if max_k < 1:
            raise ValueError("No valid samples available for cluster count optimisation.")

        bic_scores = []
        for k in tqdm(range(1, max_k + 1), desc="Computing BIC"):
            gmm = GaussianMixture(n_components=k, random_state=42)
            gmm.fit(data)
            bic_scores.append(gmm.bic(data))

        # Find optimal k (min BIC); +1 because k starts at 1. Stored as a
        # plain int rather than a NumPy integer.
        self.optimal_k = int(np.argmin(bic_scores)) + 1
        return self.optimal_k, bic_scores

    def fit_gmm(self, data):
        """Fit a GMM with the optimal cluster count and return the labels.

        The same missing-value filtering as in ``optimize_cluster_count`` is
        applied, so the returned labels refer to the rows that remain after
        ``validate_features``.
        """
        data = self.validate_features(data)
        if self.optimal_k is None:
            self.optimize_cluster_count(data)

        self.gmm = GaussianMixture(n_components=self.optimal_k, random_state=42)
        self.gmm.fit(data)
        return self.gmm.predict(data)

    def plot_bic_curve(self, bic_scores, filename='bic_optimization.png'):
        """Save the BIC curve with the minimum marked.

        Args:
            bic_scores: BIC values for ``k = 1 .. len(bic_scores)``.
            filename: Path the figure is written to.
        """
        plt.figure(figsize=(10, 6))
        k_range = range(1, len(bic_scores) + 1)
        plt.plot(k_range, bic_scores, 'bo-', label='BIC')

        # Mark optimal value
        optimal_k = np.argmin(bic_scores) + 1
        min_bic = min(bic_scores)
        plt.plot(optimal_k, min_bic, 'ro', markersize=8,
                 label=f'Optimal k={optimal_k}')

        plt.xlabel('Number of Clusters')
        plt.ylabel('BIC Score')
        plt.title('BIC for Gaussian Mixture Model')
        plt.legend()
        plt.grid(True)
        plt.savefig(filename, dpi=300)
        plt.close()


class EnhancedStudentClusterer(StudentClusterer):
    """Clusterer that selects k by silhouette score and profiles the clusters.

    For every candidate k it records the fitted mixture's ``lower_bound_``
    and a set of cluster-validation metrics, and it can summarise labelled
    student rows into named performance profiles.

    Attributes:
        cluster_metrics: Validation metrics per candidate k, filled by
            ``optimize_cluster_count`` as a list ordered by k (starting at 2).
        log_likelihoods: ``lower_bound_`` of the mixture fitted for each
            candidate k, in the same order.
    """

    def __init__(self, max_clusters=10):
        super().__init__(max_clusters)
        self.cluster_metrics = {}
        self.log_likelihoods = []

    def validate_clusters(self, data, labels):
        """Return silhouette, Calinski-Harabasz and Davies-Bouldin scores."""
        return {
            "silhouette": silhouette_score(data, labels),
            "calinski_harabasz": calinski_harabasz_score(data, labels),
            "davies_bouldin": davies_bouldin_score(data, labels)
        }

    def optimize_cluster_count(self, data):
        """Pick the component count with the highest silhouette score.

        Rows with missing values are dropped first. Candidates run from 2 to
        ``max_clusters``, capped at ``n_samples - 1`` because the silhouette
        score is only defined for ``2 <= n_labels <= n_samples - 1``. A
        candidate whose fitted mixture assigns every row to one component
        gets NaN metrics and can never be selected.

        Args:
            data: Feature table (anything with ``dropna`` and ``len``).

        Returns:
            ``(optimal_k, silhouette_scores)`` where ``silhouette_scores[i]``
            belongs to ``k = i + 2``. ``optimal_k`` falls back to 3 when no
            candidate produced a usable score.

        Raises:
            ValueError: If ``max_clusters >= 2`` but fewer than three complete
                rows remain, so no candidate can be scored.
        """
        data = self.validate_features(data)
        n_samples = len(data)
        largest_k = min(self.max_clusters, n_samples - 1)
        if self.max_clusters >= 2 and largest_k < 2:
            raise ValueError(
                "Silhouette-based selection needs at least 3 complete rows"
            )

        best_score = -1
        best_k = 3
        silhouette_scores = []
        cluster_metrics = []
        self.log_likelihoods = []  # Reset log-likelihoods

        for k in range(2, largest_k + 1):
            # Configure GMM with convergence parameters
            gmm = GaussianMixture(
                n_components=k,
                random_state=42,
                max_iter=100,
                tol=1e-4,
                n_init=3
            )
            gmm.fit(data)
            labels = gmm.predict(data)

            # Recorded for every k so list positions stay aligned with k.
            self.log_likelihoods.append(gmm.lower_bound_)

            if len(np.unique(labels)) < 2:
                # Validation metrics are undefined for a single label; keep a
                # placeholder instead of aborting the whole search.
                metrics = {
                    "silhouette": float("nan"),
                    "calinski_harabasz": float("nan"),
                    "davies_bouldin": float("nan")
                }
            else:
                metrics = self.validate_clusters(data, labels)

            # Reuse the silhouette already computed by validate_clusters.
            score = metrics["silhouette"]
            silhouette_scores.append(score)
            cluster_metrics.append(metrics)

            # A NaN score compares False here and is therefore skipped.
            if score > best_score:
                best_score = score
                best_k = k

        self.optimal_k = best_k
        self.cluster_metrics = cluster_metrics
        return self.optimal_k, silhouette_scores

    def interpret_clusters(self, score_matrix):
        """Summarise each cluster and assign it a named performance profile.

        Args:
            score_matrix: DataFrame with one row per student and the columns
                ``cluster``, ``overall_performance``, ``theory_avg``,
                ``practical_avg``, ``variance``, ``max_min_gap`` and
                ``weak_subject_count``.

        Returns:
            DataFrame with one row per cluster (sorted by cluster id) holding
            the cluster size, the column means and ``profile_type``.
        """
        cluster_profiles = []
        global_variance = score_matrix['variance'].mean()
        global_gap = score_matrix['max_min_gap'].mean()

        for cluster_id in sorted(score_matrix['cluster'].unique()):
            cluster_data = score_matrix[score_matrix['cluster'] == cluster_id]

            profile = {
                'cluster': cluster_id,
                'size': len(cluster_data),
                'overall_avg': cluster_data['overall_performance'].mean(),
                'theory_avg': cluster_data['theory_avg'].mean(),
                'practical_avg': cluster_data['practical_avg'].mean(),
                'variance_avg': cluster_data['variance'].mean(),
                'max_min_gap': cluster_data['max_min_gap'].mean(),
                'weak_subject_count': cluster_data['weak_subject_count'].mean(),
                'profile_type': ""
            }

            theory_practice_gap = abs(profile['theory_avg'] - profile['practical_avg'])
            # Ratios relative to the whole cohort; a zero cohort mean yields
            # a ratio of 0 instead of a division-by-zero NaN/inf.
            variance_ratio = (profile['variance_avg'] / global_variance
                              if global_variance else 0.0)
            gap_ratio = (profile['max_min_gap'] / global_gap
                         if global_gap else 0.0)

            # Rules are checked in order; the first match wins.
            if profile['overall_avg'] > 75:
                profile['profile_type'] = "High Performers"
            elif profile['overall_avg'] < 55:
                profile['profile_type'] = "Low Performers"
            elif theory_practice_gap > 15 or (gap_ratio > 1.5 and theory_practice_gap > 10):
                if profile['theory_avg'] > profile['practical_avg']:
                    profile['profile_type'] = "Theory-Focused"
                else:
                    profile['profile_type'] = "Practice-Focused"
            elif variance_ratio > 1.5 or profile['variance_avg'] > 80:
                profile['profile_type'] = "Inconsistent Learners"
            else:
                profile['profile_type'] = "Balanced Learners"

            cluster_profiles.append(profile)

        return pd.DataFrame(cluster_profiles)

    def plot_cluster_profiles(self, cluster_profile_df):
        """Save a theory-vs-practical scatter plot with one marker per cluster.

        Marker area scales with cluster size. The figure is written to
        ``performance_profiles.png``.
        """
        plt.figure(figsize=(14, 8))

        # Plot theory vs practical skills
        for _, row in cluster_profile_df.iterrows():
            plt.scatter(
                row['theory_avg'],
                row['practical_avg'],
                s=row['size'] * 10,
                label=f"{row['profile_type']} (n={row['size']})"
            )

        # Add reference lines
        max_val = max(cluster_profile_df[['theory_avg', 'practical_avg']].max().max(), 100)
        plt.plot([0, max_val], [0, max_val], 'k--', alpha=0.3)
        plt.plot([50, max_val], [50, 50], 'r:', alpha=0.3)  # Passing threshold
        plt.plot([50, 50], [50, max_val], 'r:', alpha=0.3)

        plt.xlabel('Theory Skills (Avg Score)')
        plt.ylabel('Practical Skills (Avg Score)')
        plt.title('Student Performance Profiles')
        plt.legend()
        plt.grid(True)
        plt.savefig('performance_profiles.png', dpi=300)
        plt.close()

    def plot_em_convergence(self, filename='em_convergence.png'):
        """Save a plot of the recorded lower bound for each candidate k.

        Does nothing when no values have been recorded. The selected k is
        marked only when it falls inside the recorded range.
        """
        if not self.log_likelihoods:
            return

        plt.figure(figsize=(10, 6))
        k_values = range(2, len(self.log_likelihoods) + 2)
        plt.plot(k_values, self.log_likelihoods, 'go-')

        # Mark optimal cluster count (list position i corresponds to k=i+2).
        if self.optimal_k is not None:
            optimal_idx = self.optimal_k - 2
            if 0 <= optimal_idx < len(self.log_likelihoods):
                plt.plot(self.optimal_k, self.log_likelihoods[optimal_idx], 'ro',
                         markersize=8, label=f'Optimal k={self.optimal_k}')
                plt.legend()

        plt.xlabel('Number of Clusters')
        plt.ylabel('Log-Likelihood (ELBO)')
        plt.title('EM Algorithm Convergence')
        plt.grid(True)
        plt.savefig(filename, dpi=300)
        plt.close()


# =========================
# 4. PATHWAY OPTIMIZATION
# =========================

class PathwayGenerator:
    """
    Creates personalized learning pathways based on:
    - Performance clusters
    - Vygotsky's Zone of Proximal Development (ZPD)
    - Time constraints (T_max = 12 hrs/week)

    Attributes:
        threshold: Scores strictly below this value count as weak areas.
        max_hours: Weekly study-time budget in hours.
        hours_per_resource: Hours charged for each selected course.
    """

    def __init__(self, threshold=60, max_hours=12, hours_per_resource=2):
        self.threshold = threshold  # Performance threshold
        self.max_hours = max_hours  # Weekly time constraint
        self.hours_per_resource = hours_per_resource  # Cost of one course slot

    def identify_weak_areas(self, scores):
        """Return the positions in ``scores`` whose value is below ``threshold``."""
        return [i for i, score in enumerate(scores) if score < self.threshold]

    def calculate_zpd_level(self, current_score, cluster_mean):
        """
        Compute resource difficulty level using Vygotsky's ZPD:
        L_resource = S_current + 0.4 * (μ_cluster - S_current)

        The result is always at least 5 points above ``current_score`` and is
        capped at 100 in every case. When the cluster mean is not above the
        current score, the 5-point step is used on its own.
        """
        if cluster_mean <= current_score:
            target = current_score + 5
        else:
            zpd = current_score + 0.4 * (cluster_mean - current_score)
            target = max(current_score + 5, zpd)
        # Clamp last so the 5-point floor cannot push the level past 100.
        return min(100, target)

    def optimize_pathway(self, weak_areas, course_scores, cluster_means):
        """Choose which weak courses to study within the weekly time budget.

        Solves a 0/1 knapsack by dynamic programming: each weak course costs
        ``hours_per_resource`` hours and is worth its normalised gap to a
        perfect score, ``(100 - score) / 100``.

        Args:
            weak_areas: Course positions to consider.
            course_scores: Current scores, indexable by course position.
            cluster_means: Cluster mean scores, indexable by course position.

        Returns:
            A list of dicts with the keys ``course_index``, ``zpd_level``,
            ``study_hours`` and ``resources``, in backtracking order (later
            entries of ``weak_areas`` first). Empty when ``weak_areas`` is
            empty.
        """
        if not weak_areas:
            return []

        max_score = 100
        utility = [(max_score - course_scores[i]) / max_score for i in weak_areas]
        weights = [self.hours_per_resource] * len(weak_areas)
        # The DP table is indexed by whole hours.
        capacity = int(self.max_hours)

        # dp[i][w]: best total utility using the first i courses within w hours.
        n = len(utility)
        dp = [[0] * (capacity + 1) for _ in range(n + 1)]

        for i in range(1, n + 1):
            cost = weights[i - 1]
            for w in range(1, capacity + 1):
                skip = dp[i - 1][w]
                if cost <= w:
                    dp[i][w] = max(skip, dp[i - 1][w - cost] + utility[i - 1])
                else:
                    dp[i][w] = skip

        # Backtrack: a changed value between rows means course i was taken.
        selected = []
        w = capacity
        for i in range(n, 0, -1):
            if dp[i][w] != dp[i - 1][w]:
                selected.append(weak_areas[i - 1])
                w -= weights[i - 1]

        # Generate pathway with ZPD-level resources
        pathway = []
        for area in selected:
            zpd_level = self.calculate_zpd_level(
                course_scores[area],
                cluster_means[area]
            )
            pathway.append({
                'course_index': area,
                'zpd_level': round(zpd_level, 1),
                'study_hours': self.hours_per_resource,
                'resources': self.select_resources(zpd_level)
            })

        return pathway

    def select_resources(self, zpd_level):
        """Return three simulated resource labels for the level's 10-point bracket.

        In production this would query a resource database with a difficulty
        filter. NOTE: a level of exactly 100 falls into the bracket 100-109.
        """
        difficulty_bracket = int(zpd_level // 10) * 10  # Group by 10-point brackets
        bracket_label = f"{difficulty_bracket}-{difficulty_bracket + 9}"

        return [
            f"Video Lecture (Level: {bracket_label})",
            f"Practice Problems (Level: {bracket_label})",
            f"Interactive Simulation (Level: {bracket_label})"
        ]


# =======================
# 5. BLOCKCHAIN INTEGRATION
# =======================

class BlockchainSimulator:
    """
    In-memory stand-in for a Hyperledger Fabric ledger.

    Pathway content is reduced to a simulated IPFS content identifier, and a
    record holding that identifier is appended to ``chain`` (every record, in
    insertion order) and stored in ``records`` (latest record per student).
    """

    def __init__(self):
        self.chain = []
        self.records = {}

    def store_pathway(self, student_id, pathway_data):
        """Create, store and return a timestamped record for a pathway.

        The transaction hash is the SHA-256 of the student id followed by the
        ISO timestamp. A later call for the same student replaces that
        student's entry in ``records`` but keeps the older one in ``chain``.
        """
        content_id = self._ipfs_store(pathway_data)
        stamped_at = datetime.now().isoformat()
        tx_source = f"{student_id}{stamped_at}".encode()

        entry = {'student_id': student_id, 'ipfs_cid': content_id}
        entry['timestamp'] = stamped_at
        entry['tx_hash'] = hashlib.sha256(tx_source).hexdigest()

        self.records[student_id] = entry
        self.chain.append(entry)
        return entry

    def _ipfs_store(self, data):
        """Return a simulated content identifier for JSON-serialisable ``data``.

        The identifier is ``Qm`` followed by the first 46 hex characters of
        the SHA-256 digest of the JSON encoding.
        """
        payload = json.dumps(data).encode()
        digest = hashlib.sha256(payload).hexdigest()
        return "Qm" + digest[:46]

    def verify_record(self, student_id):
        """Return the latest stored record for ``student_id``, or ``None``."""
        return self.records.get(student_id, None)


# ======================
# 6. VISUALIZATION TOOLS
# ======================

class ResultVisualizer:
    """Static helpers that compute result metrics and save result figures.

    Every plotting method writes a PNG file and closes its figure; none of
    them returns a value.
    """

    @staticmethod
    def _gini_coefficient(values):
        """Return the Gini coefficient of ``values``.

        Returns 0.0 for an empty input or when the values sum to zero, where
        the usual formula would divide by zero.
        """
        ordered = np.sort(np.asarray(values, dtype=float))
        n = ordered.size
        total = ordered.sum()
        if n == 0 or total == 0:
            return 0.0
        rank = np.arange(1, n + 1)
        return np.sum((2 * rank - n - 1) * ordered) / (n * total)

    @staticmethod
    def plot_score_comparison(pre_scores, post_scores):
        """Save overlaid density plots of pre- and post-intervention scores.

        Output file: ``score_distribution_comparison.png``.
        """
        plt.figure(figsize=(12, 8))

        # Kernel density estimates; bw_adjust=0.5 narrows the default bandwidth.
        sns.kdeplot(pre_scores, fill=True, label='Pre-Intervention',
                    alpha=0.5, bw_adjust=0.5)
        sns.kdeplot(post_scores, fill=True, label='Post-Intervention',
                    alpha=0.5, bw_adjust=0.5)

        # Fixed y-axis range
        plt.ylim(0, 0.035)

        plt.title('Score Distribution Improvement', fontsize=16)
        plt.xlabel('Scores', fontsize=14)
        plt.ylabel('Density', fontsize=14)
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.savefig('score_distribution_comparison.png', dpi=300, bbox_inches='tight')
        plt.close()

    @staticmethod
    def plot_cluster_performance(cluster_data):
        """Save a pre-vs-post scatter plot coloured by cluster.

        ``cluster_data`` needs the columns ``cluster``, ``pre_score``,
        ``post_score`` and ``profile_type``; the legend uses the first
        ``profile_type`` found in each cluster. Output file:
        ``cluster_performance.png``.
        """
        plt.figure(figsize=(14, 8))

        clusters = sorted(cluster_data['cluster'].unique())
        colors = plt.cm.viridis(np.linspace(0, 1, len(clusters)))

        for i, cluster in enumerate(clusters):
            cluster_df = cluster_data[cluster_data['cluster'] == cluster]
            plt.scatter(
                cluster_df['pre_score'],
                cluster_df['post_score'],
                color=colors[i],
                label=f'Cluster {cluster}: {cluster_df["profile_type"].iloc[0]}',
                alpha=0.7
            )

        # Identity line plus passing-threshold guides at 50
        max_score = max(cluster_data[['pre_score', 'post_score']].max().max(), 100)
        plt.plot([0, max_score], [0, max_score], 'k--', alpha=0.5)
        plt.plot([50, max_score], [50, 50], 'r:', alpha=0.3)  # Passing threshold
        plt.plot([50, 50], [50, max_score], 'r:', alpha=0.3)

        plt.title('Pre-vs-Post Scores by Cluster', fontsize=16)
        plt.xlabel('Pre-Intervention Scores', fontsize=14)
        plt.ylabel('Post-Intervention Scores', fontsize=14)
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.5)
        plt.savefig('cluster_performance.png', dpi=300, bbox_inches='tight')
        plt.close()

    @staticmethod
    def plot_cluster_validation(metrics):
        """Save silhouette and Calinski-Harabasz curves on twin y-axes.

        ``metrics`` is a sequence of dicts with the keys ``silhouette`` and
        ``calinski_harabasz``; entry ``i`` is plotted at ``k = i + 2``.
        Output file: ``cluster_validation.png``.
        """
        fig, ax1 = plt.subplots(figsize=(12, 6))

        # Silhouette and Calinski-Harabasz
        k_values = range(2, len(metrics) + 2)
        sil_scores = [m['silhouette'] for m in metrics]
        cal_scores = [m['calinski_harabasz'] for m in metrics]

        ax1.plot(k_values, sil_scores, 'b-o', label='Silhouette Score')
        ax1.set_xlabel('Number of Clusters')
        ax1.set_ylabel('Silhouette Score', color='b')
        ax1.tick_params('y', colors='b')
        ax1.grid(True, alpha=0.3)

        ax2 = ax1.twinx()
        ax2.plot(k_values, cal_scores, 'r--o', label='Calinski-Harabasz Index')
        ax2.set_ylabel('Calinski-Harabasz Index', color='r')
        ax2.tick_params('y', colors='r')
        fig.tight_layout()
        plt.savefig('cluster_validation.png', dpi=300)
        plt.close()

    @staticmethod
    def calculate_performance_metrics(score_matrix, post_scores, pathways, courses,
                                      cluster_profiles, weak_threshold=60):
        """
        Compute comprehensive performance metrics.

        Args:
            score_matrix: Pre-intervention DataFrame indexed by student, with
                one column per course plus a ``cluster`` column.
            post_scores: Post-intervention DataFrame with the same index and
                course columns, in the same row order.
            pathways: Mapping from student to a list of pathway items, each
                with ``course_index`` and ``study_hours``.
            courses: Course column names; ``course_index`` refers to
                positions in this list.
            cluster_profiles: DataFrame whose ``cluster`` column lists the
                cluster ids to report on.
            weak_threshold: Scores strictly below this value count as weak;
                a pathway course counts as resolved when its post score
                reaches it.

        Returns:
            Dict with the keys ``overall_improvement``,
            ``cluster_improvements``, ``improvements`` (per-student mean
            gain), ``weak_area_resolution``, ``avg_study_hours`` and
            ``gini_improvement``.
        """
        metrics = {}

        # 1. Overall learning gain (element-wise, so row order must match)
        all_pre = score_matrix[courses].values.flatten()
        all_post = post_scores[courses].values.flatten()
        metrics['overall_improvement'] = np.mean(all_post - all_pre)

        # 2. Cluster-specific gains
        cluster_improvements = {}
        for cluster_id in cluster_profiles['cluster']:
            cluster_students = score_matrix[score_matrix['cluster'] == cluster_id].index
            pre_avg = score_matrix.loc[cluster_students, courses].mean().mean()
            post_avg = post_scores.loc[cluster_students, courses].mean().mean()
            cluster_improvements[cluster_id] = {
                'improvement': post_avg - pre_avg,
                'size': len(cluster_students)
            }
        metrics['cluster_improvements'] = cluster_improvements

        # 3. Per-student gains and weak area remediation
        improvements = []
        resolved_weak_areas = 0
        total_weak_areas = 0

        for student in score_matrix.index:
            pre_row = score_matrix.loc[student, courses]
            post_row = post_scores.loc[student, courses]
            improvements.append(post_row.mean() - pre_row.mean())

            total_weak_areas += sum(1 for score in pre_row if score < weak_threshold)

            # Only courses that were put on the student's pathway can count
            # as resolved.
            for item in pathways.get(student) or []:
                course_name = courses[item['course_index']]
                if post_scores.loc[student, course_name] >= weak_threshold:
                    resolved_weak_areas += 1

        metrics['improvements'] = improvements  # Kept for the equity plot
        metrics['weak_area_resolution'] = (
            resolved_weak_areas / total_weak_areas if total_weak_areas > 0 else 0
        )

        # 4. Pathway efficiency (students without a pathway are excluded)
        study_hours = [
            sum(item['study_hours'] for item in pathway)
            for pathway in pathways.values()
            if pathway
        ]
        metrics['avg_study_hours'] = np.mean(study_hours) if study_hours else 0

        # 5. Equity impact (Gini coefficient of improvement)
        metrics['gini_improvement'] = ResultVisualizer._gini_coefficient(improvements)

        return metrics

    @staticmethod
    def plot_improvement_by_cluster(metrics, filename='cluster_improvements.png'):
        """Save a bar chart of mean improvement per cluster, annotated with size.

        Reads ``metrics['cluster_improvements']`` as produced by
        ``calculate_performance_metrics``.
        """
        cluster_df = pd.DataFrame([
            {
                'cluster': cluster_id,
                'improvement': data['improvement'],
                'size': data['size']
            }
            for cluster_id, data in metrics['cluster_improvements'].items()
        ])

        plt.figure(figsize=(12, 6))
        ax = sns.barplot(x='cluster', y='improvement', data=cluster_df,
                         palette='viridis', hue='size', dodge=False)

        # Add size annotations
        for i, row in enumerate(cluster_df.itertuples()):
            ax.text(i, row.improvement + 0.2, f"n={row.size}",
                    ha='center', fontsize=10)

        plt.title('Average Score Improvement by Cluster', fontsize=16)
        plt.xlabel('Cluster', fontsize=14)
        plt.ylabel('Score Improvement', fontsize=14)
        plt.grid(axis='y', alpha=0.3)
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close()

    @staticmethod
    def plot_equity_impact(improvements, filename='equity_impact.png'):
        """Save a Lorenz-style curve of improvements with the Gini annotated.

        Nothing is drawn or written when ``improvements`` is empty.
        """
        if len(improvements) == 0:
            return

        sorted_improvements = np.sort(improvements)
        cumulative = np.cumsum(sorted_improvements)
        # Normalise to shares unless the total is zero.
        cumulative = cumulative / cumulative[-1] if cumulative[-1] != 0 else cumulative

        perfect = np.linspace(0, 1, len(improvements))
        gini = ResultVisualizer._gini_coefficient(sorted_improvements)

        plt.figure(figsize=(10, 6))
        plt.plot(perfect, cumulative, label='Actual Improvement')
        plt.plot(perfect, perfect, 'k--', label='Perfect Equality')

        # Fill area between curves
        plt.fill_between(perfect, perfect, cumulative, alpha=0.1)

        # Add Gini annotation
        plt.annotate(f'Gini: {gini:.3f}', xy=(0.6, 0.3), fontsize=12,
                     bbox=dict(boxstyle='round,pad=0.3', fc='white', alpha=0.8))

        plt.title('Equity of Learning Improvements', fontsize=16)
        plt.xlabel('Percentage of Students', fontsize=14)
        plt.ylabel('Cumulative Improvement Share', fontsize=14)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig(filename, dpi=300)
        plt.close()

    @staticmethod
    def plot_radar_chart(comparison_df, filename="radar_chart.png"):
        """Save a radar chart comparing models on three metrics.

        ``comparison_df`` needs the columns ``Model``, ``Coherence``,
        ``Distinctiveness`` and ``Relevance``. Each metric is min-max scaled
        across models before plotting.
        """
        from math import pi

        # Prepare data
        model_names = comparison_df["Model"].values
        metrics = ["Coherence", "Distinctiveness", "Relevance"]
        values = comparison_df[metrics].values

        # Normalize values to 0-1 scale; the epsilon avoids 0/0 when every
        # model has the same value for a metric.
        normalized = (values - values.min(axis=0)) / (values.max(axis=0) - values.min(axis=0) + 1e-8)

        # One axis per metric; repeat the first angle to close the polygon.
        n_axes = len(metrics)
        angles = [n / float(n_axes) * 2 * pi for n in range(n_axes)]
        angles += angles[:1]

        # Create plot
        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111, polar=True)
        ax.set_theta_offset(pi / 2)
        ax.set_theta_direction(-1)
        plt.xticks(angles[:-1], metrics)

        # Draw ylabels
        ax.set_rlabel_position(0)
        plt.yticks([0.25, 0.5, 0.75], ["0.25", "0.5", "0.75"], color="grey", size=10)
        plt.ylim(0, 1)

        # Plot each model
        colors = plt.cm.viridis(np.linspace(0, 1, len(model_names)))
        for i, model in enumerate(model_names):
            stats = normalized[i].tolist()
            stats += stats[:1]  # Close the polygon
            ax.plot(angles, stats, linewidth=2, linestyle='solid', label=model, color=colors[i])
            ax.fill(angles, stats, alpha=0.1, color=colors[i])

        plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
        plt.title("Topic Model Comparison", size=16, y=1.1)
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.close()


# ========================
# TOPIC MODELING CLASSES
# ========================

class LDAOnlyModel:
    """Wrapper for LDA-only topic modeling.

    Attributes:
        n_topics: Number of LDA topics.
        vectorizer: The fitted ``CountVectorizer`` (``None`` before training).
        lda: The fitted ``LatentDirichletAllocation`` (``None`` before training).
    """

    def __init__(self, n_topics=5):
        self.n_topics = n_topics
        self.vectorizer = None
        self.lda = None

    def train(self, documents):
        """Fit the vectorizer and the LDA model on ``documents``.

        English stop words and a small list of academic filler words are both
        excluded from the vocabulary.
        """
        academic_stop_words = ['student', 'professor', 'university', 'chapter', 'section',
                               'example', 'problem', 'solution', 'study', 'learn']
        # Stop words must be passed to the constructor: ``stop_words_`` is a
        # fitted attribute that fit_transform overwrites, so assigning to it
        # beforehand has no effect on the vocabulary.
        english_stop_words = CountVectorizer(stop_words='english').get_stop_words()
        combined_stop_words = sorted(set(english_stop_words).union(academic_stop_words))

        self.vectorizer = CountVectorizer(max_df=0.85, min_df=3,
                                          stop_words=combined_stop_words,
                                          ngram_range=(1, 2), max_features=1000)
        dtm = self.vectorizer.fit_transform(documents)
        self.lda = LatentDirichletAllocation(n_components=self.n_topics, random_state=42)
        self.lda.fit(dtm)

    def get_dominant_topic(self, documents):
        """Return the index of the highest-weight topic for each document.

        Trains on ``documents`` first when the model has not been trained yet.
        """
        if self.lda is None:
            self.train(documents)
        dtm = self.vectorizer.transform(documents)
        return np.argmax(self.lda.transform(dtm), axis=1)


class BERTOnlyModel:
    """Topic assignment from sentence embeddings clustered with k-means.

    Attributes:
        n_topics: Number of k-means clusters, used as topics.
        bert_model: Sentence encoder loaded in ``__init__``.
        kmeans: The ``KMeans`` instance used for clustering.
    """

    def __init__(self, n_topics=5):
        encoder_name = 'sentence-transformers/all-MiniLM-L6-v2'
        self.n_topics = n_topics
        self.bert_model = SentenceTransformer(encoder_name)
        self.kmeans = KMeans(n_clusters=n_topics, random_state=42)

    def get_dominant_topic(self, documents):
        """Encode ``documents`` and return one cluster label per document.

        The k-means model is refitted on every call, so labels from separate
        calls are not comparable with each other.
        """
        doc_vectors = self.bert_model.encode(documents)
        topic_ids = self.kmeans.fit_predict(doc_vectors)
        return topic_ids

# =========================
# 7. MAIN EXECUTION PIPELINE
# =========================
def generate_template_file():
    """Write the assessment-template CSV unless the file already exists.

    The CSV has the columns ``course``, ``template`` and ``topic`` with four
    rows per course. Each template contains one ``{}`` placeholder.
    """
    if os.path.exists(TEMPLATES_FILENAME):
        return

    # (template, topic) pairs grouped by course; insertion order is the
    # row order of the resulting file.
    catalogue = {
        "Calculus I": [
            ("Differential calculus problems involving {}", "polynomial functions"),
            ("Integral calculus applications in {}", "trigonometric functions"),
            ("Limits and continuity exercises on {}", "exponential growth"),
            ("Derivative applications for {} problems", "optimization"),
        ],
        "Physics I": [
            ("Kinematics problems in {} dimensions", "mechanical systems"),
            ("Dynamics of systems with {} interactions", "fluid dynamics"),
            ("Thermodynamics applications for {} systems", "electromagnetic fields"),
            ("Electromagnetism principles in {} contexts", "thermal systems"),
        ],
        "Programming Fundamentals": [
            ("Algorithms implementation using {} approach", "sorting algorithms"),
            ("Data structures exercises with {} applications", "tree structures"),
            ("Object-oriented programming concepts for {}", "inheritance patterns"),
            ("Problem-solving techniques with {} paradigm", "recursive solutions"),
        ],
        "Engineering Drawing": [
            ("Orthographic projection of {} objects", "mechanical parts"),
            ("Isometric drawing techniques for {} structures", "architectural elements"),
            ("CAD modeling exercises for {} components", "piping systems"),
            ("Dimensioning standards applied to {} designs", "electrical components"),
        ],
        "Electrical Circuits": [
            ("Analysis of {} circuits using Kirchhoff's laws", "resistive networks"),
            ("AC circuit behavior with {} components", "capacitive circuits"),
            ("Transient response in {} networks", "inductive loads"),
            ("Power distribution systems for {} applications", "filter designs"),
        ],
    }

    rows = [
        (course, template, topic)
        for course, entries in catalogue.items()
        for template, topic in entries
    ]
    frame = pd.DataFrame(rows, columns=["course", "template", "topic"])
    frame.to_csv(TEMPLATES_FILENAME, index=False)
    print(f"Created assessment template file: {TEMPLATES_FILENAME}")


if __name__ == "__main__":
    if IN_COLAB:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        # Download NLTK punkt_tab resource
        nltk.download('punkt_tab')

    print("=" * 60)
    print("SANKOFA PATHWAYS: Personalized Learning System")
    print("Soroti University Implementation (Google Colab)" if IN_COLAB else "Local Execution")
    print("=" * 60)

    # Define courses
    courses = [
        "Calculus I", "Physics I", "Programming Fundamentals",
        "Engineering Drawing", "Electrical Circuits"
    ]

    # Generate template file if needed
    generate_template_file()

    # --------------------
    # 1. Generate or load mock data
    # --------------------
    if os.path.exists(DATASET_FILENAME):
        print(f"\n[1/6] Loading dataset from {DATASET_FILENAME}...")
        raw_df = pd.read_csv(DATASET_FILENAME)
    else:
        print("\n[1/6] Generating synthetic dataset...")
        np.random.seed(42)

        # Load assessment templates
        templates_df = pd.read_csv(TEMPLATES_FILENAME)
        print(f"Loaded assessment templates from {TEMPLATES_FILENAME}")

        # Create 100 student records with multiple assessments
        student_ids = [f"STU{1000 + i}" for i in range(100)]
        data = []
        assessment_types = ['Test 1', 'Test 2', 'Coursework 1', 'Coursework 2', 'Final Exam']

        # Create distinct student groups
        for i, student in enumerate(student_ids):
            # Create 3 distinct performance groups
            if i < 30:  # Low performers
                base_score = np.random.normal(45, 8)
            elif i < 70:  # Medium performers
                base_score = np.random.normal(65, 10)
            else:  # High performers
                base_score = np.random.normal(80, 6)

            for course in courses:
                # Get templates for this course
                course_templates = templates_df[templates_df['course'] == course]

                # Generate 5 assessments per course
                for assessment_idx, assessment_name in enumerate(assessment_types):
                    # Add course-specific variations
                    course_adjustment = np.random.uniform(-5, 5)

                    # Assessment-specific variation
                    if assessment_name == 'Final Exam':
                        assessment_adjust = np.random.uniform(-3, 3)
                    else:
                        assessment_adjust = np.random.uniform(-8, 8)

                    score = base_score + course_adjustment + assessment_adjust
                    score = max(0, min(100, score))

                    # Select random template and topic
                    template_row = course_templates.sample(1).iloc[0]
                    template = template_row['template']
                    topic = template_row['topic']
                    assessment_text = f"{assessment_name}: " + template.format(topic)

                    data.append({
                        'student_id': student,
                        'course': course,
                        'score': score,
                        'assessment_text': assessment_text,
                        'assessment_type': assessment_name,
                        'assessment_idx': assessment_idx
                    })

        raw_df = pd.DataFrame(data)
        raw_df.to_csv(DATASET_FILENAME, index=False)
        print(f"Saved dataset to {DATASET_FILENAME}")

    # -------------------
    # 2. Preprocess data
    # -------------------
    print("[2/6] Preprocessing data with differential privacy...")
    preprocessor = DataPreprocessor(epsilon=0.85)
    processed_df = preprocessor.preprocess(raw_df)

    # Pivot to student-course matrix
    score_matrix = processed_df.pivot_table(
        index='hashed_id',
        columns='course',
        values='score',
        aggfunc='mean'
    ).fillna(65)  # Fill missing with mean

    # Ensure no missing values remain
    score_matrix = score_matrix.fillna(score_matrix.median())

    # ---------------------------
    # 3. Hybrid topic modeling
    # ---------------------------
    print("[3/6] Performing advanced hybrid topic modeling...")
    documents = processed_df['assessment_text'].unique()

    # Create all models for comprehensive comparison
    models_to_compare = {
        "LDA Only": LDAOnlyModel(n_topics=5),
        "BERT Only": BERTOnlyModel(n_topics=5),
        "Original Hybrid": HybridTopicModel(lambda_weight=0.85, n_topics=5),
        "Enhanced Hybrid": EnhancedHybridTopicModel(
            lambda_weight=0.85,  # More weight to BERT semantics
            n_topics=7,  # Slightly more topics
            fine_tune_steps=200  # More fine-tuning
        )
    }

    # Train models
    print("Training models for comparison...")
    for name, model in models_to_compare.items():
        if name == "LDA Only" or name == "BERT Only":
            continue  # Will be handled in evaluation

        print(f"- Training {name}...")
        if FINE_TUNE_BERT and IN_COLAB:
            model.fine_tune_bert(documents)

        if name == "Original Hybrid":
            model.train_lda(documents)
            # Ensure topic embeddings are computed
            if not hasattr(model, 'topic_embeddings') or model.topic_embeddings is None:
                model.compute_topic_embeddings(documents)
        elif name == "Enhanced Hybrid":
            # This handles all training internally
            model.compute_topic_embeddings(documents)

    # Evaluate and compare all models
    if not PRODUCTION_MODE:
        print("\n[3.5/6] Comprehensive model evaluation...")
        evaluator = TopicModelEvaluator(
            documents=processed_df['assessment_text'].tolist(),
            n_topics=5
        )

        # Perform comprehensive evaluation
        comparison_df, report = evaluator.comprehensive_evaluation(models_to_compare)

        # Print and save report
        print("\n" + "=" * 70)
        print("MODEL COMPARISON RESULTS:")
        print("=" * 70)
        print(report)

        # Save detailed report
        with open("model_comparison_report.txt", "w") as f:
            f.write(report)

        print("\nVisualizations saved:")
        print("- model_metrics_comparison.png")
        print("- radar_chart.png")
        print("- topic_quality_scatter.png")
        print("- model_comparison_report.txt")

        # Use the best model for the rest of the pipeline
        best_model_name = comparison_df.iloc[0]["Model"]
        print(f"\nSelected best model for the pipeline: {best_model_name}")
        topic_model = models_to_compare[best_model_name]
    else:
        # In production, default to enhanced hybrid
        topic_model = models_to_compare["Enhanced Hybrid"]
        print("Using Enhanced Hybrid model in production mode")

    # Continue with the selected model
    print("Computing document-topic distributions...")
    processed_df['dominant_topic'] = topic_model.get_dominant_topic(
        processed_df['assessment_text'].tolist()
    )

    # Create topic distribution per student
    topic_dist = pd.crosstab(
        index=processed_df['hashed_id'],
        columns=processed_df['dominant_topic'],
        normalize='index'
    ).add_prefix('topic_')

    # -------------------------
    # 4. Student clustering
    # -------------------------
    print("[4/6] Clustering students...")

    # Initialize visualizer
    visualizer = ResultVisualizer()

    # Prepare feature matrix: scores + topic distribution
    feature_matrix = pd.concat([score_matrix, topic_dist], axis=1)

    # Feature engineering - create all necessary columns
    feature_matrix['theory_avg'] = feature_matrix[['Calculus I', 'Physics I']].mean(axis=1)
    feature_matrix['practical_avg'] = feature_matrix[
        ['Programming Fundamentals', 'Engineering Drawing', 'Electrical Circuits']].mean(axis=1)
    feature_matrix['overall_performance'] = feature_matrix[courses].mean(axis=1)
    feature_matrix['variance'] = feature_matrix[courses].var(axis=1)
    feature_matrix['theory_practical_gap'] = abs(feature_matrix['theory_avg'] - feature_matrix['practical_avg'])
    feature_matrix['max_score'] = feature_matrix[courses].max(axis=1)
    feature_matrix['min_score'] = feature_matrix[courses].min(axis=1)
    feature_matrix['max_min_gap'] = feature_matrix['max_score'] - feature_matrix['min_score']
    feature_matrix['weak_subject_count'] = (feature_matrix[courses] < 60).sum(axis=1)
    feature_matrix['strong_subject_count'] = (feature_matrix[courses] > 80).sum(axis=1)

    # Additional features
    feature_matrix['theory_ratio'] = feature_matrix['theory_avg'] / feature_matrix['overall_performance']
    feature_matrix['practical_ratio'] = feature_matrix['practical_avg'] / feature_matrix['overall_performance']
    feature_matrix['imbalance_score'] = abs(feature_matrix['theory_ratio'] - 0.5)
    feature_matrix['weakness_factor'] = feature_matrix['weak_subject_count'] / len(courses)

    # Remove non-numeric columns for clustering
    clustering_features = feature_matrix.select_dtypes(include=[np.number])

    # Ensure proper alignment and no missing values
    clustering_features = clustering_features.dropna()

    # Create enhanced clusterer
    clusterer = EnhancedStudentClusterer(max_clusters=8)
    optimal_k, silhouette_scores = clusterer.optimize_cluster_count(clustering_features)
    print(f"Optimal cluster count: {optimal_k} (Silhouette optimized)")

    # Fit GMM and assign clusters
    cluster_labels = clusterer.fit_gmm(clustering_features)
    score_matrix['cluster'] = cluster_labels
    feature_matrix['cluster'] = cluster_labels

    # Interpret clusters and add profiles
    cluster_profiles = clusterer.interpret_clusters(feature_matrix)
    profile_mapping = cluster_profiles.set_index('cluster')['profile_type'].to_dict()
    feature_matrix['profile_type'] = feature_matrix['cluster'].map(profile_mapping)
    score_matrix['profile_type'] = feature_matrix['profile_type']

    # Print cluster statistics
    print("\nCluster Statistics:")
    print(feature_matrix.groupby('cluster').agg({
        'overall_performance': ['mean', 'std'],
        'theory_avg': 'mean',
        'practical_avg': 'mean',
        'variance': 'mean',
        'weak_subject_count': 'mean'
    }))

    # Print cluster validation metrics
    print("\nCluster Validation Metrics:")
    if optimal_k >= 2 and len(clusterer.cluster_metrics) >= optimal_k - 1:
        metrics_index = optimal_k - 2
        print(f"Silhouette Score: {clusterer.cluster_metrics[metrics_index]['silhouette']:.3f}")
        print(f"Calinski-Harabasz: {clusterer.cluster_metrics[metrics_index]['calinski_harabasz']:.1f}")
        print(f"Davies-Bouldin: {clusterer.cluster_metrics[metrics_index]['davies_bouldin']:.3f}")

    # Visualize clusters
    clusterer.plot_cluster_profiles(cluster_profiles)
    if clusterer.cluster_metrics:
        visualizer.plot_cluster_validation(clusterer.cluster_metrics)

    # ------------------------
    # 5. Pathway generation
    # ------------------------
    print("[5/6] Generating personalized pathways...")
    pathway_gen = PathwayGenerator(threshold=60, max_hours=12)
    blockchain = BlockchainSimulator()

    pathways = {}
    students_with_pathways = 0

    # Create reverse ID mapping
    reverse_mapping = {v: k for k, v in preprocessor.id_mapping.items()}

    for student in tqdm(score_matrix.index, desc="Generating pathways"):
        student_scores = score_matrix.loc[student, courses].values
        cluster_id = score_matrix.loc[student, 'cluster']

        # Use cluster mean from feature matrix
        cluster_mean = feature_matrix[feature_matrix['cluster'] == cluster_id][courses].mean().values

        weak_areas = pathway_gen.identify_weak_areas(student_scores)
        pathway = pathway_gen.optimize_pathway(
            weak_areas, student_scores, cluster_mean
        )

        if pathway:  # Only store if pathway is not empty
            blockchain.store_pathway(student, pathway)
            pathways[student] = pathway
            students_with_pathways += 1

    # Save all pathways to CSV
    pathway_output = []
    for hashed_id, pathway in pathways.items():
        original_id = reverse_mapping.get(hashed_id, hashed_id)
        for item in pathway:
            course_name = courses[item['course_index']]
            pathway_output.append({
                'student_id': original_id,
                'hashed_id': hashed_id,
                'cluster': score_matrix.loc[hashed_id, 'cluster'],
                'profile_type': score_matrix.loc[hashed_id, 'profile_type'],
                'course': course_name,
                'current_score': score_matrix.loc[hashed_id, course_name],
                'zpd_level': item['zpd_level'],
                'study_hours': item['study_hours'],
                'resources': ", ".join(item['resources'])
            })

    pathway_df = pd.DataFrame(pathway_output)
    pathway_df.to_csv("all_student_pathways.csv", index=False)
    print(f"Saved all pathways to all_student_pathways.csv")

    # --------------------
    # 6. Simulate results
    # --------------------
    print("[6/6] Simulating intervention and visualizing results...")

    # Simulate post-intervention scores with cluster-based improvements
    post_scores = score_matrix.copy()
    improvement_factors = {
        "Low Performers": 0.40, #was 0.3, then 0.35, now increased to 0.40
        "Inconsistent Learners": 0.35, #was 0.25, then 0.30, now 0.35
        "Balanced Learners": 0.22, #was 0.2
        "Theory-Focused": 0.25, #was 0.22
        "Practice-Focused": 0.28, #was 0.22
        "High Performers": 0.12 #was 0.15
    }

    # More realistic improvement calculation
    for student in score_matrix.index:
        profile_type = score_matrix.loc[student, 'profile_type']
        improvement = improvement_factors.get(profile_type, 0.2)

        for course in courses:
            current_score = score_matrix.loc[student, course]
            if current_score < 60:  # Focused improvement for weak areas
                post_scores.loc[student, course] = min(100, current_score + (100 - current_score) * improvement)
            else:
                # Smaller improvement for already strong areas
                post_scores.loc[student, course] = min(100, current_score * (1 + improvement / 3))

    for course in courses:
        for cluster_type in improvement_factors:
            cluster_mask = score_matrix['profile_type'] == cluster_type
            current_scores = score_matrix.loc[cluster_mask, course]
            improvement = improvement_factors[cluster_type]
            post_scores.loc[cluster_mask, course] = np.minimum(
                current_scores + (100 - current_scores) * improvement,
                100
            )

    # Visualization
    visualizer = ResultVisualizer()

    # Flatten scores for distribution plot
    all_pre_scores = score_matrix[courses].values.flatten()
    all_post_scores = post_scores[courses].values.flatten()
    visualizer.plot_score_comparison(all_pre_scores, all_post_scores)

    # Prepare cluster performance data
    cluster_perf = score_matrix[['cluster', 'profile_type']].copy()
    cluster_perf['pre_score'] = score_matrix[courses].mean(axis=1)
    cluster_perf['post_score'] = post_scores[courses].mean(axis=1)
    visualizer.plot_cluster_performance(cluster_perf.reset_index())

    # NEW: Calculate comprehensive metrics
    metrics = visualizer.calculate_performance_metrics(
        score_matrix,
        post_scores,
        pathways,
        courses,
        cluster_profiles
    )

    # NEW: Visualize cluster improvements
    visualizer.plot_improvement_by_cluster(metrics)

    # NEW: Plot equity impact
    visualizer.plot_equity_impact(metrics['improvements'])
    print("Saved equity_impact.png")

    # ------------------
    # 7. Output results
    # ------------------
    print("\nRESULTS SUMMARY:")
    print(f"- Students clustered into {optimal_k} performance groups")
    print(f"- Average pre-intervention score: {np.mean(all_pre_scores):.1f}")
    print(f"- Average post-intervention score: {np.mean(all_post_scores):.1f}")
    print(f"- Average improvement: {np.mean(all_post_scores - all_pre_scores):.1f} points")
    print(f"- Personalized pathways generated for {students_with_pathways} students")
    print(f"- Average improvement: {metrics['overall_improvement']:.1f} points")
    print(f"- Weak area resolution rate: {metrics['weak_area_resolution'] * 100:.1f}%")
    print(f"- Average weekly study time: {metrics['avg_study_hours']:.1f} hours")
    print(f"- Improvement equity (Gini): {metrics['gini_improvement']:.3f} (lower=better)")

    print("\nCluster-Specific Improvements:")
    for cluster_id, data in metrics['cluster_improvements'].items():
        profile = cluster_profiles[cluster_profiles['cluster'] == cluster_id]['profile_type'].iloc[0]
        print(f"  - Cluster {cluster_id} ({profile}): +{data['improvement']:.1f} points (n={data['size']})")

    # Find a student with a pathway
    sample_student = next((sid for sid, path in pathways.items() if path), None)

    if sample_student:
        # Get original ID for sample student
        original_id = reverse_mapping.get(sample_student, "Unknown")
        profile_type = score_matrix.loc[sample_student, 'profile_type']

        print(f"\nSample pathway for student {original_id} ({profile_type}):")
        for item in pathways[sample_student]:
            course_name = courses[item['course_index']]
            current_score = score_matrix.loc[sample_student, course_name]
            print(f"  - Course: {course_name} (Current Score: {current_score:.1f})")
            print(f"    ZPD Level: {item['zpd_level']:.1f}")
            print(f"    Study Hours: {item['study_hours']}")
            print(f"    Resources: {item['resources'][0]}")

    print("\nVisualizations saved:")
    print("- bic_optimization.png: Cluster selection using BIC")
    print("- performance_profiles.png: Cluster performance profiles")
    print("- score_distribution_comparison.png: Pre/post score distributions")
    print("- cluster_performance.png: Improvement by cluster")
    print("- cluster_validation.png: Cluster quality metrics")

    print("\nExecution complete. Results saved in Colab environment." if IN_COLAB else "Execution complete.")

Gensim not installed. Topic coherence metrics disabled.


  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


SANKOFA PATHWAYS: Personalized Learning System
Soroti University Implementation (Google Colab)
Created assessment template file: assessment_templates.csv

[1/6] Generating synthetic dataset...
Loaded assessment templates from assessment_templates.csv
Saved dataset to soroti_engineering_dataset.csv
[2/6] Preprocessing data with differential privacy...
[3/6] Performing advanced hybrid topic modeling...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading model: sentence-transformers/all-MiniLM-L6-v2
Loading model: sentence-transformers/all-MiniLM-L6-v2
Training models for comparison...
- Training Original Hybrid...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:


Abort: 