In [1]:
# Environment verification: prints interpreter and key package versions
import sys
import importlib

print("Python executable:", sys.executable)

pkgs = [
    "pip",
    "numpy",
    "pandas",
    "sklearn",
    "sentence_transformers",
    "torch",
    "transformers",
    "joblib",
]

for p in pkgs:
    try:
        m = importlib.import_module(p)
        v = getattr(m, "__version__", "(no __version__)")
        print(f"{p}: {v}")
    except Exception as e:
        print(f"{p}: not installed ({e})")

Python executable: /Users/pablowatfi/repos/topic-pred/.venv/bin/python
pip: 25.3
numpy: 1.26.4
pandas: 2.3.3
sklearn: 1.7.2


  from .autonotebook import tqdm as notebook_tqdm


sentence_transformers: 5.1.2
torch: 2.2.2
transformers: 4.57.1
joblib: 1.5.2


# Content-to-Topic Prediction Model

## Overview
This notebook implements a content-to-topic matching system using:
1. **Sentence Transformers** for multilingual embeddings
2. **Direct Topic Prediction** (predicts actual topics)
3. **Embedding-based Similarity** for semantic matching

## Key Features
- **Content Sampling**: Process a subset of content for faster testing
- **Direct Predictions**: Predict actual topic IDs
- **Embedding-only Scoring**: Uses sentence-transformers embeddings for similarity
- **Multilingual Support**: Works across different languages

In [2]:
import pandas as pd
import numpy as np
import gc
import re
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, precision_score, recall_score
import joblib
from collections import defaultdict, Counter
from typing import List, Dict, Set, Tuple
import warnings
warnings.filterwarnings('ignore')

## Configuration

In [3]:
# ========================
# CONFIGURATION PARAMETERS
# ========================

# Sampling settings
SAMPLE_CONTENT = True  # Set to False to use all content
CONTENT_SAMPLE_SIZE = 20000  # Number of content items to sample

# Model settings
SENTENCE_TRANSFORMER_MODEL = 'paraphrase-multilingual-MiniLM-L12-v2'
BATCH_SIZE = 64

# Paths
ROOT = Path.cwd().parent
DATA_DIR = ROOT / "data"
ARTIFACTS_DIR = ROOT / "artifacts"

print("Configuration:")
print(f"  Sample content: {SAMPLE_CONTENT}")
if SAMPLE_CONTENT:
    print(f"  Sample size: {CONTENT_SAMPLE_SIZE}")
print(f"  Model: {SENTENCE_TRANSFORMER_MODEL}")
print(f"  Batch size: {BATCH_SIZE}")

Configuration:
  Sample content: True
  Sample size: 20000
  Model: paraphrase-multilingual-MiniLM-L12-v2
  Batch size: 64


## Step 1: Load Data

In [4]:
# Load data files
content_path = DATA_DIR / "content.csv"
correlations_path = DATA_DIR / "correlations.csv"
topics_path = DATA_DIR / "topics.csv"

content_df = pd.read_csv(content_path)
topics_df = pd.read_csv(topics_path)
correlations_df = pd.read_csv(correlations_path)

print(f"Content: {content_df.shape}")
print(f"Topics: {topics_df.shape}")
print(f"Correlations: {correlations_df.shape}")

Content: (154047, 8)
Topics: (76972, 9)
Correlations: (61517, 2)


In [5]:
# Preview data
print("\nContent Sample:")
print(content_df.head(3))
print("\nTopics Sample:")
print(topics_df.head(3))
print("\nCorrelations Sample:")
print(correlations_df.head(3))


Content Sample:
               id                                             title  \
0  c_00002381196d  Sumar números de varios dígitos: 48,029+233,930    
1  c_000087304a9e                    Trovare i fattori di un numero   
2  c_0000ad142ddb                           Sumar curvas de demanda   

                                         description   kind text language  \
0  Suma 48,029+233,930 mediante el algoritmo está...  video  NaN       es   
1                    Sal trova i fattori di 120.\n\n  video  NaN       it   
2                  Cómo añadir curvas de demanda\n\n  video  NaN       es   

  copyright_holder license  
0              NaN     NaN  
1              NaN     NaN  
2              NaN     NaN  

Topics Sample:
               id                                   title  \
0  t_00004da3a1b2              Откриването на резисторите   
1  t_000095e03056  Unit 3.3 Enlargements and Similarities   
2  t_00068291e9a4         Entradas e saídas de uma função   

            

##  Process Topics

In [6]:
# Process Topics (All Topics)

# Create combined text for topics
print("Processing all topics...")
topics_df['combined_text'] = (
    topics_df['title'].fillna('') + ' ' +
    topics_df['description'].fillna('')
).str.strip()

print(f"Processed {len(topics_df)} topics")

# Create topic ID to index mapping
topic_id_to_idx = {topic_id: idx for idx, topic_id in enumerate(topics_df['id'])}
topic_idx_to_id = {idx: topic_id for topic_id, idx in topic_id_to_idx.items()}

print("\nTopic examples:")
for i in range(min(3, len(topics_df))):
    print(f"\nTopic {i+1}: {topics_df.iloc[i]['title']}")

Processing all topics...
Processed 76972 topics

Topic examples:

Topic 1: Откриването на резисторите

Topic 2: Unit 3.3 Enlargements and Similarities

Topic 3: Entradas e saídas de uma função


# Some examples

In [7]:
# Parse correlations
correlations_df['content_list'] = correlations_df['content_ids'].str.split()

content_to_topics = defaultdict(list)
for _, row in correlations_df.iterrows():
    topic_id = row['topic_id']
    for content_id in row['content_list']:
        content_to_topics[content_id].append(topic_id)

print(f"Content-topic mappings: {len(content_to_topics)}")

Content-topic mappings: 154047


In [8]:
def _get_until_root_topic(topic_id, df):
    d = topics_df[topics_df["id"]==topic_id]
    df = pd.concat([d,df])
    if str(d.iloc[0]["parent"])[0:2]=="t_":
        df = _get_until_root_topic(d.iloc[0]["parent"], df)
    return df


def content_topic_analyzer(cont_id):
    c = content_df[content_df["id"]==cont_id]
    topics_per_content = content_to_topics[cont_id]
    dataframe_to_concat = []
    for topic_id in topics_per_content:
        d = _get_until_root_topic(topic_id, pd.DataFrame())
        dataframe_to_concat.append(d)
    all_topics_df = pd.concat(dataframe_to_concat).drop_duplicates(subset=['id'], keep='first')
    return c, all_topics_df

In [9]:
# analyze one content
i = "c_00002381196d"
c , tops = content_topic_analyzer(i)
print(c.title.values[0])
print(c.description.values[0])

Sumar números de varios dígitos: 48,029+233,930 
Suma 48,029+233,930 mediante el algoritmo estándar.




In [10]:
tops

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,combined_text
62887,t_d226fda51096,PORTAL ESTUDIANTIL,,484bd2,supplemental,0,es,,False,PORTAL ESTUDIANTIL
6228,t_14ef70a90844,SUMA,,484bd2,supplemental,1,es,t_d226fda51096,False,SUMA
56818,t_bddee107da5b,Nivel 3,,484bd2,supplemental,2,es,t_14ef70a90844,False,Nivel 3
38836,t_81be1094dd83,Suma de números de varios dígitos con reagrupa...,Aprende a sumar números como 259 + 363. Estas ...,484bd2,supplemental,3,es,t_bddee107da5b,True,Suma de números de varios dígitos con reagrupa...
72147,t_f00e771ec807,Khan Academy (Español),,36a98b,source,0,es,,False,Khan Academy (Español)
53102,t_b1bf0868b3ed,Matemáticas,,36a98b,source,1,es,t_f00e771ec807,False,Matemáticas
14394,t_302da6500046,Aritmética,Aprende los conceptos básicos de aritmética: t...,36a98b,source,2,es,t_b1bf0868b3ed,False,Aritmética Aprende los conceptos básicos de ar...
75726,t_fc0ecbe53c46,Sumas y restas,"En este tema, vamos a sumar números naturales....",36a98b,source,3,es,t_302da6500046,False,"Sumas y restas En este tema, vamos a sumar núm..."
62518,t_d0edb1c53d90,Suma de números de varios dígitos con reagrupa...,Aprende a sumar números como 259 + 363. Estas ...,36a98b,source,4,es,t_fc0ecbe53c46,True,Suma de números de varios dígitos con reagrupa...
6223,t_14ed7b950b78,2° grado,"Números, suma, resta y multiplicación",484bd2,supplemental,1,es,t_d226fda51096,False,"2° grado Números, suma, resta y multiplicación"


In [11]:
# several topics has more or less the same description
# in this case "adding natural numbers"
# I think we should group them, to have less target values
# or analize a way to check if the predicted topic is somewhat similar
# maybe analize the cosine distance with a high threshold

In [12]:
i = "c_0c6473c3480d"
c , tops = content_topic_analyzer(i)
print(c.title.values[0])
print(c.description.values[0])

Чертане на показателни финкции
Сал чертае y=-2*3ˣ+5, като използва интерактивното приспособление за чертане
на графики.




In [13]:
tops

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,combined_text
56530,t_bcf10c48c68b,Khan Academy (български език),,000cf7,source,0,bg,,False,Khan Academy (български език)
52564,t_afd8db5e4f44,Математика,,000cf7,source,1,bg,t_bcf10c48c68b,False,Математика
28468,t_5f4cc8e02423,Алгебра (цялото съдържание),"Научи алгебрата — променливи, уравнения, функц...",000cf7,source,2,bg,t_afd8db5e4f44,False,Алгебра (цялото съдържание) Научи алгебрата — ...
67915,t_e2452e21d252,Показателни и логаритмични функции,Тази тема покрива:\n\n- Корени и рационални ст...,000cf7,source,3,bg,t_5f4cc8e02423,False,Показателни и логаритмични функции Тази тема п...
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True,Графики на експоненциални функции (Алгебра 2 н...
12474,t_29e1aab672b5,Алгебра II,Учи с Алгебра 2 - справяй се с по-сложни (и ин...,000cf7,source,2,bg,t_afd8db5e4f44,False,Алгебра II Учи с Алгебра 2 - справяй се с по-с...
36111,t_7897fb86fd0f,Степени и логаритми,"Научи за логаритмите, които са обратното на ст...",000cf7,source,3,bg,t_29e1aab672b5,False,"Степени и логаритми Научи за логаритмите, коит..."
27407,t_5bc5a80f2d71,Графики на експоненциални функции,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_7897fb86fd0f,True,Графики на експоненциални функции Научи повече...
50475,t_a8d3be502f0b,Математика (България),,000cf7,source,1,bg,t_bcf10c48c68b,False,Математика (България)
41468,t_8a9bbb171ed0,11. клас (България),"Учи математика за 11. клас, за задължителна и ...",000cf7,source,2,bg,t_a8d3be502f0b,False,11. клас (България) Учи математика за 11. клас...


## Process Content (with Sampling)

In [14]:
# Sample content if enabled
if SAMPLE_CONTENT and len(content_df) > CONTENT_SAMPLE_SIZE:
    print(f"Sampling {CONTENT_SAMPLE_SIZE} out of {len(content_df)} content items...")
    sampled_indices = np.random.choice(len(content_df), CONTENT_SAMPLE_SIZE, replace=False)
    sampled_indices = sorted(sampled_indices)
    content_df_sampled = content_df.iloc[sampled_indices].reset_index(drop=True)
    print(f"Sampled content shape: {content_df_sampled.shape}")
else:
    print("Using all content (no sampling)")
    content_df_sampled = content_df.copy()
    sampled_indices = np.arange(len(content_df))

# Create combined text
print("\nProcessing content...")
content_df_sampled['combined_text'] = (
    content_df_sampled['title'].fillna('') + ' ' +
    content_df_sampled['description'].fillna('')
).str.strip()

print(f"Processed {len(content_df_sampled)} content items")

# Show examples
print("\nContent examples:")
for i in range(min(3, len(content_df_sampled))):
    print(f"\nContent {i+1}:")
    print(f"  Text: {content_df_sampled.iloc[i]['combined_text'][:100]}...")

Sampling 20000 out of 154047 content items...
Sampled content shape: (20000, 8)

Processing content...
Processed 20000 content items

Content examples:

Content 1:
  Text: Kuhesabu vizio mraba kutafuta kanuni ya eneo Sal anatumia kizio mraba kuona kwa nini kuzidisha pande...

Content 2:
  Text: Translations...

Content 3:
  Text: TI-AIE: Conjecturing and generalising in mathematics: introducing algebra...


## Step 5: Create Content-Topic Mappings

In [15]:
# Parse correlations
correlations_df['content_list'] = correlations_df['content_ids'].str.split()

content_to_topics = defaultdict(list)
for _, row in correlations_df.iterrows():
    topic_id = row['topic_id']
    for content_id in row['content_list']:
        content_to_topics[content_id].append(topic_id)

print(f"Content-topic mappings: {len(content_to_topics)}")

print(f"Total content items with topics: {len(content_to_topics)}")
print(f"Average topics per content: {np.mean([len(v) for v in content_to_topics.values()]):.2f}")

# Count how many sampled content items have topic associations
sampled_with_topics = sum(1 for cid in content_df_sampled['id'] if cid in content_to_topics)
print(f"\nSampled content with topics: {sampled_with_topics}/{len(content_df_sampled)}")

Content-topic mappings: 154047
Total content items with topics: 154047
Average topics per content: 1.82

Sampled content with topics: 20000/20000


## Step 6: Generate Embeddings

In [16]:
# Load multilingual sentence transformer
print("Loading sentence transformer model...")
model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
print("Model loaded successfully")

Loading sentence transformer model...
Model loaded successfully


In [17]:
# Generate embeddings for ALL topics (no sampling - we need all topics for predictions)
print("\nGenerating topic embeddings (all topics)...")

topic_texts = topics_df['combined_text'].tolist()
topic_embeddings = model.encode(
    topic_texts,
    batch_size=BATCH_SIZE,
    show_progress_bar=True
)
print(f"Topic embeddings shape: {topic_embeddings.shape}")


Generating topic embeddings (all topics)...


Batches: 100%|██████████| 1203/1203 [03:31<00:00,  5.68it/s]


Topic embeddings shape: (76972, 384)


In [18]:
# Generate embeddings for sampled content
# we are going to predict only on these content items to see the metrics
print("\nGenerating content embeddings (sampled)...")
content_embeddings = []

for i in range(0, len(content_df_sampled), BATCH_SIZE):
    batch = content_df_sampled['combined_text'].iloc[i:i+BATCH_SIZE].tolist()
    batch_embeddings = model.encode(batch, show_progress_bar=True)
    content_embeddings.extend(batch_embeddings)

    if (i + BATCH_SIZE) % 5000 == 0:
        gc.collect()

content_embeddings = np.array(content_embeddings)
print(f"Content embeddings shape: {content_embeddings.shape}")


Generating content embeddings (sampled)...


Batches: 100%|██████████| 2/2 [00:00<00:00,  2.35it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.24it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.70it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.36it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.47it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.86it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.83it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.57it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.19it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.13it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.85it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.49it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  6.52it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.25it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  7.50it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  7.54it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.30it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.53it/s]
Batches: 1

Content embeddings shape: (20000, 384)


## Step 7:  Prediction Function

This function predicts topics 

In [19]:
def predict_topics(
    content_embedding: np.ndarray,
    topic_embeddings: np.ndarray,
    topic_ids: List[str],
    min_score: float = 0.3,
    top_k: int = 3
) -> List[str]:
    """
    Predict topics using embedding similarity only.

    Returns up to `top_k` topic ids whose cosine similarity with the content
    embedding is >= min_score.
    """
    # Calculate embedding similarities
    embedding_similarities = cosine_similarity(
        content_embedding.reshape(1, -1),
        topic_embeddings
    )[0]

    # Use embeddings-only scores
    hybrid_scores = embedding_similarities

    # Get top-k topics above threshold
    top_indices = np.argsort(hybrid_scores)[::-1][:top_k]
    predicted_topics = [
        topic_ids[idx] for idx in top_indices
        if hybrid_scores[idx] >= min_score
    ]

    return predicted_topics


def predict_topics_batch(
    content_embeddings: np.ndarray,
    topic_embeddings: np.ndarray,
    topic_ids: List[str],
    min_score: float = 0.3,
    top_k: int = 3
) -> List[List[str]]:
    """
    Predict topics for MULTIPLE contents at once (batch processing) using embeddings only.
    """
    # Calculate ALL embedding similarities at once (n_contents x n_topics)
    embedding_similarities = cosine_similarity(
        content_embeddings,
        topic_embeddings
    )

    # Combine scores (embeddings only)
    hybrid_scores = embedding_similarities

    # Get top-k predictions for each content
    predictions = []
    for i in range(len(content_embeddings)):
        top_indices = np.argsort(hybrid_scores[i])[::-1][:top_k]
        pred_topics = [
            topic_ids[idx] for idx in top_indices
            if hybrid_scores[i, idx] >= min_score
        ]
        predictions.append(pred_topics)

    return predictions


## Step 8: Prepare Training Data

In [20]:
# Prepare training data from sampled content
print("Preparing training data...")
train_indices = []
y_train = []  # Will contain lists of topic IDs

for idx, content_id in enumerate(content_df_sampled['id']):
    if content_id in content_to_topics:
        train_indices.append(idx)
        topic_ids = content_to_topics[content_id]
        y_train.append(topic_ids)

X_train = content_embeddings[train_indices]

print(f"Training samples: {len(X_train)}")
print(f"Average topics per sample: {np.mean([len(topics) for topics in y_train]):.2f}")
print(f"Total unique topics in training: {len(set([t for topics in y_train for t in topics]))}")

Preparing training data...
Training samples: 20000
Average topics per sample: 1.83
Total unique topics in training: 23631


In [21]:
# Prepare topic data for prediction function
topic_ids_list = topics_df['id'].tolist()

print(f"Total topics available for prediction: {len(topic_ids_list)}")

Total topics available for prediction: 76972


## Step 9: Evaluate

In [22]:
# Sample for faster evaluation (optional)
EVAL_SAMPLE_SIZE = min(500, len(X_train))
eval_indices = np.random.choice(len(X_train), EVAL_SAMPLE_SIZE, replace=False)

X_eval = X_train[eval_indices]
y_eval = [y_train[i] for i in eval_indices]

print(f"Evaluation sample size: {EVAL_SAMPLE_SIZE}")

Evaluation sample size: 500


In [23]:
# Grid search parameters
top_k_values = [2, 3, 5]
min_score_values = [0.3]

print("\n" + "="*80)
print("Parameter Grid Evaluation")
print("="*80)

# Store results for all configurations
all_results = []

# Helper function to find matching topics based on embedding similarity
def find_matching_topics(predicted_topics, true_topics, similarity_threshold=0.90):
    """
    Find matching topics between predicted and true topics based on embedding similarity.
    Returns sets of matched and unmatched predicted topics.
    """
    # Get texts for predicted and true topics
    pred_texts = [topics_df[topics_df['id'] == tid]['combined_text'].iloc[0] for tid in predicted_topics]
    true_texts = [topics_df[topics_df['id'] == tid]['combined_text'].iloc[0] for tid in true_topics]

    # Convert texts to embeddings (this is not efficient at all, but works for now)
    pred_embeddings = model.encode(pred_texts)
    true_embeddings = model.encode(true_texts)

    # Calculate similarity matrix
    similarities = cosine_similarity(pred_embeddings, true_embeddings)

    # Find matches
    matched_pred = set()
    unmatched_pred = set()

    for i, pred_tid in enumerate(predicted_topics):
        # Check if this prediction matches any true topic with high similarity
        max_sim = similarities[i].max() if len(similarities[i]) > 0 else 0
        if max_sim >= similarity_threshold:
            matched_pred.add(pred_tid)
        else:
            unmatched_pred.add(pred_tid)

    return matched_pred, unmatched_pred

# Test each combination of parameters
for top_k in top_k_values:
    for min_score in min_score_values:
        print(f"\nTesting: top_k={top_k}, min_score={min_score:.1f}")

        # Make predictions with current parameters
        y_pred = predict_topics_batch(
            X_eval,
            topic_embeddings,
            topic_ids_list,
            min_score=min_score,
            top_k=top_k
        )

        # Calculate metrics using embedding similarity
        tp = 0  # True positives
        fp = 0  # False positives
        fn = 0  # False negatives

        for pred_topics, true_topics in zip(y_pred, y_eval):
            # Find matches based on embedding similarity
            matched_pred, unmatched_pred = find_matching_topics(pred_topics, true_topics)

            # Update metrics
            tp += len(matched_pred)
            fp += len(unmatched_pred)
            # Count true topics that don't have a matching prediction
            fn += len(true_topics) - len(matched_pred)

        # Calculate final metrics
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Calculate prediction stats
        avg_pred = np.mean([len(topics) for topics in y_pred])
        exact_matches = 0
        for pred, true in zip(y_pred, y_eval):
            matched, _ = find_matching_topics(pred, true)
            if len(matched) == len(true):  # All true topics have matches
                exact_matches += 1
        exact_match_pct = (exact_matches / len(y_eval)) * 100

        # Store results
        all_results.append({
            'top_k': top_k,
            'min_score': min_score,
            'F1_Score': f1,
            'Precision': precision,
            'Recall': recall,
            'Avg_Pred_Topics': avg_pred,
            'Exact_Match_%': exact_match_pct
        })

# Convert results to DataFrame and display
results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values(['top_k', 'min_score'])

# Format the display
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

print("\n" + "="*80)
print("GRID EVALUATION RESULTS (Using Embedding Similarity >= 0.90)")
print("="*80)
print(results_df.to_string(index=False))
print("="*80)


Parameter Grid Evaluation

Testing: top_k=2, min_score=0.3

Testing: top_k=3, min_score=0.3

Testing: top_k=5, min_score=0.3

GRID EVALUATION RESULTS (Using Embedding Similarity >= 0.90)
 top_k  min_score  F1_Score  Precision  Recall  Avg_Pred_Topics  Exact_Match_%
     2     0.3000    0.2306     0.2220  0.2400           2.0000        15.2000
     3     0.3000    0.2375     0.1920  0.3114           3.0000        14.0000
     5     0.3000    0.2231     0.1528  0.4130           5.0000        16.0000


## Step 10: Detailed Evaluation with Best Configuration

In [24]:
# Helper function to get topic details
def get_topic_details(topic_ids, max_display=3):
    """Display topic titles and descriptions."""
    for tid in topic_ids[:max_display]:
        topic_row = topics_df[topics_df['id'] == tid]
        if not topic_row.empty:
            title = topic_row.iloc[0]['title']
            desc = str(topic_row.iloc[0]['description'])[:100]
            print(f"    - [{tid}] {title}: {desc}")
    if len(topic_ids) > max_display:
        print(f"    ... and {len(topic_ids) - max_display} more topics")

In [25]:
# Detailed evaluation on larger sample
DETAILED_EVAL_SIZE = min(5000, len(X_train))
detailed_eval_indices = np.random.choice(len(X_train), DETAILED_EVAL_SIZE, replace=False)

print(f"Running detailed evaluation on {DETAILED_EVAL_SIZE} samples...")
y_pred_detailed = []
y_true_detailed = [y_train[i] for i in detailed_eval_indices]

for i in detailed_eval_indices:
    pred_topics = predict_topics(
        X_train[i],
        topic_embeddings,
        topic_ids_list,
        min_score=0.3,
        top_k=3
    )
    y_pred_detailed.append(pred_topics)

# Calculate metrics using embedding similarity
tp = 0  # True positives
fp = 0  # False positives
fn = 0  # False negatives
perfect_matches = 0

# Process each prediction
for pred_topics, true_topics in zip(y_pred_detailed, y_true_detailed):
    # Find matches based on embedding similarity
    matched_pred, unmatched_pred = find_matching_topics(pred_topics, true_topics)

    # Update metrics
    tp += len(matched_pred)
    fp += len(unmatched_pred)
    # Count true topics that don't have a matching prediction
    fn += len(true_topics) - len(matched_pred)

    # Check for perfect match
    if len(matched_pred) == len(true_topics):  # All true topics have matches
        perfect_matches += 1

# Calculate final metrics
precision_final = tp / (tp + fp) if (tp + fp) > 0 else 0
recall_final = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_final = 2 * (precision_final * recall_final) / (precision_final + recall_final) if (precision_final + recall_final) > 0 else 0

print("\n" + "="*80)
print("FINAL EVALUATION RESULTS (Using Embedding Similarity >= 0.90)")
print("="*80)
print(f"Evaluation samples: {DETAILED_EVAL_SIZE}")
print(f"Total topics in dataset: {len(topic_ids_list)}")
print(f"\nMetrics:")
print(f"  F1 Score:            {f1_final:.4f}")
print(f"  Precision:           {precision_final:.4f}")
print(f"  Recall:              {recall_final:.4f}")
print(f"\nPrediction Stats:")
print(f"  Avg true topics:      {np.mean([len(t) for t in y_true_detailed]):.2f}")
print(f"  Avg predicted topics: {np.mean([len(t) for t in y_pred_detailed]):.2f}")
print(f"  Perfect matches:      {perfect_matches} ({perfect_matches/len(y_true_detailed)*100:.2f}%)")
print("="*80)

Running detailed evaluation on 5000 samples...

FINAL EVALUATION RESULTS (Using Embedding Similarity >= 0.90)
Evaluation samples: 5000
Total topics in dataset: 76972

Metrics:
  F1 Score:            0.2034
  Precision:           0.1655
  Recall:              0.2638

Prediction Stats:
  Avg true topics:      1.88
  Avg predicted topics: 3.00
  Perfect matches:      693 (13.86%)


## Step 11: Show Prediction Examples

In [26]:
# Show prediction examples
print("\n" + "="*80)
print("PREDICTION EXAMPLES")
print("="*80)

def get_topic_similarities(pred_topics, true_topics):
    """Get similarity scores between predicted and true topics."""
    pred_texts = [topics_df[topics_df['id'] == tid]['combined_text'].iloc[0] for tid in pred_topics]
    true_texts = [topics_df[topics_df['id'] == tid]['combined_text'].iloc[0] for tid in true_topics]

    pred_embeddings = model.encode(pred_texts)
    true_embeddings = model.encode(true_texts)

    similarities = cosine_similarity(pred_embeddings, true_embeddings)
    return similarities

num_examples = min(5, len(detailed_eval_indices))
for i in range(num_examples):
    idx = detailed_eval_indices[i]
    content_row = content_df_sampled.iloc[train_indices[idx]]

    pred_topics = y_pred_detailed[i]
    true_topics = y_true_detailed[i]

    # Get similarity matrix
    similarities = get_topic_similarities(pred_topics, true_topics)
    matched_pred, unmatched_pred = find_matching_topics(pred_topics, true_topics)

    print(f"\n{'='*80}")
    print(f"Example {i+1}:")
    print(f"{'='*80}")
    print(f"Content ID: {content_row['id']}")
    print(f"Language: {content_row.get('language', 'N/A')}")
    print(f"\nText: {content_row['combined_text'][:200]}...")

    print(f"\nPredicted Topics ({len(pred_topics)}):")
    for j, tid in enumerate(pred_topics):
        topic_row = topics_df[topics_df['id'] == tid]
        if not topic_row.empty:
            title = topic_row.iloc[0]['title']
            desc = str(topic_row.iloc[0]['description'])[:100]
            max_sim = similarities[j].max() if len(similarities[j]) > 0 else 0
            match_status = "✓ MATCH" if tid in matched_pred else "✗ NO MATCH"
            print(f"    - [{tid}] {title}: {desc}")
            print(f"      Max Similarity: {max_sim:.4f} ({match_status})")

    print(f"\nTrue Topics ({len(true_topics)}):")
    get_topic_details(true_topics)

    print(f"\nMatches Summary:")
    print(f"  - Matched topics: {len(matched_pred)}")
    print(f"  - Unmatched predictions: {len(unmatched_pred)}")
    print(f"  - Missing true topics: {len(true_topics) - len(matched_pred)}")


PREDICTION EXAMPLES

Example 1:
Content ID: c_0ce36c8d2db8
Language: en

Text: Level 3: The interior, exterior and arcs of a circle v0.1...

Predicted Topics (3):
    - [t_01de1e98d688] The interior, exterior and arcs of a circle: v0.1
      Max Similarity: 1.0000 (✓ MATCH)
    - [t_49c1e25d3dd2] The interior, exterior and arcs of a circle: v0.1
      Max Similarity: 1.0000 (✓ MATCH)
    - [t_8c50b9362824] Area of a circle: v0.1
      Max Similarity: 0.8694 (✗ NO MATCH)

True Topics (2):
    - [t_49c1e25d3dd2] The interior, exterior and arcs of a circle: v0.1
    - [t_f9a64bcb95b2] Assessments: nan

Matches Summary:
  - Matched topics: 2
  - Unmatched predictions: 1
  - Missing true topics: 0

Example 2:
Content ID: c_03ac5464c800
Language: en

Text: Chapter 198: Secure Shell Connection in Python Chapter 198 of the book on Python....

Predicted Topics (3):
    - [t_a52a884bbb0d] Book: Python for Everybody (Severance): nan
      Max Similarity: 0.6061 (✗ NO MATCH)
    - [t_fa6937e17836

## Step 12: Save Model

In [27]:
# Save model artifacts
model_artifacts = {
    'topic_embeddings': topic_embeddings,
    'topic_ids_list': topic_ids_list,
}

output_path = ARTIFACTS_DIR / 'topic_predictor_direct_model.pkl'
output_path.parent.mkdir(exist_ok=True)
joblib.dump(model_artifacts, output_path)

print(f"\n{'='*80}")
print("MODEL SAVED")
print(f"{'='*80}")
print(f"Location: {output_path}")
print(f"\nModel includes:")
print(f"  - {len(topic_ids_list)} topic embeddings")
print(f"  - Topic metadata and mappings")
print(f"  - Performance metrics (F1={f1_final:.4f})")
print(f"{'='*80}")


MODEL SAVED
Location: /Users/pablowatfi/repos/topic-pred/artifacts/topic_predictor_direct_model.pkl

Model includes:
  - 76972 topic embeddings
  - Topic metadata and mappings
  - Performance metrics (F1=0.2034)
