# Xctopus - Quick Start for Contributors

Minimal notebook to run the Xctopus clustering layer. Assumes a local (localhost) environment.

In [None]:
import sys
from pathlib import Path
import torch
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# The project root is taken to be the parent of the current working directory,
# so this cell expects the notebook to be launched from a folder that sits
# directly under the repository root.
project_root = Path.cwd().resolve().parent

# Put <root>/src at the front of the import path so the in-repo ``xctopus``
# package is the one picked up by the imports below.
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# These imports must come after the sys.path change above.
from xctopus.main import initialize_components, process_dataset
from xctopus.fusion import fuse_knowledge_nodes
from xctopus.settings import EMBEDDING_DIM, DEVICE, DTYPE

# Echo the runtime settings the rest of the notebook relies on.
print(f"Device: {DEVICE}, Dtype: {DTYPE}, Embedding Dim: {EMBEDDING_DIM}")

In [None]:
# Configuration
DATASET_PATH = "../datasets/your_dataset.csv"  # Update with your dataset path
TEXT_COLUMNS = ['text']  # Update with your text column name(s)
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
MIN_TEXT_LENGTH = 3  # Cleaned texts shorter than this are dropped

# Load and clean dataset
df = pd.read_csv(DATASET_PATH)
if isinstance(TEXT_COLUMNS, list) and len(TEXT_COLUMNS) > 1:
    # Several columns: join the non-missing cells of each row with one space.
    texts = df[TEXT_COLUMNS].apply(
        lambda row: " ".join(str(val) for val in row if pd.notna(val)),
        axis=1,
    ).tolist()
else:
    text_col = TEXT_COLUMNS[0] if isinstance(TEXT_COLUMNS, list) else TEXT_COLUMNS
    column = df[text_col]
    # Blank out missing cells using the ORIGINAL column's null mask. Casting
    # first and filling afterwards does not work: astype(str) turns NaN into
    # the literal string "nan", which then slips past the length filter.
    texts = column.astype(str).where(column.notna(), '').tolist()

# Apply the same cleaning to both branches: trim whitespace, then drop texts
# that are too short. After this step `texts` no longer lines up row-for-row
# with `df`.
texts = [t for t in (raw.strip() for raw in texts) if len(t) >= MIN_TEXT_LENGTH]

print(f"Loaded {len(texts)} texts")

In [None]:
# Generate embeddings
# The cache file lives next to the dataset as "<dataset stem>_embeddings.npy".
# Building the name from the path's stem (rather than a ".csv" substring
# replace) keeps it distinct from the dataset file for any extension and
# leaves directory names that happen to contain ".csv" untouched.
dataset_file = Path(DATASET_PATH)
embeddings_path = str(dataset_file.with_name(f"{dataset_file.stem}_embeddings.npy"))

embeddings_np = None
if Path(embeddings_path).exists():
    cached = np.load(embeddings_path)
    # Only reuse the cache when it has exactly one row per cleaned text;
    # otherwise it was produced from a different dataset or cleaning step.
    if cached.ndim == 2 and cached.shape[0] == len(texts):
        embeddings_np = cached
        print(f"Loaded embeddings: {embeddings_np.shape}")
    else:
        print(
            f"Cached embeddings {cached.shape} do not match "
            f"{len(texts)} texts; regenerating"
        )

if embeddings_np is None:
    model = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
    embeddings_np = model.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=True,
        batch_size=32,
        normalize_embeddings=True,
    )
    np.save(embeddings_path, embeddings_np)
    print(f"Generated and saved embeddings: {embeddings_np.shape}")

# One tensor per embedding row, moved to the configured device and dtype.
# NOTE(review): the message below says FP16, but the actual dtype is whatever
# xctopus.settings.DTYPE is set to; confirm the two agree.
embeddings = [torch.from_numpy(emb).to(device=DEVICE, dtype=DTYPE) for emb in embeddings_np]
print(f"Converted to {len(embeddings)} FP16 tensors")

In [None]:
# Build the pipeline objects. initialize_components() is unpacked into exactly
# three values, which the following cells use by these names.
components = initialize_components()
repository, filter_bayesian, orchestrator = components
print("Components initialized")

In [None]:
# Feed every embedding tensor through the pipeline built in the previous cell.
process_dataset(
    embeddings,
    repository,
    filter_bayesian,
    orchestrator,
    progress_interval=100,
)
print("Processing completed")

In [None]:
# Fuse knowledge nodes, then report the KN count before and after fusion.
fusion_stats = fuse_knowledge_nodes(
    repository,
    orchestrator,
    progress_interval=10,
)
kns_before = fusion_stats['initial_kns']
kns_after = fusion_stats['final_kns']
print(f"Fusion: {kns_before} -> {kns_after} KNs")

In [None]:
# Final statistics
# Each signature is read as a mapping with 'mass' and 'variance' entries.
# The try/finally guarantees the repository is closed even if a statistic
# fails (for example, a missing key).
try:
    signatures = repository.get_all_signatures()
    kn_count = len(signatures)
    total_mass = sum(s['mass'] for s in signatures)

    print(f"Final KNs: {kn_count}")
    print(f"Total mass: {total_mass}")

    if kn_count:
        total_variance = sum(s['variance'] for s in signatures)
        print(f"Avg mass: {total_mass / kn_count:.1f}")
        print(f"Avg variance: {total_variance / kn_count:.4f}")
    else:
        # Averages are undefined without any knowledge nodes; avoid dividing
        # by zero.
        print("No knowledge nodes found; averages are undefined")
finally:
    repository.close()