In [None]:
import json
import os
from pathlib import Path

# Location of the exported faculties JSON. The original absolute path remains
# the default; set the FACULTIES_EXPORT_JSON environment variable to point the
# notebook at a different export without editing this cell.
EXPORT_JSON = Path(
    os.environ.get(
        "FACULTIES_EXPORT_JSON",
        "/home/pitfa/Documents/explore_unizg/faculties_export.json",
    )
)

with EXPORT_JSON.open("r", encoding="utf-8") as f:
    faculties = json.load(f)

# Build a list of {abbreviation, embedding}.
# Records whose abbreviation is missing, None, or blank after stripping are
# skipped. The embedding is copied as-is and may be None; the cells that
# consume this list filter on it themselves.
faculty_embeddings = [
    {
        "abbreviation": (fac.get("abbreviation") or "").strip(),
        "embedding": fac.get("embedding"),
    }
    for fac in faculties
    if (fac.get("abbreviation") or "").strip()
]

print(f"Loaded {len(faculty_embeddings)} embeddings")
faculty_embeddings[:2]



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Keep only the records that actually carry an embedding vector
valid = [
    (entry["abbreviation"], entry["embedding"])
    for entry in faculty_embeddings
    if entry.get("embedding")
]

if len(valid) == 0:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

# Split the pairs into labels and a float matrix with one row per faculty
abbreviations = [pair[0] for pair in valid]
X = np.array([pair[1] for pair in valid], dtype=float)

# Scale each row to unit L2 length; zero-norm rows are divided by 1 and stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms[norms == 0.0] = 1.0
X_norm = X / norms

# Project onto the first two principal components
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2): {pca.explained_variance_ratio_.sum():.4f}")

# Keep the projected coordinates, keyed by abbreviation, for later use
pca_2d = [
    {"abbreviation": name, "x": float(row[0]), "y": float(row[1])}
    for name, row in zip(abbreviations, X_2d)
]

# Scatter plot of the projection
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(X_2d[:, 0], X_2d[:, 1], s=30, alpha=0.8)
ax.set(
    title="Faculties PCA (L2-normalized embeddings) - 2D",
    xlabel="PC1",
    ylabel="PC2",
)
ax.grid(True, linestyle="--", alpha=0.3)

# Label only the first max_labels points to reduce clutter
max_labels = 30
for i, label in zip(range(max_labels), abbreviations):
    ax.annotate(label, (X_2d[i, 0], X_2d[i, 1]), fontsize=8, alpha=0.8)

plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Rebuild the inputs so this cell does not depend on the 2D PCA cell
valid = [
    (entry["abbreviation"], entry["embedding"])
    for entry in faculty_embeddings
    if entry.get("embedding")
]

if len(valid) == 0:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

# Labels and a float matrix with one row per faculty
abbreviations = [pair[0] for pair in valid]
X = np.array([pair[1] for pair in valid], dtype=float)

# Scale each row to unit L2 length; zero-norm rows are divided by 1 and stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms[norms == 0.0] = 1.0
X_norm = X / norms

# Project onto the first three principal components
pca3 = PCA(n_components=3, random_state=42)
X_3d = pca3.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2+PC3): {pca3.explained_variance_ratio_.sum():.4f}")

# Keep the projected coordinates, keyed by abbreviation, for later use
pca_3d = [
    {
        "abbreviation": name,
        "x": float(row[0]),
        "y": float(row[1]),
        "z": float(row[2]),
    }
    for name, row in zip(abbreviations, X_3d)
]

# 3D scatter plot of the projection
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], s=20, alpha=0.85)
ax.set(
    title="Faculties PCA (L2-normalized embeddings) - 3D",
    xlabel="PC1",
    ylabel="PC2",
    zlabel="PC3",
)

# Label only the first max_labels points to reduce clutter
max_labels = 20
for i, label in zip(range(max_labels), abbreviations):
    ax.text(X_3d[i, 0], X_3d[i, 1], X_3d[i, 2], label, fontsize=8, alpha=0.8)

plt.tight_layout()
plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# KMeans in the full (L2-normalized) embedding space, visualized on a 2D PCA
# projection. The clustering does not see the PCA output; PCA is only used to
# place the points on the plot.

# Prepare data: keep only entries that have an embedding
valid = [
    (item["abbreviation"], item["embedding"])
    for item in faculty_embeddings
    if item.get("embedding")
]

if not valid:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

abbreviations = [abbr for abbr, _ in valid]
X = np.array([emb for _, emb in valid], dtype=float)

# L2 normalize rows; zero-norm rows are divided by 1 so they stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
X_norm = X / norms

# Cluster in embedding space
k = 5
kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42)
labels = kmeans.fit_predict(X_norm)
unique_labels = sorted(set(labels))
print(f"Requested clusters: {k}, clusters populated: {len(unique_labels)}")

# PCA to 2D for visualization
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2): {pca.explained_variance_ratio_.sum():.4f}")

# Plot with cluster colors
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap="tab10", s=30, alpha=0.9)
# The title is derived from k so it cannot drift from the value actually used
ax.set_title(f"KMeans (k={k}) + PCA 2D of L2-normalized embeddings")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.grid(True, linestyle="--", alpha=0.3)

# Legend for clusters that exist. Colors are looked up through the scatter's
# own norm and colormap so each legend marker matches its points.
handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=f"Cluster {lab}",
               markerfacecolor=scatter.cmap(scatter.norm(lab)), markersize=8)
    for lab in unique_labels
]
ax.legend(handles=handles, title="Clusters", loc="best", ncols=2)

plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# PCA to 2D first, then KMeans directly on the two principal components.
# Unlike clustering in embedding space, the clusters here are formed from the
# plotted coordinates themselves.

# Prepare data: keep only entries that have an embedding
valid = [
    (item["abbreviation"], item["embedding"])
    for item in faculty_embeddings
    if item.get("embedding")
]

if not valid:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

abbreviations = [abbr for abbr, _ in valid]
X = np.array([emb for _, emb in valid], dtype=float)

# L2 normalize rows; zero-norm rows are divided by 1 so they stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
X_norm = X / norms

# PCA first (to 2D for direct clustering on PCs)
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2): {pca.explained_variance_ratio_.sum():.4f}")

# Then KMeans on the 2D PCs
k = 5
kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42)
labels = kmeans.fit_predict(X_2d)
unique_labels = sorted(set(labels))
print(f"Requested clusters: {k}, clusters populated: {len(unique_labels)}")

# Keep 2D results for reuse
pca_kmeans_2d = [
    {"abbreviation": abbreviations[i], "x": float(X_2d[i, 0]), "y": float(X_2d[i, 1]), "cluster": int(labels[i])}
    for i in range(len(abbreviations))
]

# Plot with cluster colors
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap="tab10", s=30, alpha=0.9)
# The title is derived from k so it cannot drift from the value actually used
ax.set_title(f"PCA (2D) then KMeans (k={k}) on PCs")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.grid(True, linestyle="--", alpha=0.3)

# Legend. Colors are looked up through the scatter's own norm and colormap so
# each legend marker matches its points.
handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=f"Cluster {lab}",
               markerfacecolor=scatter.cmap(scatter.norm(lab)), markersize=8)
    for lab in unique_labels
]
ax.legend(handles=handles, title="Clusters", loc="best", ncols=2)

plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from matplotlib.patches import Polygon, Ellipse

# Helper: 2D convex hull via Andrew's monotone chain (no scipy dependency)
def _cross(o, a, b):
    """Return the 2D cross product of the vectors o->a and o->b.

    The result is positive for a counter-clockwise turn o -> a -> b, negative
    for a clockwise turn, and zero when the three points are collinear.
    """
    oa_x, oa_y = a[0] - o[0], a[1] - o[1]
    ob_x, ob_y = b[0] - o[0], b[1] - o[1]
    return oa_x * ob_y - oa_y * ob_x


def convex_hull(points: np.ndarray) -> np.ndarray:
    """Compute the convex hull of a set of 2D points.

    Args:
        points: Array of (x, y) rows. Duplicate rows are removed first.

    Returns:
        The hull vertices as an array of (x, y) rows in counter-clockwise
        order, starting from the lexicographically smallest point. Collinear
        boundary points are dropped. When at most one distinct point exists,
        the de-duplicated input is returned unchanged.
    """
    distinct = np.unique(points, axis=0)
    if len(distinct) <= 1:
        return distinct

    # Sort by x, breaking ties by y (lexsort takes the primary key last)
    order = np.lexsort((distinct[:, 1], distinct[:, 0]))
    ordered = distinct[order]

    def _half_hull(sequence):
        # Build one monotone chain, discarding points that do not make a
        # strict counter-clockwise turn.
        chain = []
        for pt in sequence:
            while len(chain) >= 2 and _cross(chain[-2], chain[-1], pt) <= 0:
                chain.pop()
            chain.append(tuple(pt))
        return chain

    lower = _half_hull(ordered)
    upper = _half_hull(ordered[::-1])

    # The last point of each chain is the first point of the other one
    return np.array(lower[:-1] + upper[:-1])

# PCA to 2D, KMeans on the 2D coordinates, and a translucent enclosing shape
# per cluster: a convex-hull polygon when the cluster has a proper hull, or a
# small circle around the centroid for clusters of one or two points.

# Prepare data: keep only entries that have an embedding
valid = [
    (item["abbreviation"], item["embedding"])
    for item in faculty_embeddings
    if item.get("embedding")
]
if not valid:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

abbreviations = [abbr for abbr, _ in valid]
X = np.array([emb for _, emb in valid], dtype=float)

# L2 normalize rows; zero-norm rows are divided by 1 so they stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
X_norm = X / norms

# PCA to 2D
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_norm)

# KMeans on 2D PCs
k = 5
kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42)
labels = kmeans.fit_predict(X_2d)
unique_labels = sorted(set(labels))

# Plot
fig, ax = plt.subplots(figsize=(10, 8))

# Color map per cluster index (compact mapping).
# plt.get_cmap is used instead of plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and later removed; it accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab10", len(unique_labels))
label_to_color = {lab: cmap(i) for i, lab in enumerate(unique_labels)}

# Base scatter per cluster for consistent coloring
for lab in unique_labels:
    pts = X_2d[labels == lab]
    color = label_to_color[lab]
    ax.scatter(pts[:, 0], pts[:, 1], s=30, alpha=0.9, color=color, edgecolor="white", linewidth=0.3)

# Enclosing shapes (convex hulls or small ellipses).
# The ellipse radius is 2% of the larger axis range, so it scales with the data.
x_range = float(X_2d[:, 0].max() - X_2d[:, 0].min())
y_range = float(X_2d[:, 1].max() - X_2d[:, 1].min())
base_radius = 0.02 * max(x_range, y_range)

for lab in unique_labels:
    pts = X_2d[labels == lab]
    color = label_to_color[lab]
    if len(pts) >= 3:
        hull = convex_hull(pts)
        # A degenerate hull (fewer than 3 vertices) is not drawn
        if len(hull) >= 3:
            poly = Polygon(hull, closed=True, facecolor=color, edgecolor=color, alpha=0.15, linewidth=2)
            ax.add_patch(poly)
    else:
        # For 1-2 points, draw a small ellipse around the centroid
        center = pts.mean(axis=0)
        ell = Ellipse(xy=(center[0], center[1]), width=2*base_radius, height=2*base_radius,
                      facecolor=color, edgecolor=color, alpha=0.15, linewidth=2)
        ax.add_patch(ell)

# The title is derived from k so it cannot drift from the value actually used
ax.set_title(f"PCA (2D) then KMeans (k={k}) with Enclosing Shapes")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.grid(True, linestyle="--", alpha=0.3)

# Legend
handles = [plt.Line2D([0], [0], marker='o', color='w', label=f"Cluster {lab}",
                      markerfacecolor=label_to_color[lab], markersize=8)
           for lab in unique_labels]
ax.legend(handles=handles, title="Clusters", loc="best", ncols=2)

plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# PCA to 2D, then connect every point to its nearest neighbors in the 2D
# projection and draw the resulting undirected graph.

# Prepare data: keep only entries that have an embedding
valid = [
    (item["abbreviation"], item["embedding"])
    for item in faculty_embeddings
    if item.get("embedding")
]
if not valid:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

abbreviations = [abbr for abbr, _ in valid]
X = np.array([emb for _, emb in valid], dtype=float)

# L2 normalize rows; zero-norm rows are divided by 1 so they stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
X_norm = X / norms

# PCA to 2D
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2): {pca.explained_variance_ratio_.sum():.4f}")

# kNN in 2D. Calling kneighbors() without a query returns the neighbors of the
# fitted points and does not count a point as its own neighbor. This avoids
# assuming that "self" is always the first hit, which fails when two points
# coincide.
n_neighbors = 5
nbrs = NearestNeighbors(n_neighbors=n_neighbors)
nbrs.fit(X_2d)
_, indices = nbrs.kneighbors()

# Build undirected edge set without duplicates (smaller index first)
edges = set()
for i, neigh in enumerate(indices):
    for j in neigh:
        j = int(j)
        a, b = (i, j) if i < j else (j, i)
        edges.add((a, b))

# Plot
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(X_2d[:, 0], X_2d[:, 1], s=30, color="#1f77b4", alpha=0.9)

# Draw edges
for a, b in edges:
    ax.plot([X_2d[a, 0], X_2d[b, 0]], [X_2d[a, 1], X_2d[b, 1]], color="gray", alpha=0.4, linewidth=0.8)

# The title is derived from n_neighbors so it cannot drift from the value used
ax.set_title(f"PCA (2D) + kNN (k={n_neighbors}) neighbor connections")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.grid(True, linestyle="--", alpha=0.3)
plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# PCA to 3D, then connect every point to its nearest neighbors in the 3D
# projection and draw the resulting undirected graph.

# Prepare data: keep only entries that have an embedding
valid = [
    (item["abbreviation"], item["embedding"])
    for item in faculty_embeddings
    if item.get("embedding")
]
if not valid:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

abbreviations = [abbr for abbr, _ in valid]
X = np.array([emb for _, emb in valid], dtype=float)

# L2 normalize rows; zero-norm rows are divided by 1 so they stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
X_norm = X / norms

# PCA to 3D
pca = PCA(n_components=3, random_state=42)
X_3d = pca.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2+PC3): {pca.explained_variance_ratio_.sum():.4f}")

# kNN in 3D. Calling kneighbors() without a query returns the neighbors of the
# fitted points and does not count a point as its own neighbor. This avoids
# assuming that "self" is always the first hit, which fails when two points
# coincide.
n_neighbors = 5
nbrs = NearestNeighbors(n_neighbors=n_neighbors)
nbrs.fit(X_3d)
_, indices = nbrs.kneighbors()

# Build undirected edge set without duplicates (smaller index first)
edges = set()
for i, neigh in enumerate(indices):
    for j in neigh:
        j = int(j)
        a, b = (i, j) if i < j else (j, i)
        edges.add((a, b))

# Plot 3D scatter and neighbor edges
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], s=20, color="#1f77b4", alpha=0.9)

for a, b in edges:
    ax.plot(
        [X_3d[a, 0], X_3d[b, 0]],
        [X_3d[a, 1], X_3d[b, 1]],
        [X_3d[a, 2], X_3d[b, 2]],
        color="gray", alpha=0.35, linewidth=0.8,
    )

# The title is derived from n_neighbors so it cannot drift from the value used
ax.set_title(f"PCA (3D) + kNN (k={n_neighbors}) neighbor connections")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.tight_layout()
plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

# L2-normalize, project to 3D with PCA, color points by KMeans cluster, and
# overlay the k-nearest-neighbor graph computed on the 3D coordinates.

# Prepare data: keep only entries that have an embedding
valid = [
    (item["abbreviation"], item["embedding"])
    for item in faculty_embeddings
    if item.get("embedding")
]
if not valid:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

abbreviations = [abbr for abbr, _ in valid]
X = np.array([emb for _, emb in valid], dtype=float)

# L2 normalize rows; zero-norm rows are divided by 1 so they stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
X_norm = X / norms

# PCA to 3D
pca = PCA(n_components=3, random_state=42)
X_3d = pca.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2+PC3): {pca.explained_variance_ratio_.sum():.4f}")

# KMeans on 3D PCs
k_clusters = 5
kmeans = KMeans(n_clusters=k_clusters, n_init="auto", random_state=42)
cluster_labels = kmeans.fit_predict(X_3d)
unique_clusters = sorted(set(cluster_labels))
print(f"Clusters requested: {k_clusters}, populated: {len(unique_clusters)}")

# kNN graph on 3D PCs. Calling kneighbors() without a query returns the
# neighbors of the fitted points and does not count a point as its own
# neighbor, so no "drop the first hit" step is needed (that step is wrong when
# two points coincide).
k_neighbors = 5
nbrs = NearestNeighbors(n_neighbors=k_neighbors)
nbrs.fit(X_3d)
_, indices = nbrs.kneighbors()

# Undirected edge set without duplicates (smaller index first)
edges = set()
for i, neigh in enumerate(indices):
    for j in neigh:
        j = int(j)
        a, b = (i, j) if i < j else (j, i)
        edges.add((a, b))

# Prepare output structure if needed later
combined_3d = [
    {
        "abbreviation": abbreviations[i],
        "x": float(X_3d[i, 0]),
        "y": float(X_3d[i, 1]),
        "z": float(X_3d[i, 2]),
        "cluster": int(cluster_labels[i]),
    }
    for i in range(len(abbreviations))
]

# Plot
fig = plt.figure(figsize=(11, 9))
ax = fig.add_subplot(111, projection="3d")

# Color by cluster.
# plt.get_cmap is used instead of plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and later removed; it accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab10", len(unique_clusters))
cluster_to_color = {lab: cmap(idx) for idx, lab in enumerate(unique_clusters)}
colors = [cluster_to_color[c] for c in cluster_labels]
ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], s=18, c=colors, alpha=0.95)

# Draw kNN edges
for a, b in edges:
    ax.plot(
        [X_3d[a, 0], X_3d[b, 0]],
        [X_3d[a, 1], X_3d[b, 1]],
        [X_3d[a, 2], X_3d[b, 2]],
        color="gray", alpha=0.28, linewidth=0.7,
    )

# The title is derived from the parameters so it cannot drift from them
ax.set_title(f"L2 → PCA(3D) → KMeans(k={k_clusters}) + kNN(k={k_neighbors})")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.tight_layout()
plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

# L2-normalize, project to 3D with PCA, color points by KMeans cluster, and
# overlay two graphs computed on the 3D coordinates: a light kNN graph and an
# emphasized Euclidean minimum spanning tree.

# Prepare data: keep only entries that have an embedding
valid = [
    (item["abbreviation"], item["embedding"])
    for item in faculty_embeddings
    if item.get("embedding")
]
if not valid:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

abbreviations = [abbr for abbr, _ in valid]
X = np.array([emb for _, emb in valid], dtype=float)

# L2 normalize rows; zero-norm rows are divided by 1 so they stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
X_norm = X / norms

# PCA to 3D
pca = PCA(n_components=3, random_state=42)
X_3d = pca.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2+PC3): {pca.explained_variance_ratio_.sum():.4f}")

# KMeans for colors
k_clusters = 5
kmeans = KMeans(n_clusters=k_clusters, n_init="auto", random_state=42)
cluster_labels = kmeans.fit_predict(X_3d)
unique_clusters = sorted(set(cluster_labels))

# kNN edges (optional, light background structure). Calling kneighbors()
# without a query returns the neighbors of the fitted points and does not
# count a point as its own neighbor, so no "drop the first hit" step is needed
# (that step is wrong when two points coincide).
k_neighbors = 5
nbrs = NearestNeighbors(n_neighbors=k_neighbors)
nbrs.fit(X_3d)
_, indices = nbrs.kneighbors()
knn_edges = set()
for i, neigh in enumerate(indices):
    for j in neigh:
        j = int(j)
        a, b = (i, j) if i < j else (j, i)
        knn_edges.add((a, b))

# Minimum Spanning Tree (Prim's algorithm) on full Euclidean graph in 3D
n = X_3d.shape[0]
# Full pairwise Euclidean distance matrix
D = np.sqrt(((X_3d[:, None, :] - X_3d[None, :, :]) ** 2).sum(axis=2))

in_mst = np.zeros(n, dtype=bool)   # vertices already attached to the tree
parent = -np.ones(n, dtype=int)    # tree neighbor of each vertex (-1 = none)
key = np.full(n, np.inf)           # cheapest known edge into the tree

key[0] = 0.0  # grow the tree from vertex 0
for _ in range(n):
    # pick min key vertex not yet included
    u = int(np.argmin(np.where(in_mst, np.inf, key)))
    in_mst[u] = True
    # relax all outside vertices that are closer to u than to the tree so far
    closer = ~in_mst & (D[u] < key)
    key[closer] = D[u, closer]
    parent[closer] = u

mst_edges = [(int(parent[v]), int(v)) for v in range(1, n) if parent[v] != -1]
length_mst = float(np.sum([D[a, b] for a, b in mst_edges]))
print(f"MST edges: {len(mst_edges)} (total length: {length_mst:.4f})")

# Plot
fig = plt.figure(figsize=(11, 9))
ax = fig.add_subplot(111, projection="3d")

# Color by KMeans cluster.
# plt.get_cmap is used instead of plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and later removed; it accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab10", len(unique_clusters))
cluster_to_color = {lab: cmap(idx) for idx, lab in enumerate(unique_clusters)}
colors = [cluster_to_color[c] for c in cluster_labels]
ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], s=18, c=colors, alpha=0.95)

# Optional: draw kNN edges (light)
for a, b in knn_edges:
    ax.plot(
        [X_3d[a, 0], X_3d[b, 0]],
        [X_3d[a, 1], X_3d[b, 1]],
        [X_3d[a, 2], X_3d[b, 2]],
        color="gray", alpha=0.2, linewidth=0.6,
    )

# Draw MST edges (emphasized)
for a, b in mst_edges:
    ax.plot(
        [X_3d[a, 0], X_3d[b, 0]],
        [X_3d[a, 1], X_3d[b, 1]],
        [X_3d[a, 2], X_3d[b, 2]],
        color="#d62728", alpha=0.9, linewidth=2.0,
    )

# The title is derived from the parameters so it cannot drift from them
ax.set_title(f"L2 → PCA(3D) → KMeans(k={k_clusters}) + kNN(k={k_neighbors}) + MST")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.tight_layout()
plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from mpl_toolkits.mplot3d.art3d import Poly3DCollection

# SciPy is optional: it is only needed for true 3D convex hulls. Assume it is
# present and clear the flag if the import fails for any reason.
SCIPY_AVAILABLE = True
try:
    from scipy.spatial import ConvexHull  # type: ignore
except Exception:
    SCIPY_AVAILABLE = False

# 2D convex hull (fallback when no 3D hull is drawn) via Andrew's monotone chain
def _cross(o, a, b):
    """Return the 2D cross product of the vectors o->a and o->b.

    Positive means o -> a -> b turns counter-clockwise, negative means
    clockwise, and zero means the three points are collinear.
    """
    dx_a, dy_a = a[0] - o[0], a[1] - o[1]
    dx_b, dy_b = b[0] - o[0], b[1] - o[1]
    return dx_a * dy_b - dy_a * dx_b


def convex_hull_2d(points: np.ndarray) -> np.ndarray:
    """Compute the convex hull of a set of 2D points.

    Args:
        points: Array of (x, y) rows. Duplicate rows are removed first.

    Returns:
        The hull vertices as an array of (x, y) rows in counter-clockwise
        order, starting from the lexicographically smallest point. Collinear
        boundary points are dropped. When at most one distinct point exists,
        the de-duplicated input is returned unchanged.
    """
    distinct = np.unique(points, axis=0)
    if len(distinct) <= 1:
        return distinct

    # Sort by x, breaking ties by y (lexsort takes the primary key last)
    ordered = distinct[np.lexsort((distinct[:, 1], distinct[:, 0]))]

    halves = []
    for sequence in (ordered, ordered[::-1]):
        # Each pass builds one monotone chain: lower hull, then upper hull
        chain = []
        for pt in sequence:
            while len(chain) >= 2 and _cross(chain[-2], chain[-1], pt) <= 0:
                chain.pop()
            chain.append(tuple(pt))
        # The last point of a chain is the first point of the other chain
        halves.extend(chain[:-1])

    return np.array(halves)

# L2-normalize, project to 3D with PCA, color points by KMeans cluster, and
# overlay a light kNN graph, the Euclidean minimum spanning tree, and one
# translucent enclosing shape per cluster.

# Prepare data: keep only entries that have an embedding
valid = [
    (item["abbreviation"], item["embedding"])
    for item in faculty_embeddings
    if item.get("embedding")
]
if not valid:
    raise ValueError("No valid embeddings found. Ensure faculty_embeddings is populated.")

abbreviations = [abbr for abbr, _ in valid]
X = np.array([emb for _, emb in valid], dtype=float)

# L2 normalize rows; zero-norm rows are divided by 1 so they stay zero
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
X_norm = X / norms

# PCA to 3D
pca = PCA(n_components=3, random_state=42)
X_3d = pca.fit_transform(X_norm)
print(f"Explained variance ratio (PC1+PC2+PC3): {pca.explained_variance_ratio_.sum():.4f}")

# KMeans for colors
k_clusters = 5
kmeans = KMeans(n_clusters=k_clusters, n_init="auto", random_state=42)
cluster_labels = kmeans.fit_predict(X_3d)
unique_clusters = sorted(set(cluster_labels))

# kNN edges. Calling kneighbors() without a query returns the neighbors of the
# fitted points and does not count a point as its own neighbor, so no "drop
# the first hit" step is needed (that step is wrong when two points coincide).
k_neighbors = 5
nbrs = NearestNeighbors(n_neighbors=k_neighbors)
nbrs.fit(X_3d)
_, indices = nbrs.kneighbors()
knn_edges = set()
for i, neigh in enumerate(indices):
    for j in neigh:
        j = int(j)
        a, b = (i, j) if i < j else (j, i)
        knn_edges.add((a, b))

# Minimum Spanning Tree (Prim's algorithm) on full Euclidean graph in 3D
n = X_3d.shape[0]
D = np.sqrt(((X_3d[:, None, :] - X_3d[None, :, :]) ** 2).sum(axis=2))

in_mst = np.zeros(n, dtype=bool)   # vertices already attached to the tree
parent = -np.ones(n, dtype=int)    # tree neighbor of each vertex (-1 = none)
key = np.full(n, np.inf)           # cheapest known edge into the tree
key[0] = 0.0  # grow the tree from vertex 0
for _ in range(n):
    u = int(np.argmin(np.where(in_mst, np.inf, key)))
    in_mst[u] = True
    # relax all outside vertices that are closer to u than to the tree so far
    closer = ~in_mst & (D[u] < key)
    key[closer] = D[u, closer]
    parent[closer] = u

mst_edges = [(int(parent[v]), int(v)) for v in range(1, n) if parent[v] != -1]
length_mst = float(np.sum([D[a, b] for a, b in mst_edges]))
print(f"MST edges: {len(mst_edges)} (total length: {length_mst:.4f})")

# Plot
fig = plt.figure(figsize=(11, 9))
ax = fig.add_subplot(111, projection="3d")

# Color by cluster.
# plt.get_cmap is used instead of plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and later removed; it accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab10", len(unique_clusters))
cluster_to_color = {lab: cmap(idx) for idx, lab in enumerate(unique_clusters)}
colors = [cluster_to_color[c] for c in cluster_labels]
ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], s=18, c=colors, alpha=0.95)

# Draw kNN edges (light)
for a, b in knn_edges:
    ax.plot(
        [X_3d[a, 0], X_3d[b, 0]],
        [X_3d[a, 1], X_3d[b, 1]],
        [X_3d[a, 2], X_3d[b, 2]],
        color="gray", alpha=0.18, linewidth=0.6,
    )

# Draw MST edges (more transparent red)
for a, b in mst_edges:
    ax.plot(
        [X_3d[a, 0], X_3d[b, 0]],
        [X_3d[a, 1], X_3d[b, 1]],
        [X_3d[a, 2], X_3d[b, 2]],
        color="#d62728", alpha=0.55, linewidth=2.0,
    )

# Draw one enclosing shape per cluster:
#   - 4+ points with SciPy: the true 3D convex hull;
#   - exactly 3 points: the triangle they span;
#   - 4+ points without a 3D hull (SciPy missing or the hull computation
#     failed): the 2D hull in the PC1-PC2 plane drawn as a plate at mean PC3.
# Clusters with fewer than 3 points get no shape.
for lab in unique_clusters:
    pts = X_3d[cluster_labels == lab]
    color = cluster_to_color[lab]
    faces = None
    edge_width = 0.5

    if pts.shape[0] >= 4 and SCIPY_AVAILABLE:
        try:
            hull = ConvexHull(pts)
            faces = [pts[simplex] for simplex in hull.simplices]
        except Exception as exc:
            # Best effort: report the failure instead of silently dropping the
            # shape, then fall through to the flat-plate fallback below.
            print(f"Cluster {lab}: 3D convex hull failed ({type(exc).__name__}); using flat plate fallback")

    if faces is None and pts.shape[0] == 3:
        faces = [pts]
        edge_width = 0.6
    elif faces is None and pts.shape[0] >= 4:
        # Fallback: project to PC1-PC2 plane and draw a thin plate at mean Z
        hull2d = convex_hull_2d(pts[:, :2])
        if len(hull2d) >= 3:
            z = float(pts[:, 2].mean())
            faces = [np.column_stack([hull2d, np.full(len(hull2d), z)])]

    if faces is not None:
        poly3d = Poly3DCollection(faces, facecolor=color, edgecolor=color, linewidth=edge_width, alpha=0.12)
        ax.add_collection3d(poly3d)

# Labels and layout. The title is derived from the parameters so it cannot
# drift from them.
ax.set_title(f"L2 → PCA(3D) → KMeans(k={k_clusters}) + kNN(k={k_neighbors}) + MST + Cluster Hulls")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.tight_layout()
plt.show()

