# **Clustering using LLM**

### Installing required packages

In [None]:
pip install --upgrade openai

Collecting openai
  Downloading openai-1.102.0-py3-none-any.whl.metadata (29 kB)
Downloading openai-1.102.0-py3-none-any.whl (812 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 812.0/812.0 kB 3.3 MB/s eta 0:00:00
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.0
    Uninstalling openai-0.28.0:
      Successfully uninstalled openai-0.28.0
Successfully installed openai-1.102.0


### **1. Main code**

In [None]:
import os
import json
import math
from typing import List, Dict, Any
from collections import defaultdict

# ---------- CONFIG ----------
# Module-level settings read by the helper functions and by main() below.
USE_OPENAI = True  # Always use OpenAI
# Read the API key from Colab's secrets manager or env.
# Only ImportError is handled here: when google.colab is importable, any error
# raised by userdata.get() itself propagates and stops the cell.
try:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
except ImportError:
    # Not running inside Colab: fall back to the process environment.
    OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
    if not OPENAI_API_KEY:
        print("Warning: OPENAI_API_KEY not found in environment variables. Please set it or use Colab secrets.")

JSON_PATH = "/content/Updated_combined_content_With_Both(Slide_and_Presentation)_Summaries_V4.json"  # path to the uploaded json
OUTPUT_DIR = "./llm_clusters_out"  # directory that receives clusters_full.json and clusters_preview.csv
EMBEDDING_MODEL_OPENAI = "text-embedding-3-small"  # default model for get_openai_embeddings()
LLM_MODEL_FOR_LABEL = "gpt-4o-mini"  # chat model used by label_with_llm()
# ----------------------------

# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Required packages
try:
    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import umap.umap_ as umap
    import hdbscan
    from tqdm import tqdm
except Exception as e:
    raise ImportError("Please install required packages: numpy pandas scikit-learn umap-learn hdbscan tqdm") from e

# OpenAI client: stays None unless OpenAI is enabled, a key is available and
# the client could be constructed.
openai_client = None
if USE_OPENAI:
    if not OPENAI_API_KEY:
        print("Warning: USE_OPENAI is True but OPENAI_API_KEY is not set.")
    else:
        try:
            from openai import OpenAI
            openai_client = OpenAI(api_key=OPENAI_API_KEY)
        except Exception:
            # Import or construction failed; downstream code checks for None.
            openai_client = None
            print("Warning: could not initialize OpenAI client.")

# ---------------- Helper functions ----------------

def load_json(path: str) -> List[Dict[str, Any]]:
    """Read the UTF-8 JSON file at ``path`` and return its decoded content.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file; callers in this notebook
        expect a list of per-presentation dicts.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Open the path that was passed in, not the module-level JSON_PATH.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def build_document_text(entry: Dict[str, Any]) -> str:
    """Concatenate pptx_name, proxy_summary, slide texts and tags into one representative string.

    Args:
        entry: One presentation record. The keys read are ``pptx_name``,
            ``proxy_summary`` and ``slide_details``; each slide may carry
            ``Text`` and ``Tags associated``.

    Returns:
        A single-line string with all whitespace runs collapsed to one space,
        truncated to 15000 characters. Missing or empty fields are skipped.
    """
    parts = [str(entry.get('pptx_name') or '')]

    proxy = entry.get('proxy_summary') or ''
    if proxy:
        parts.append(str(proxy))

    tags = []
    # `or []` also covers an explicit null value, which `.get(key, [])` would
    # pass through as None and then fail to iterate.
    for s in entry.get('slide_details') or []:
        if not isinstance(s, dict):
            continue
        t = s.get('Text') or ''
        if t:
            parts.append(str(t))
        ts = s.get('Tags associated') or []
        if isinstance(ts, list):
            # Tags are not guaranteed to be strings; join() needs strings.
            tags.extend(str(tag) for tag in ts if tag is not None)

    if tags:
        parts.append(' '.join(tags))

    # split()/join collapses newlines and repeated blanks into single spaces.
    doc = ' '.join('\n'.join(parts).split())
    return doc[:15000]

def get_openai_embeddings(texts: List[str], model: str = EMBEDDING_MODEL_OPENAI, batch_size: int = 16) -> List[List[float]]:
    """Embed ``texts`` through the OpenAI embeddings endpoint, batch by batch.

    Args:
        texts: Strings to embed.
        model: Embedding model name passed to the API.
        batch_size: Number of texts sent per request.

    Returns:
        One embedding per response item, collected in the order the batches
        were sent and the order items appear in each response.
    """
    assert openai_client is not None, "OpenAI not available; set OPENAI_API_KEY correctly"
    vectors = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="OpenAI embeddings"):
        response = openai_client.embeddings.create(model=model, input=texts[start:start + batch_size])
        vectors.extend(item.embedding for item in response.data)
    return vectors

def compute_embeddings(texts: List[str]) -> np.ndarray:
    """Return the OpenAI embeddings of ``texts`` as a NumPy array.

    Raises:
        RuntimeError: If the module-level OpenAI client was not initialized.
    """
    if openai_client is not None:
        return np.array(get_openai_embeddings(texts))
    raise RuntimeError("OpenAI client not initialized. Please set OPENAI_API_KEY.")

def reduce_dim(embeddings: np.ndarray, n_components: int = 64) -> np.ndarray:
    """Project ``embeddings`` to ``n_components`` dimensions with UMAP.

    The reducer is seeded with ``random_state=42``.
    """
    return umap.UMAP(n_components=n_components, random_state=42).fit_transform(embeddings)

def cluster_embeddings(emb_reduced: np.ndarray, min_cluster_size: int = 2) -> np.ndarray:
    """Cluster the reduced embeddings with HDBSCAN (euclidean metric).

    Returns the result of ``fit_predict``; the rest of this file treats the
    label ``-1`` as noise.
    """
    model = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean')
    return model.fit_predict(emb_reduced)

def pick_representatives(texts: List[str], embeddings: np.ndarray, labels: np.ndarray, topk: int = 3) -> Dict[int, List[int]]:
    """Pick, for every non-noise cluster, the members closest to its centroid.

    Args:
        texts: Not used by this function; kept for the existing call signature.
        embeddings: Row ``i`` is the vector of document ``i``.
        labels: Cluster label per document; ``-1`` is skipped.
        topk: Maximum number of representatives per cluster.

    Returns:
        Mapping of cluster label to document indices, ordered by decreasing
        cosine similarity to the cluster's mean vector.
    """
    representatives = {}
    for cluster_label in sorted(set(labels)):
        if cluster_label == -1:
            continue
        member_idxs = [pos for pos, value in enumerate(labels) if value == cluster_label]
        if not member_idxs:
            continue
        member_vectors = embeddings[member_idxs]
        center = member_vectors.mean(axis=0, keepdims=True)
        closeness = cosine_similarity(member_vectors, center).flatten()
        ranked = np.argsort(-closeness)
        keep = min(topk, len(ranked))
        representatives[cluster_label] = [member_idxs[rank] for rank in ranked[:keep]]
    return representatives

def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    lines = stripped.splitlines()[1:]  # drop the opening fence line (``` or ```json)
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def label_with_llm(cluster_examples: List[str], system_prompt: str = None) -> str:
    """Ask an LLM to label clusters. Falls back to TF-IDF if OpenAI unavailable.

    Args:
        cluster_examples: Representative snippets of one cluster.
        system_prompt: Optional replacement for the built-in system prompt.

    Returns:
        ``"<label> -- <description>"`` when the model reply parses as a JSON
        object (a Markdown code fence around the JSON is tolerated). If the
        reply cannot be parsed, the raw reply text is returned unchanged.
        Without an OpenAI client, the label and description are built from the
        highest-scoring TF-IDF terms of the snippets.
    """
    if openai_client is not None:
        messages = [
            {"role": "system", "content": system_prompt or "You are an assistant that reads a few document snippets and returns a concise cluster label (3-5 words) and a 1-2 sentence description."},
            {"role": "user", "content": "Here are representative snippets from documents in a cluster:\n\n" + "\n---\n".join(cluster_examples) + "\n\nPlease reply in JSON format: {\"label\": \"...\", \"description\": \"...\"}. Keep the label short (max 5 words)."}
        ]
        resp = openai_client.chat.completions.create(
            model=LLM_MODEL_FOR_LABEL,
            messages=messages,
            temperature=0.2,
            max_tokens=200
        )
        # content may be None (e.g. a refusal); treat that as an empty reply.
        text = (resp.choices[0].message.content or '').strip()
        try:
            # Chat models often wrap JSON in ```json fences; unwrap before parsing
            # so the fenced blob does not end up being used as the label.
            parsed = json.loads(_strip_code_fence(text))
            return parsed.get('label', '') + ' -- ' + parsed.get('description', '')
        except (ValueError, AttributeError, TypeError):
            # Not JSON, not a JSON object, or non-string fields: hand back the raw reply.
            return text
    else:
        # fallback
        vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=50, stop_words='english')
        X = vectorizer.fit_transform(cluster_examples)
        sums = np.asarray(X.sum(axis=0)).ravel()
        terms = np.array(vectorizer.get_feature_names_out())
        top_terms = terms[np.argsort(-sums)][:5]
        label = " / ".join(top_terms[:3])
        desc = "Representative keywords: " + ", ".join(top_terms)
        return label + ' -- ' + desc

# ---------------- Main pipeline ----------------

def main():
    """Run the pipeline: load, embed, reduce, cluster, label, and export.

    Writes ``clusters_full.json`` and ``clusters_preview.csv`` into
    ``OUTPUT_DIR`` and prints a per-cluster summary, largest cluster first.
    """
    def _new_cluster():
        # Fresh record per cluster id; key order is the order used downstream.
        return {'members': [], 'representatives': [], 'label': None, 'description': None}

    print("Loading JSON...")
    data = load_json(JSON_PATH)
    docs = [build_document_text(entry) for entry in data]
    meta = [
        {'file_id': entry.get('file_id'), 'pptx_name': entry.get('pptx_name')}
        for entry in data
    ]

    print(f"Building embeddings for {len(docs)} documents...")
    embs = compute_embeddings(docs)

    print("Reducing dimensionality (UMAP)...")
    emb_reduced = reduce_dim(embs, n_components=64)

    print("Clustering with HDBSCAN...")
    smallest_cluster = max(2, len(docs)//20)
    labels = cluster_embeddings(emb_reduced, min_cluster_size=smallest_cluster)

    print("Selecting representative documents for each cluster...")
    reps = pick_representatives(docs, embs, labels, topk=3)

    # Group document indices by cluster id, in order of first appearance.
    clusters_out = {}
    for doc_idx, lab in enumerate(labels):
        clusters_out.setdefault(int(lab), _new_cluster())['members'].append(doc_idx)
    for lab, rep_idxs in reps.items():
        clusters_out.setdefault(int(lab), _new_cluster())['representatives'] = rep_idxs

    print("Labeling clusters with LLM/fallback...")
    for lab, cluster in clusters_out.items():
        rep_idxs = cluster['representatives']
        if lab == -1:
            cluster['label'] = 'Noise / Miscellaneous'
            cluster['description'] = 'Documents that did not fit any clear cluster.'
        elif not rep_idxs:
            cluster['label'] = 'Unlabeled Cluster'
            cluster['description'] = 'No representative texts found.'
        else:
            snippets = [docs[idx][:1000] for idx in rep_idxs]
            try:
                label_desc = label_with_llm(snippets)
            except Exception as e:
                label_desc = "Error labeling cluster: " + str(e)
            # Text before the first ' -- ' is the label, the rest the description;
            # without a separator the whole text is the label.
            head, _, tail = label_desc.partition(' -- ')
            cluster['label'] = head.strip()
            cluster['description'] = tail.strip()

    results = sorted(
        (
            {
                'cluster_id': lab,
                'label': cluster['label'],
                'description': cluster['description'],
                'num_members': len(cluster['members']),
                'members': [meta[i] for i in cluster['members']],
                'representatives': [meta[i] for i in cluster['representatives']],
            }
            for lab, cluster in clusters_out.items()
        ),
        key=lambda x: x['num_members'],
        reverse=True,
    )

    out_json = os.path.join(OUTPUT_DIR, 'clusters_full.json')
    with open(out_json, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)

    # One CSV row per (cluster, member) pair.
    rows = [
        {'cluster_id': r['cluster_id'], 'cluster_label': r['label'], 'pptx_name': m['pptx_name'], 'file_id': m['file_id']}
        for r in results
        for m in r['members']
    ]
    csv_path = os.path.join(OUTPUT_DIR, 'clusters_preview.csv')
    pd.DataFrame(rows).to_csv(csv_path, index=False)

    print(f"Done. Results saved to: {out_json} and {csv_path}")
    print("Summary:")
    for r in results:
        print(f"Cluster {r['cluster_id']}: {r['label']} ({r['num_members']} docs)")

if __name__ == '__main__':
    main()

Loading JSON...
Building embeddings for 578 documents...


OpenAI embeddings: 100%|██████████| 37/37 [00:40<00:00,  1.10s/it]


Reducing dimensionality (UMAP)...



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Clustering with HDBSCAN...
Selecting representative documents for each cluster...
Labeling clusters with LLM/fallback...



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



Done. Results saved to: ./llm_clusters_out/clusters_full.json and ./llm_clusters_out/clusters_preview.csv
Summary:
Cluster 2: AI Solutions and Applications (344 docs)
Cluster 1: Analytics Consultancy Overview (104 docs)
Cluster -1: Noise / Miscellaneous (96 docs)
Cluster 0: AI Solutions in Healthcare (34 docs)


### Basic cluster visualization using pptx_name

In [None]:
import plotly.express as px
import pandas as pd
import json
import os

# Load the full clustering results written by main()
clusters_json_path = os.path.join(OUTPUT_DIR, 'clusters_full.json')
with open(clusters_json_path, 'r', encoding='utf-8') as f:
    clusters_data = json.load(f)

# Prepare data for sunburst chart
# We need a hierarchy: Cluster -> Document
# Each row's `parents` must be the plain id string of its parent node ("" for
# a root); wrapping it in a list only renders through implicit coercion.
data_for_sunburst = []
for cluster in clusters_data:
    cluster_id = cluster['cluster_id']
    cluster_label = cluster['label']
    num_members = cluster['num_members']
    cluster_node_id = f"cluster_{cluster_id}"

    # Add cluster as a top-level node
    data_for_sunburst.append(dict(
        ids=cluster_node_id,
        labels=f"Cluster {cluster_id}: {cluster_label} ({num_members})",
        parents="", # Top-level node has empty parent
        value=num_members # Value can be number of members
    ))

    # Add members as child nodes of the cluster
    for member in cluster['members']:
        file_id = member['file_id']
        pptx_name = member['pptx_name']
        data_for_sunburst.append(dict(
            ids=file_id,
            labels=pptx_name,
            parents=cluster_node_id,
            value=1 # Each document adds 1 to the count
        ))

# Create DataFrame for Plotly
sunburst_df = pd.DataFrame(data_for_sunburst)

# Create the sunburst chart
fig = px.sunburst(
    sunburst_df,
    ids='ids',
    names='labels',
    parents='parents',
    values='value',
    title='Document Clusters Sunburst Chart'
)

# Update layout for better readability
fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))

# Show the plot
fig.show()

### 2. Clustering and visualization using pptx_path

In [None]:
import plotly.express as px
import pandas as pd
import json
import os
from collections import defaultdict

# Load the full clustering results
# (no error handling here: a missing clusters_full.json stops the cell)
clusters_json_path = os.path.join(OUTPUT_DIR, 'clusters_full.json')
with open(clusters_json_path, 'r', encoding='utf-8') as f:
    clusters_data = json.load(f)

# Load the original data to get pptx_path for hierarchy information.
# On any failure both original_data and file_id_info are reset to empty
# containers so the rest of the cell still runs.
try:
    with open(JSON_PATH, 'r', encoding='utf-8') as f:
        original_data = json.load(f)
    # Create a mapping from file_id to pptx_path and pptx_name
    # ('UnknownName' is used only when the pptx_name key is absent)
    file_id_info = {entry.get('file_id'): {'pptx_path': entry.get('pptx_path'), 'pptx_name': entry.get('pptx_name', 'UnknownName')} for entry in original_data}
except FileNotFoundError:
    print(f"Error: Original data file not found at {JSON_PATH}. Cannot build folder hierarchy.")
    original_data = []
    file_id_info = {}
except Exception as e:
    # Covers malformed JSON as well as entries that are not dict-like.
    print(f"Error loading original data: {e}")
    original_data = []
    file_id_info = {}


# Prepare data for sunburst chart with hierarchy: Folder -> ... -> Cluster -> Document
data_for_sunburst = []
added_nodes = set() # To keep track of added nodes (folders, clusters, documents)

# Add a root node representing the starting point of the visualization.
# added_nodes was just created empty, so the root is always new here.
root_label = "Algo_Org_PPTs"
root_id = root_label.replace(" ", "_") # Simple ID based on label
data_for_sunburst.append(dict(
    ids=root_id,
    labels=root_label,
    parents="", # Root node has an empty parent id (a plain string, not a list)
    value=0, # Root carries no count of its own; documents below carry value=1
    cluster_id=None # Add cluster_id for coloring
))
added_nodes.add(root_id)

# Dictionary to hold hierarchical structure based on pptx_path parts after the root.
#   folder node:   {'children': {name: node, ...}}
#   document node: {'cluster_id': ..., 'file_id': ..., 'pptx_name': ..., 'label': ...}
hierarchy_tree = {}

# Populate the hierarchy tree based on pptx_path
for cluster in clusters_data:
    cluster_id = cluster['cluster_id']

    for member in cluster['members']:
        file_id = member['file_id']
        info = file_id_info.get(file_id, {})
        pptx_path = info.get('pptx_path')
        # Use pptx_name from the original data, then from the cluster data
        pptx_name = info.get('pptx_name', member.get('pptx_name', f"Unknown_{file_id}"))

        if not pptx_path:
            continue # Skip documents without a pptx_path

        # Find the part of the path after 'Algo_Org_PPTs'
        path_str = str(pptx_path)
        # Handle potential absolute paths or leading slashes
        cleaned_pptx_path = path_str.lstrip('/')
        path_parts_full = cleaned_pptx_path.split('/')
        if root_label not in path_parts_full:
            # If the root label is not in the path, skip this document
            continue
        root_index = path_parts_full.index(root_label)
        path_parts_after_root = path_parts_full[root_index + 1:]

        if path_parts_after_root:
            # Everything but the last part is a sub-folder; the last part is the filename
            folder_parts = path_parts_after_root[:-1]
            doc_name = path_parts_after_root[-1]
        else:
            # The path ends at the root folder itself: file the document
            # directly under the root, keyed by its pptx_name
            folder_parts = []
            doc_name = pptx_name

        # Walk (and create on demand) the folder nodes leading to the document
        current_level = hierarchy_tree
        for part in folder_parts:
            if part not in current_level:
                current_level[part] = {'children': {}}
            current_level = current_level[part]['children']

        # Store cluster info at the leaf. On a duplicate name at the same level
        # the first document found wins, so cluster_id, file_id and pptx_name
        # of a node always belong to the same member.
        if doc_name not in current_level:
            current_level[doc_name] = {'cluster_id': cluster_id, 'file_id': file_id, 'pptx_name': pptx_name, 'label': doc_name}


# Function to flatten the hierarchy tree into the sunburst data format
def flatten_hierarchy(tree, parent_id, data_list, added_nodes_set, clusters_data_map):
    """Append sunburst rows for ``tree`` to ``data_list``, depth first.

    Folder nodes (dicts with a ``'children'`` key) become one row each and are
    recursed into. Document nodes (dicts with a ``'cluster_id'`` key) are placed
    under a per-folder cluster row, which is created the first time a document
    of that cluster is seen in the folder.

    Args:
        tree: Mapping of name -> node for one level of the hierarchy.
        parent_id: Row id of the node that owns this level.
        data_list: List that receives the row dicts (mutated in place).
        added_nodes_set: Ids already emitted; used to avoid duplicate rows
            (mutated in place).
        clusters_data_map: Mapping of cluster id -> cluster record with a
            ``'label'`` entry.

    Every row has the keys ``ids``, ``labels``, ``parents``, ``value`` and
    ``cluster_id``; ``parents`` is the parent's id as a plain string.
    """
    for name, node_data in tree.items():
        if 'children' in node_data: # It's a folder node
            current_id = f"{parent_id}/{name}" # Unique ID based on path
            if current_id not in added_nodes_set:
                data_list.append(dict(
                    ids=current_id,
                    labels=name,
                    parents=parent_id,
                    value=0, # Folders carry no count of their own
                    cluster_id=None # Folders don't have a specific cluster ID for coloring
                ))
                added_nodes_set.add(current_id)
            flatten_hierarchy(node_data['children'], current_id, data_list, added_nodes_set, clusters_data_map)
        elif 'cluster_id' in node_data: # It's a document node
            cluster_id = node_data['cluster_id']
            file_id = node_data['file_id']
            pptx_name = node_data['pptx_name']
            doc_label = node_data['label'] # The filename, used when pptx_name is empty

            # Get the cluster label, falling back to a generic one
            cluster_info = clusters_data_map.get(cluster_id, {'label': f'Cluster {cluster_id}'})
            cluster_label_text = f"Cluster {cluster_id}: {cluster_info['label']}"

            # The cluster row is scoped to this folder, so the same cluster can
            # appear under several folders with distinct ids
            cluster_node_id_in_path = f"{parent_id}_cluster_{cluster_id}"
            if cluster_node_id_in_path not in added_nodes_set:
                data_list.append(dict(
                    ids=cluster_node_id_in_path,
                    labels=cluster_label_text,
                    parents=parent_id, # Parent is the current folder node
                    value=0, # Cluster rows carry no count of their own
                    cluster_id=cluster_id # Assign cluster_id for coloring the cluster node
                ))
                added_nodes_set.add(cluster_node_id_in_path)

            # Make the document ID unique under its cluster and path
            doc_unique_id = f"{cluster_node_id_in_path}_{file_id}"
            if doc_unique_id not in added_nodes_set:
                data_list.append(dict(
                    ids=doc_unique_id,
                    labels=pptx_name if pptx_name else doc_label,
                    parents=cluster_node_id_in_path, # Parent is the cluster node under this folder
                    value=1, # Each document adds 1 to the count
                    cluster_id=cluster_id # Assign cluster_id for coloring the document node
                ))
                added_nodes_set.add(doc_unique_id)


# Index the cluster records by their id for lookup while flattening
clusters_data_map = {record['cluster_id']: record for record in clusters_data}

# Turn the nested tree into flat rows, appended after the root row
flatten_hierarchy(hierarchy_tree, root_id, data_for_sunburst, added_nodes, clusters_data_map)

# Build the Plotly input frame; cluster_id is cast to string so it is
# treated as a category when coloring
sunburst_df = pd.DataFrame(data_for_sunburst).astype({'cluster_id': str})

# Create the sunburst chart from the explicit id / parent columns
fig = px.sunburst(
    data_frame=sunburst_df,
    ids='ids',
    parents='parents',
    names='labels',
    values='value',
    color='cluster_id',
    title=f'Document Clusters Sunburst Chart with Folder Hierarchy (Starting from "{root_label}")',
)

# Remove the outer margins so the chart fills the output area
fig.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})

# Show the plot
fig.show()

### 3. Saving the JSON with path and cluster data

In [None]:
import json
import os

# Input and output locations (OUTPUT_DIR and JSON_PATH come from an earlier cell)
clusters_json_path = os.path.join(OUTPUT_DIR, 'clusters_full.json')
original_json_path = JSON_PATH # Assuming JSON_PATH is defined in a previous cell
output_json_path_with_paths = os.path.join(OUTPUT_DIR, 'clusters_with_paths.json')

# Read the clustering results; fall back to an empty list on any failure
try:
    with open(clusters_json_path, 'r', encoding='utf-8') as f:
        clusters_data = json.load(f)
except FileNotFoundError:
    print(f"Error: Clusters JSON file not found at {clusters_json_path}. Please run the clustering code first.")
    clusters_data = []
except Exception as e:
    print(f"Error loading clusters JSON: {e}")
    clusters_data = []


# Build the file_id -> pptx_path lookup from the original data; it stays
# empty when the original file cannot be read
file_id_to_pptx_path = {}
try:
    with open(original_json_path, 'r', encoding='utf-8') as f:
        original_data = json.load(f)
    file_id_to_pptx_path = {entry.get('file_id'): entry.get('pptx_path', 'NoPathFound') for entry in original_data}
except FileNotFoundError:
    print(f"Error: Original JSON file not found at {original_json_path}.")
except Exception as e:
    print(f"Error loading original JSON data: {e}")


# Copy every cluster and give each member a 'pptx_path' entry: the looked-up
# path if the file_id is known, else the member's pptx_name, else 'NoPathFound'
updated_clusters_data = [
    {
        **cluster,
        'members': [
            {
                **member,
                'pptx_path': file_id_to_pptx_path.get(
                    member.get('file_id'),
                    member.get('pptx_name', 'NoPathFound'),
                ),
            }
            for member in cluster.get('members', [])
        ],
    }
    for cluster in clusters_data
]


# Write the enriched clusters to a new JSON file (nothing is written when empty)
if not updated_clusters_data:
    print("No cluster data to save.")
else:
    try:
        with open(output_json_path_with_paths, 'w', encoding='utf-8') as f:
            json.dump(updated_clusters_data, f, indent=4)
        print(f"Cluster data with file paths saved to: {output_json_path_with_paths}")
    except Exception as e:
        print(f"Error saving updated cluster data: {e}")

Cluster data with file paths saved to: ./llm_clusters_out/clusters_with_paths.json
