In [1]:
# Import necessary libraries
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from collections import Counter
from pinecone.grpc import PineconeGRPC as Pinecone

# Load environment variables
load_dotenv()

# Set up Pinecone API key and initialize
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY is not set in the .env file")

pc = Pinecone(api_key=pinecone_api_key)

index_name = 'idea-index'
namespace = os.getenv("PINECONE_NAMESPACE")
index = pc.Index(index_name)

# Fetch ideas and embeddings
ideas = []  # Placeholder for actual idea text
embeddings = []
batch_size = 500

for i in range(0, len(all_ids), batch_size):
    batch_ids = all_ids[i:i + batch_size]
    response = index.fetch(ids=batch_ids, namespace=namespace)
    
    # Extract only the "values" from each vector
    embeddings.extend([vector["values"] for vector in response["vectors"].values()])
    ideas.extend([vector["metadata"]["title"] for vector in response["vectors"].values()])  # Adjust based on your metadata schema

# Convert embeddings to a NumPy array
embeddings = np.array(embeddings, dtype=np.float32)  # Ensure data type consistency

# Step 1: Scale the embeddings
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings)

# Step 2: Dimensionality reduction (Optional)
pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(scaled_embeddings)

# Step 3: Apply DBSCAN
# Set hyperparameters: eps (distance threshold) and min_samples (minimum points per cluster)
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
labels = dbscan.fit_predict(reduced_embeddings)

# Step 4: Visualization using t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(reduced_embeddings)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=labels, cmap='tab20', s=50)
plt.colorbar(scatter, label="Cluster")
plt.title("DBSCAN Clustering of Ideas")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.show()

# Step 5: Interpretation and Insights
# Map ideas to clusters
unique_labels = set(labels)
clustered_ideas = {cluster: [] for cluster in unique_labels if cluster != -1}  # Exclude noise points (-1)
noise_points = [idea for idea, label in zip(ideas, labels) if label == -1]

for idea, cluster in zip(ideas, labels):
    if cluster != -1:  # Exclude noise points
        clustered_ideas[cluster].append(idea)

# Generate insights
for cluster, idea_list in clustered_ideas.items():
    print(f"\nCluster {cluster}:")
    print(f"Number of ideas: {len(idea_list)}")
    
    # Show the most common words (optional)
    all_words = " ".join(idea_list).lower().split()
    common_words = Counter(all_words).most_common(5)
    print(f"Top words: {common_words}")
    
    # Sample ideas
    print("Sample ideas:")
    for sample_idea in idea_list[:3]:  # Show first 3 ideas in this cluster
        print(f"- {sample_idea}")
    
    # Actionable insights
    print(f"Insight: Cluster {cluster} appears to focus on {' and '.join([word[0] for word in common_words])}.")

# Step 6: Save results for further analysis
results_df = pd.DataFrame({"Idea": ideas, "Cluster": labels})
results_df.to_csv("dbscan_clustered_ideas.csv", index=False)
print("\nClustered ideas saved to 'dbscan_clustered_ideas.csv'.")

# Step 7: Analyze Noise
print(f"\nNoise points (ideas that don't belong to any cluster): {len(noise_points)}")
print("Sample noise points:")
for noise in noise_points[:5]:
    print(f"- {noise}")

print("Embedding shape:", np.array(embeddings).shape)
print("Sample embedding:", embeddings[:3])

embedding_matrix = np.array(embeddings)
variance = np.var(embedding_matrix, axis=0)
print("Variance across dimensions:", variance)
print("Mean variance:", np.mean(variance))


NameError: name 'all_ids' is not defined