In [28]:
# Import necessary libraries
import os
import numpy as np
from IPython.display import display, HTML
from tqdm import tqdm
from collections import defaultdict
from dotenv import load_dotenv
from openai import OpenAI
from pinecone.grpc import PineconeGRPC as Pinecone

# Load environment variables
load_dotenv()

# Set up Pinecone API key and initialize
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY is not set in the .env file")

pc = Pinecone(api_key=pinecone_api_key)

client = OpenAI()

# Connect to the index
index_name = 'idea-index'
namespace = os.getenv("PINECONE_NAMESPACE")

details = pc.describe_index(index_name)
index = pc.Index(host=details.host)

def get_all_vector_ids(index, namespace):
    """Fetch all vector IDs from the Pinecone index."""
    vector_ids = []

    print(f"Fetching vector IDs from namespace: {namespace}")

    for ids in index.list(namespace=namespace):
        vector_ids.extend(ids)

    print(f"Total vector IDs fetched: {len(vector_ids)}")

    return vector_ids


def get_all_embeddings(index, vector_ids, namespace, batch_size=100):
    """Fetch embeddings and metadata from Pinecone by vector IDs with a progress bar."""
    print(f"Fetching embeddings for {len(vector_ids)} vector IDs.")
    embeddings = []

    # Wrap the loop with tqdm for a progress bar
    for i in tqdm(range(0, len(vector_ids), batch_size), desc="Fetching Embeddings"):
        batch_ids = vector_ids[i:i + batch_size]
        response = index.fetch(ids=batch_ids, namespace=namespace)
        for vector_id, vector_data in response["vectors"].items():
            embeddings.append({
                "id": vector_id,
                "values": vector_data["values"],
                "metadata": vector_data["metadata"],
            })

    return embeddings


def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors."""
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


def group_duplicates(embeddings, threshold=0.95):
    """Group duplicates into clusters based on pairwise similarity."""
    print(f"Grouping duplicates with threshold: {threshold}")
    graph = defaultdict(set)  # Graph to store similarity connections

    # Build graph by connecting similar embeddings
    for i in tqdm(range(len(embeddings)), desc="Building Similarity Graph"):
        for j in range(i + 1, len(embeddings)):
            sim = cosine_similarity(embeddings[i]["values"], embeddings[j]["values"])
            if sim >= threshold:
                graph[embeddings[i]["id"]].add(embeddings[j]["id"])
                graph[embeddings[j]["id"]].add(embeddings[i]["id"])

    # Find connected components in the graph
    visited = set()
    clusters = []

    def dfs(node, cluster):
        """Depth-first search to find all connected nodes."""
        if node in visited:
            return
        visited.add(node)
        cluster.append(node)
        for neighbor in graph[node]:
            dfs(neighbor, cluster)

    for node in graph:
        if node not in visited:
            cluster = []
            dfs(node, cluster)
            clusters.append(cluster)

    return clusters


# Step 1: Fetch all vector IDs
vector_ids = get_all_vector_ids(index, namespace)

# Step 2: Fetch all embeddings
embeddings = get_all_embeddings(index, vector_ids, namespace)


# Step 3: Group duplicates
if embeddings:
    threshold = 0.85  # Adjust threshold as needed
    duplicate_clusters = group_duplicates(embeddings, threshold=threshold)
    if duplicate_clusters:
        for group_number, cluster in enumerate(duplicate_clusters, start=1):  # Start numbering from 1
            print(f"Duplicate Group {group_number}:")
            
            for idea_id in cluster:
                # Fetch metadata for the current idea
                idea_metadata = next(e["metadata"] for e in embeddings if e["id"] == idea_id)
                idea_code = idea_metadata.get("code", "N/A")
                idea_title = idea_metadata.get("title", "No Title")
                link_url = f"https://bi.brightidea.com/productrequests/{idea_code}"  # Replace with your actual base URL

                # Display the idea as a clickable link
                html_content = f'<a href="{link_url}" target="_blank">[{idea_code}]</a> - {idea_title}'
                display(HTML(html_content))
    else:
        print("No duplicate clusters found.")
else:
    print("No embeddings found in the index.")


Fetching vector IDs from namespace: bi-internal-ideas
Total vector IDs fetched: 2115
Fetching embeddings for 2115 vector IDs.


Fetching Embeddings: 100%|██████████| 22/22 [00:10<00:00,  2.05it/s]


Grouping duplicates with threshold: 0.85


Building Similarity Graph: 100%|██████████| 2115/2115 [04:18<00:00,  8.19it/s] 

Duplicate Group 1:





Duplicate Group 2:


Duplicate Group 3:


Duplicate Group 4:


Duplicate Group 5:


Duplicate Group 6:


Duplicate Group 7:


Duplicate Group 8:


Duplicate Group 9:


Duplicate Group 10:


Duplicate Group 11:
