In [21]:
# Import necessary libraries
import os
import json
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI
from pinecone.grpc import PineconeGRPC as Pinecone

# Load environment variables
load_dotenv()

# Set up Pinecone API key and initialize
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY is not set in the .env file")

pc = Pinecone(api_key=pinecone_api_key)

client = OpenAI()

# Connect to the index
index_name = 'idea-index'
namespace = os.getenv("PINECONE_NAMESPACE")

details = pc.describe_index(index_name)
index = pc.Index(host=details.host)

def get_all_vector_ids(index, namespace):
    """Fetch all vector IDs from the Pinecone index."""
    vector_ids = []

    print(f"Fetching vector IDs from namespace: {namespace}")

    for ids in index.list(namespace=namespace):
        vector_ids.extend(ids)

    print(f"Total vector IDs fetched: {len(vector_ids)}")

    return vector_ids



def get_all_embeddings(index, vector_ids, namespace, batch_size=100):
    """Fetch embeddings and metadata from Pinecone by vector IDs with a progress bar."""
    print(f"Fetching embeddings for {len(vector_ids)} vector IDs.")
    embeddings = []

    # Wrap the loop with tqdm for a progress bar
    for i in tqdm(range(0, len(vector_ids), batch_size), desc="Fetching Embeddings"):
        batch_ids = vector_ids[i:i + batch_size]
        response = index.fetch(ids=batch_ids, namespace=namespace)
        for vector_id, vector_data in response["vectors"].items():
            embeddings.append({
                "id": vector_id,
                "values": vector_data["values"],
                "metadata": vector_data["metadata"],
            })

    return embeddings


def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors."""
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def pairwise_similarity(embeddings, threshold=0.95):
    """Perform pairwise similarity between all embeddings."""
    print(f"Calculating pairwise similarity with threshold: {threshold}")
    results = []

    # Wrap the outer loop with tqdm for progress tracking
    for i in tqdm(range(len(embeddings)), desc="Pairwise Similarity"):
        for j in range(i + 1, len(embeddings)):  # Avoid duplicate pairs
            sim = cosine_similarity(embeddings[i]["values"], embeddings[j]["values"])
            if sim >= threshold:
                results.append({
                    "id1": embeddings[i]["id"],
                    "id2": embeddings[j]["id"],
                    "similarity": sim,
                    "metadata1": embeddings[i]["metadata"],
                    "metadata2": embeddings[j]["metadata"]
                })

    return results

# Step 1: Fetch all vector IDs
vector_ids = get_all_vector_ids(index, namespace)

# Step 2: Fetch all embeddings
embeddings = get_all_embeddings(index, vector_ids, namespace)

# Step 3: Calculate pairwise similarity
if embeddings:
    threshold = 0.90  # Adjust threshold as needed
    similar_items = pairwise_similarity(embeddings, threshold=threshold)
    if similar_items:
        print("Duplicates found:")
        for item in similar_items:
            print(f"ID1: {item['id1']}, ID2: {item['id2']}, Similarity: {item['similarity']}")
    else:
        print("No duplicates found.")
else:
    print("No embeddings found in the index.")


Fetching vector IDs from namespace: bi-internal-ideas
Total vector IDs fetched: 2115
Fetching embeddings for 2115 vector IDs.


Fetching Embeddings: 100%|██████████| 22/22 [00:09<00:00,  2.27it/s]


Calculating pairwise similarity with threshold: 0.9


Pairwise Similarity: 100%|██████████| 2115/2115 [04:20<00:00,  8.13it/s] 

Duplicates found:
ID1: 1057, ID2: 1343, Similarity: 0.9224126332633487
ID1: 1057, ID2: 1428, Similarity: 0.9492330376713959
ID1: 1342, ID2: 877, Similarity: 0.9044388957220413
ID1: 1343, ID2: 1428, Similarity: 0.9411793056984264
ID1: 1468, ID2: 1467, Similarity: 0.9999990514307626
ID1: 1468, ID2: 1471, Similarity: 0.9475324074467495
ID1: 1468, ID2: 1470, Similarity: 0.9475324074467495
ID1: 1467, ID2: 1471, Similarity: 0.9475247891294817
ID1: 1467, ID2: 1470, Similarity: 0.9475247891294817
ID1: 1469, ID2: 1471, Similarity: 0.9076110114831853
ID1: 1469, ID2: 1470, Similarity: 0.9076110114831853
ID1: 1471, ID2: 1470, Similarity: 1.0
ID1: 1710, ID2: 1845, Similarity: 0.9097417833587553
ID1: 1732, ID2: 1730, Similarity: 0.9559935107954475
ID1: 1948, ID2: 2044, Similarity: 0.9021673468451203



