# In [None]:
import os
import time
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Bring the variables defined in .env into the process environment
load_dotenv()

# The Pinecone key is mandatory: abort early when it is missing or empty
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if pinecone_api_key is None or pinecone_api_key == "":
    raise ValueError("PINECONE_API_KEY is not set in the .env file")

# Shared service clients used by every later step
pc = Pinecone(api_key=pinecone_api_key)  # Pinecone (gRPC) client
client = OpenAI()  # OpenAI client, constructed with no explicit arguments

# Function to clean metadata comprehensively
def clean_metadata(row):
    """Build a metadata dict of plain strings from one record.

    Args:
        row: Mapping-like object exposing ``.get(key, default)`` (for example
            one dict from ``DataFrame.to_dict(orient="records")``), keyed by
            the source column names listed below.

    Returns:
        dict: Always contains the same nine keys. Each value is the source
        string, or ``""`` when the column is missing, NaN/None, or holds
        anything that is not a ``str`` (numbers, lists, ...).
    """
    # Output metadata key -> source column name
    field_sources = {
        "title": "Title",
        "description": "Description",
        "difference": "How is the proposed idea different from previous attempts?",
        "benefits": "What is in it for the customer, the department, the company?",
        "category": "Category",
        "status": "Status",
        "submitter": "Submitter",
        "date_submitted": "Submitted",
        "comments": "Comments",
    }

    metadata = {}
    for key, column in field_sources.items():
        value = row.get(column, "")
        # A str check alone already rejects NaN, None and numbers. It also
        # avoids pd.isna(), which returns an array for list-like values and
        # makes a `pd.isna(value) or ...` test raise ValueError.
        metadata[key] = value if isinstance(value, str) else ""

    return metadata

# Step 1: Load and preprocess the data
file_path = os.path.expanduser("~/Desktop/Preprocessing/Ideas_Cleaned.csv")  # Adjust path as needed
data = pd.read_csv(file_path)

# Keep only rows whose 'LemmatizedText' is present and non-blank.
# The column is cast to str *before* the `.str` accessor is used: pandas only
# offers `.str` on string-like columns, so a column parsed as numeric would
# otherwise raise AttributeError. Missing values are excluded via notna() so
# the cast cannot smuggle them through as the literal text "nan".
lemmatized = data["LemmatizedText"]
has_text = lemmatized.notna() & (lemmatized.astype(str).str.strip() != "")

# .copy() makes `data` an independent frame, so the column assignment below
# is not a write into a slice of the loaded frame (no SettingWithCopyWarning).
data = data.loc[has_text].copy()
data["LemmatizedText"] = data["LemmatizedText"].astype(str)

# Limit to the first N rows for demonstration
limited_data = data.head(3000).copy()

# Step 2: Generate embeddings sequentially (one API request per row)
embeddings_list = []
text_progress = tqdm(limited_data["LemmatizedText"], desc="Generating embeddings")
for text in text_progress:
    vector = client.embeddings.create(
        model="text-embedding-3-large",
        input=text,
    ).data[0].embedding

    # Accept only a list made up entirely of floats
    is_float_list = isinstance(vector, list) and all(isinstance(x, float) for x in vector)
    if not is_float_list:
        raise ValueError(f"Invalid embedding: {vector}")

    embeddings_list.append(vector)

# Store each vector next to the row it was generated from
limited_data["Embeddings"] = embeddings_list

# Step 3: Prepare data for Pinecone upsertion
# One plain dict per DataFrame row, paired positionally with its embedding
row_records = limited_data.to_dict(orient="records")
upsert_data = []
for position, (vector, record) in enumerate(zip(embeddings_list, row_records)):
    upsert_data.append(
        {
            "id": str(position),  # Position within the limited set, used as the unique id
            "values": vector,  # Embedding vector
            "metadata": clean_metadata(record),
        }
    )

# Verify upsert data
print("Sample Upsert Data:")
sample_records = upsert_data[:3]  # First 3 records, enough to check the structure
print(sample_records)

# Step 4: Create or connect to a Pinecone index
index_name = "idea-index"
namespace = "bi-internal-ideas"

# Create the index only when no index with this name exists yet
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=3072,  # Set to the embedding dimension (3072 for this model)
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Wait for the index to be ready, but give up after a deadline: an unbounded
# poll would hang forever if the index never reports ready.
index_ready_timeout = 300  # seconds
ready_deadline = time.monotonic() + index_ready_timeout
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' not ready after {index_ready_timeout} seconds."
        )
    time.sleep(1)

# Connect to the index
index = pc.Index(index_name)

# Step 5: Upsert data to Pinecone
batch_size = 10  # Number of vectors sent per upsert call
batch_starts = range(0, len(upsert_data), batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    chunk = upsert_data[start:start + batch_size]
    upsert_result = index.upsert(vectors=chunk, namespace=namespace)
    print(f"Upsert response for batch {batch_number}: {upsert_result}")

# Check namespace stats after upsertion
stats = index.describe_index_stats()
namespace_stats = stats['namespaces']
print(f"Namespace Stats after upsertion: {namespace_stats}")

print("Data successfully upserted to Pinecone!")

# Step 6: Query the Pinecone index
def query_pinecone(query_text, index, namespace, top_k=5, model="text-embedding-3-large"):
    """Embed a text query and run a similarity search against a Pinecone index.

    Uses the module-level OpenAI ``client`` to create the query embedding.

    Args:
        query_text: Text to embed and search for.
        index: Pinecone index handle; must provide ``describe_index_stats()``
            and ``query(...)``.
        namespace: Namespace within the index to search.
        top_k: Maximum number of matches to request.
        model: Embedding model name. It has to be the model whose vectors
            were stored in the index; the default matches the model used for
            the upserted data.

    Returns:
        The response object returned by ``index.query`` (matches include
        metadata).

    Raises:
        ValueError: If the embedding returned for the query is not a list of
            floats.
        Exception: Any error raised by ``index.describe_index_stats()`` is
            reported and re-raised unchanged.
    """
    print("Validating Pinecone index...")

    # A stats call doubles as a cheap "is this index reachable" probe.
    # Only the call itself sits in the try block so unrelated errors are not
    # misreported as validation failures.
    try:
        index_stats = index.describe_index_stats()
    except Exception as e:
        print(f"Failed to validate the index. Error: {e}")
        raise  # bare raise keeps the original exception and traceback
    else:
        print("Index is valid. Stats:")
        print(index_stats)

    print("Generating query embedding...")
    # Generate embedding for the query
    response = client.embeddings.create(model=model, input=query_text)
    query_embedding = response.data[0].embedding

    if not isinstance(query_embedding, list) or not all(isinstance(v, float) for v in query_embedding):
        raise ValueError("Query embedding is invalid.")

    print("Querying Pinecone index...")
    # Query Pinecone
    result = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )
    print(f"Query successful. Retrieved {len(result['matches'])} results.")
    return result

# Function to wait for Pinecone index readiness
def wait_for_pinecone_index(index, namespace, max_wait=30, interval=2):
    """Block until the namespace reports at least one vector.

    The index is always checked at least once, and one final time when the
    waiting budget is used up, before giving up.

    Args:
        index: Index handle providing ``describe_index_stats()``; the result
            must expose a ``"namespaces"`` mapping.
        namespace: Namespace whose ``vector_count`` is polled.
        max_wait: Maximum total time to sleep, in seconds. Only sleep time is
            counted, not the time spent inside the stats calls.
        interval: Seconds to sleep between checks; must be positive.

    Returns:
        None, as soon as the namespace holds one or more vectors.

    Raises:
        ValueError: If ``interval`` is not positive (the loop could never
            make progress).
        TimeoutError: If the namespace is still empty after ``max_wait``
            seconds of waiting.
    """
    if interval <= 0:
        raise ValueError("interval must be a positive number of seconds")

    waited = 0
    while True:
        stats = index.describe_index_stats()
        vector_count = stats["namespaces"].get(namespace, {}).get("vector_count", 0)
        if vector_count > 0:
            print(f"Pinecone index is ready with {vector_count} vectors in namespace '{namespace}'.")
            return

        # Checked after the stats call so the last sleep is always followed
        # by one more look at the index.
        if waited >= max_wait:
            break

        print(f"Waiting for Pinecone index... (elapsed time: {waited}s)")
        # Clamp the final pause so the total never exceeds max_wait
        pause = min(interval, max_wait - waited)
        time.sleep(pause)
        waited += pause

    raise TimeoutError(f"Pinecone index not ready after {max_wait} seconds.")

# Example query
query_text = "How can we improve collaboration in teams?"

# Block until the namespace reports vectors (up to 60s, polling every 5s)
wait_for_pinecone_index(index, namespace, max_wait=60, interval=5)

# Run the similarity search for the example text
result = query_pinecone(query_text, index, namespace, top_k=5)

# Print one line per match; an absent "matches" entry prints nothing
print("Query Results:")
matches = result.get("matches", [])
for hit in matches:
    print(f"ID: {hit['id']}, Score: {hit['score']}, Metadata: {hit['metadata']}")
