# In [None]:
# Import necessary libraries
import os
import pandas as pd
import time
from openai import OpenAI
from dotenv import load_dotenv
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Load environment variables (the error below expects them to come from a .env file)
load_dotenv()

# The Pinecone key is mandatory: stop immediately when it is absent or blank
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if pinecone_api_key is None or pinecone_api_key == "":
    raise ValueError("PINECONE_API_KEY is not set in the .env file")

# Pinecone client, authenticated with the key read above
pc = Pinecone(api_key=pinecone_api_key)

# OpenAI client, constructed without explicit arguments
# (presumably it picks up its credentials from the environment -- confirm)
client = OpenAI()

# Step 1: Load the preprocessed data
file_path = os.path.expanduser("~/Desktop/Preprocessing/Ideas_Cleaned.csv")  # Adjust path as needed
data = pd.read_csv(file_path)

# Ensure 'LemmatizedText' is cleaned and valid.
# The cast to str happens *before* the .str accessor is used, so a cell that
# pandas parsed as a non-string value is still covered by the blank check.
data = data.dropna(subset=['LemmatizedText'])
data['LemmatizedText'] = data['LemmatizedText'].astype(str)
data = data[data['LemmatizedText'].str.strip() != ""]

# Limit to the first MAX_ROWS rows
MAX_ROWS = 1000
limited_data = data.head(MAX_ROWS).copy()

# Initialize an empty list to store embeddings (one per row, in row order)
embeddings_list = []

# Process each row one by one.
# `processed` counts rows actually handled; `label` is the DataFrame index
# label, which has gaps after the filtering above and is therefore only used
# to identify the offending row in the error message.
for processed, (label, text) in enumerate(limited_data['LemmatizedText'].items(), start=1):
    # Call the API for each LemmatizedText entry
    response = client.embeddings.create(
        model="text-embedding-3-large",
        input=text,  # Single string input
    )

    # Extract the embedding and validate that it is a list of floats
    embedding = response.data[0].embedding
    if not isinstance(embedding, list) or not all(isinstance(v, float) for v in embedding):
        raise ValueError(f"Invalid embedding at index {label}: {embedding}")

    # Append the valid embedding to the list
    embeddings_list.append(embedding)

    # Print progress every 10 processed rows
    if processed % 10 == 0:
        print(f"Processed {processed} entries...")

# Add embeddings back to the DataFrame
limited_data['Embeddings'] = embeddings_list

# Step 2: Prepare data for Pinecone upsertion
def _fill_missing(value, default=''):
    """Return *value*, or *default* when pandas considers it missing (NaN/None).

    pandas represents empty CSV cells as NaN, and the key is still present in
    the row dict, so the default passed to ``dict.get`` never applies to them.
    """
    return default if pd.isna(value) else value


# Each record pairs an embedding with the row it was computed from. The rows
# come from `limited_data` (the frame the embeddings were generated for), so
# the pairing does not depend on `data` happening to share the same order.
# IDs are the position of the row within `limited_data`.
upsert_data = [
    {
        "id": str(position),  # Unique identifier for each entry
        "values": embedding,  # Embedding vector
        "metadata": {         # Include metadata for search and filtering
            "title": _fill_missing(row['Title']),
            "description": _fill_missing(row['Description']),
            "category": _fill_missing(row.get('Category', '')),
            "status": _fill_missing(row.get('Status', '')),
            "submitter": _fill_missing(row.get('Submitter', '')),
            "date_submitted": _fill_missing(row.get('Submitted', '')),
        }
    }
    for position, (embedding, row) in enumerate(
        zip(embeddings_list, limited_data.to_dict(orient='records'))
    )
]

# Validate the upsert data: every vector must be a list of floats
for item in upsert_data:
    if not isinstance(item["values"], list) or not all(isinstance(v, float) for v in item["values"]):
        raise ValueError(f"Invalid values in upsert data: {item['values']}")

# Step 3: Create or connect to a Pinecone index
index_name = "idea-index"

# Only create the index when it is not already listed for this account
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=3072,  # Set to the embedding dimension (3072 for this model)
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Wait for the index to be ready, polling once per second.
# The wait is bounded so that an index that never becomes ready produces an
# error instead of hanging the script forever.
ready_timeout_seconds = 300
deadline = time.monotonic() + ready_timeout_seconds
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {ready_timeout_seconds} seconds"
        )
    time.sleep(1)

# Connect to the index
index = pc.Index(index_name)

# Step 4: Upsert data to Pinecone
batch_size = 10  # Process data in batches
for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i + batch_size]
    index.upsert(
        vectors=batch,
        namespace="bi-internal-ideas"
    )

print("Data successfully upserted to Pinecone!")

# Step 5: Query the Pinecone index
def query_pinecone(
    query_text,
    index,
    top_k=5,
    *,
    namespace="bi-internal-ideas",
    model="text-embedding-3-large",
    embedding_client=None,
):
    """Query a Pinecone index with a text input.

    The text is embedded first, then the resulting vector is used for a
    similarity query against ``index``.

    Args:
        query_text: Text to search for.
        index: Object exposing ``query(vector=..., top_k=..., include_metadata=...,
            namespace=...)``, e.g. the handle returned by ``pc.Index(...)``.
        top_k: Number of matches to request.
        namespace: Namespace to search. Defaults to the namespace the upsert
            step of this script writes to.
        model: Embedding model name. It should be the model the stored vectors
            were created with, otherwise the vectors are not comparable.
        embedding_client: Object exposing ``embeddings.create(model=..., input=...)``.
            When ``None``, the module-level ``client`` is used.

    Returns:
        Whatever ``index.query`` returns, unmodified.
    """
    # Fall back to the module-level OpenAI client only when none was supplied
    embedder = client if embedding_client is None else embedding_client

    response = embedder.embeddings.create(
        model=model,
        input=query_text,
    )
    # A single input yields a single embedding at position 0
    query_embedding = response.data[0].embedding

    return index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )
