In [16]:
# Import necessary libraries
import os
import pandas as pd
import time
from openai import OpenAI
from dotenv import load_dotenv
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Load environment variables
load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY is not set in the .env file")

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)

client = OpenAI()

# Step 1: Load the preprocessed data
file_path = os.path.expanduser("~/Desktop/Preprocessing/Ideas_Cleaned.csv")  # Adjust path as needed
data = pd.read_csv(file_path)

# Ensure 'LemmatizedText' is cleaned and valid
data = data.dropna(subset=['LemmatizedText'])
data = data[data['LemmatizedText'].str.strip() != ""]
data['LemmatizedText'] = data['LemmatizedText'].astype(str)

# Limit to the first 100 rows
limited_data = data.head(5).copy()

# Initialize an empty list to store embeddings
embeddings_list = []

# Process each row one by one
for index, row in limited_data.iterrows():
    # Call the API for each LemmatizedText entry
    response = client.embeddings.create(
        model="text-embedding-3-large",
        input=row['LemmatizedText'],  # Single string input
        encoding_format="float"
    )
    # Extract the embedding and append to the list
    embeddings_list.append(response.data[0].embedding)

    # Print progress
    if index % 10 == 0:
        print(f"Processed {index} entries...")

# Add embeddings back to the DataFrame
#data['Embeddings'] = embeddings_list
limited_data['Embeddings'] = embeddings_list

# Verify results
print(limited_data.head())


# Step 3: Prepare data for Pinecone upsertion
upsert_data = [
    {
        "id": str(index),  # Unique identifier for each entry
        "values": embedding,  # Embedding vector
        "metadata": {         # Include metadata for search and filtering
            "title": row['Title'],
            "description": row['Description'],
            "category": row.get('Category', ''),
            "status": row.get('Status', ''),
            "submitter": row.get('Submitter', ''),
            "date_submitted": row.get('Submitted', ''),
        }
    }
    for index, (embedding, row) in enumerate(zip(limited_data, data.to_dict(orient='records')))
]

# Step 4: Create or connect to a Pinecone index
index_name = "idea-index"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=3072,  # Set to the embedding dimension (3072 for this model)
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        )
    )

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

# Connect to the index
index = pc.Index(index_name)

# Step 5: Upsert data to Pinecone
batch_size = 10  # Process data in batches
for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i + batch_size]
    index.upsert(
        vectors=batch,
        namespace="bi-internal-ideas"
    )

print("Data successfully upserted to Pinecone!")

# Step 6: Query the Pinecone index
def query_pinecone(query_text, index, top_k=5):
    """Query Pinecone index with a text input."""
    query_embedding = embeddings.embed_query(query_text)
    response = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        namespace="bi-internal-ideas"
    )
    return response

# Example query
query_text = input("Enter a query to search the index: ")
results = query_pinecone(query_text, index)
print(f"Query Results: {results}")


Processed 0 entries...
     Code              Category         Submitted             Submitter  \
0  D27603           Enhancement  12/20/2024 18:54               J Leone   
1  D27602           Enhancement  12/19/2024 10:25               J Leone   
2  D27600           Enhancement  12/18/2024 11:09  Brian Wermerskirchen   
3  D27599           Enhancement  12/17/2024 15:53       Emerson Lambert   
4  D27595  Reporting/Dashboards  12/13/2024 19:59         France Dreyer   

                  Submitter email Team Name Submission Team  \
0           jleone@brightidea.com       NaN             NaN   
1           jleone@brightidea.com       NaN             NaN   
2  bwermerskirchen@brightidea.com       NaN             NaN   
3         elambert@brightidea.com       NaN             NaN   
4          fdreyer@brightidea.com       NaN             NaN   

  Submission Team email                                              Title  \
0                   NaN                                AI-First White

AttributeError: 'PineconeGRPC' object has no attribute 'has_index'