In [9]:
import os
import pandas as pd
from dotenv import load_dotenv

# Import the Pinecone library
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time

# Import LangChain
from langchain_openai import OpenAIEmbeddings

# Load environment variables first. Both API keys are read from the
# environment, and the OpenAI embeddings client looks up OPENAI_API_KEY when
# it is constructed, so the .env file has to be loaded (and the keys
# validated) before any client object is created.
load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY is not set in the .env file")

openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set in the .env file")

# Pinecone client used below to create and target the vector index
pc = Pinecone(api_key=pinecone_api_key)

# Embedding model used to vectorize the documents
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large"
)

# Step 1: Load the preprocessed file
# Update the path to match where you saved `Ideas_Cleaned.csv`
file_path = os.path.expanduser("~/Desktop/Preprocessing/Ideas_Cleaned.csv")
data = pd.read_csv(file_path)

# Snapshot the rows as plain dicts *before* the embedding column is attached,
# so the metadata records do not drag the vectors along with them.
records = data.to_dict(orient='records')

# Convert the text into numerical vectors that Pinecone can index
# NOTE(review): a missing LemmatizedText cell is read as NaN (a float), not a
# string — confirm the preprocessing step guarantees this column is populated.
embeddings_list = embeddings.embed_documents(data['LemmatizedText'].tolist())

# Keep each vector next to its source row. The texts were passed in DataFrame
# row order; pandas raises if the number of vectors differs from the row count.
data['Embeddings'] = embeddings_list


def _metadata_value(value):
    """Return *value* in a form that is safe to store as vector metadata.

    pandas reads empty CSV cells as NaN; null-like values are not valid
    metadata, so they are replaced with an empty string (the same default the
    payload already uses for absent columns). Other values pass through.
    """
    return '' if pd.isna(value) else value


# Prepare upsertion data with metadata
upsert_data = [
    {
        "id": str(row_number),  # Unique identifier (positional row number)
        "values": embedding,    # Embedding vector
        "metadata": {           # Metadata copied from the original document
            # Title/Description are required columns (KeyError if absent);
            # the remaining columns are optional and default to ''.
            "title": _metadata_value(row['Title']),
            "description": _metadata_value(row['Description']),
            "category": _metadata_value(row.get('Category', '')),
            "status": _metadata_value(row.get('Status', '')),
            "submitter": _metadata_value(row.get('Submitter', '')),
            "date_submitted": _metadata_value(row.get('Submitted', '')),
        }
    }
    for row_number, (embedding, row) in enumerate(zip(embeddings_list, records))
]


# Create a serverless index (only if it does not exist yet)
index_name = "bi-internal-ideas-index"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=3072,  # must equal the length of the vectors being upserted
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Wait for the index to be ready, but not forever: if the index never reports
# ready (e.g. its initialization failed), stop with a clear error instead of
# polling indefinitely.
index_ready_timeout = 300  # seconds
index_ready_deadline = time.monotonic() + index_ready_timeout
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= index_ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after "
            f"{index_ready_timeout} seconds"
        )
    time.sleep(1)

# Target the index where you'll store the vector embeddings
index = pc.Index(index_name)

# Send the vectors in small slices so that a single request stays under the
# size limit.
batch_size = 10  # Conservative starting point; adjust as needed
total_vectors = len(upsert_data)
start = 0
while start < total_vectors:
    stop = start + batch_size
    # Slicing past the end is safe: the final slice is simply shorter.
    index.upsert(vectors=upsert_data[start:stop], namespace="bi-internal-ideas")
    start = stop


print("Data successfully upserted to Pinecone!")


Data successfully upserted to Pinecone!
