# All imports and inits

In [None]:
import pandas as pd
import os

from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

load_dotenv()

DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")

# Excel Parser


In [4]:
def load_excel_document():
    """
    Load data from potential_clients.xlsx file
    Returns a list of documents with content and metadata
    """

    documents = []

    try:
        # Read the Excel file
        df = pd.read_excel(DATA_PATH)
        print(f"Successfully loaded {DATA_PATH}")

        # Convert each row to a document
        for idx, row in df.iterrows():
            # Convert row to string representation
            content = " ".join([f"{col}: {str(row[col])}" for col in df.columns])

            # Create document with content and metadata
            documents.append(
                {
                    "content": content,
                    "metadata": {
                        "source": DATA_PATH,
                        "file_type": "excel",
                        "row_id": idx,
                    },
                }
            )

        print(f"Processed {len(documents)} rows from Excel file")
        return documents

    except Exception as e:
        print(f"Error loading Excel file: {e}")
        return []


# Load documents from Excel
documents = load_excel_document()

# Display first document as example
if documents:
    print("\nExample document:")
    print(documents[0])

Successfully loaded ./potential-talents.xlsx
Processed 104 rows from Excel file

Example document:
{'content': 'id: 1 job_title: 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional location: Houston, Texas connection: 85 fit: nan', 'metadata': {'source': './potential-talents.xlsx', 'file_type': 'excel', 'row_id': 0}}


## Text Splitting \ Chunking for llama text embed v2 via pinecone

In [None]:
def count_tokens(text: str) -> int:
    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")
    # Encode the text into tokens
    tokens = tokenizer.encode(text)
    return len(tokens)


def split_documents(documents):
    # Each chunk is ~800-1000 tokens to leave room for metadata tokens if needed

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=32000,  # fit comfortably within 2048 token limit
        chunk_overlap=16000,  # helps retain context between chunks
        length_function=len,  # use token length if tokenizer available
        is_separator_regex=True,  # respect newline and semantic breaks
    )

    chunks = []

    for doc in documents:
        split_texts = text_splitter.split_text(doc["content"])

        for i, chunk in enumerate(split_texts):
            chunks.append(
                {"content": chunk, "metadata": {**doc["metadata"], "chunk_id": i}}
            )
    return chunks


# Example usage:
chunks = split_documents(documents)

# print(len(chunks))
print(chunks[0])
print(
    "Tokens: ", count_tokens(chunks[0]["content"])
)  # Check if splitting looks reasonable

{'content': 'id: 1 job_title: 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional location: Houston, Texas connection: 85 fit: nan', 'metadata': {'source': './potential-talents.xlsx', 'file_type': 'excel', 'row_id': 0, 'chunk_id': 0}}
Tokens:  40


# Init Pinecone

In [None]:
pc = Pinecone(api_key=PINECONE_API)
# print(PINECONE_API)

# Connect to the index
index = pc.Index("potential-talents")  # -- COMPLETE SURGICAL TECH BOOTCAMP

### Get Emebddings and Upsertion functions

In [None]:
sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text="None"):
    # Generate embedding using the pre-loaded model
    embedding = sbert_model.encode(text)

    # Return the embedding as a list/array
    return embedding.tolist()

# print(len(get_embedding("Surgical Conscience")))

def upsert_chunks_to_pinecone(index, chunks):
    count = 0
    for chunk in chunks:
        # Ensure the chunk has the correct structure
        content = chunk.get("content")
        metadata = chunk.get("metadata", {})

        # Get the embedding for the chunk
        # embedding = get_embedding(content).data[0]['values']
        embedding = get_embedding(content)

        # Add the text as part of the metadata
        metadata["text"] = content  # Store text in metadata
        # metadata["token_count"] = count_tokens(content)

        # Create a unique vector ID for each chunk (e.g., based on count or some unique identifier)
        vector_id = f"vec_{count}"

        # Upsert the embedding along with its metadata
        index.upsert(vectors=[(vector_id, embedding, metadata)])

        count += 1
        print(f"Embedding {count} upserted to Pinecone with metadata")

    print(f"All {count} embeddings have been upserted to Pinecone")

upsert_chunks_to_pinecone(index, chunks)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Embedding 1 upserted to Pinecone with metadata
Embedding 2 upserted to Pinecone with metadata
Embedding 3 upserted to Pinecone with metadata
Embedding 4 upserted to Pinecone with metadata
Embedding 5 upserted to Pinecone with metadata
Embedding 6 upserted to Pinecone with metadata
Embedding 7 upserted to Pinecone with metadata
Embedding 8 upserted to Pinecone with metadata
Embedding 9 upserted to Pinecone with metadata
Embedding 10 upserted to Pinecone with metadata
Embedding 11 upserted to Pinecone with metadata
Embedding 12 upserted to Pinecone with metadata
Embedding 13 upserted to Pinecone with metadata
Embedding 14 upserted to Pinecone with metadata
Embedding 15 upserted to Pinecone with metadata
Embedding 16 upserted to Pinecone with metadata
Embedding 17 upserted to Pinecone with metadata
Embedding 18 upserted to Pinecone with metadata
Embedding 19 upserted to Pinecone with metadata
Embedding 20 upserted to Pinecone with metadata
Embedding 21 upserted to Pinecone with metadata
E