# 1. Prerequisites
* A Pinecone account
* A Cohere API key
* Python env with Jupyter notebook
* Basic knowledge of Python and env vars

# 2. Install Dependencies


In [1]:
# Install dependencies
# (%pip installs into the environment of the running kernel; -q keeps pip's output quiet)
%pip install -q pinecone cohere datasets python-dotenv 

Note: you may need to restart the kernel to use updated packages.


# 3. Initialize Pinecone & Setup API Key

In [2]:
# Import the libraries used to initialize Pinecone and set up the API keys
import pinecone
import os
from dotenv import load_dotenv
import cohere

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load API keys from the .env file into the process environment
load_dotenv()
pinecone_api_key = os.getenv("PINECONE_API_KEY")
cohere_api_key = os.getenv("COHERE_API_KEY")

# os.getenv returns None for an unset variable; stop here with a clear message
# instead of letting a later API call fail with a less obvious error.
missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", pinecone_api_key),
        ("COHERE_API_KEY", cohere_api_key),
    )
    if not value
]
if missing_keys:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(missing_keys)}. "
        "Define them in your .env file or shell environment."
    )

# 4. Create a Pinecone Index

In [4]:
# Connect to Pinecone with the API key loaded from the environment
from pinecone import Pinecone, ServerlessSpec

# Name of the index used throughout this notebook
index_name = "cohere-pinecone-tree"

# Client handle used for all index management calls
pc = Pinecone(api_key=pinecone_api_key)

In [5]:
# Create the Pinecone index for the dataset, unless it already exists
index_exists = pc.has_index(index_name)
if not index_exists:
    # Serverless deployment target for the new index
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1024,  # should equal the length of the embedding vectors stored in the index
        metric="cosine",
        spec=serverless_spec,
    )

# Show the indexes available to this client
pc.list_indexes()

[
    {
        "name": "cohere-pinecone-tree",
        "metric": "cosine",
        "host": "cohere-pinecone-tree-z2k0we0.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1024,
        "deletion_protection": "disabled",
        "tags": null
    }
]

# 5. Load & Prepare Dataset

In [6]:
# Load and prepare the dataset - a sample dataset from the Hugging Face Hub
from datasets import load_dataset

dataset = load_dataset("trec", split="train[:1000]")

# Collect the raw text of the first 100 rows; these are the documents to embed
first_rows = dataset.select(range(100))
texts = [row["text"] for row in first_rows]


# 6. Generate Embeddings using Cohere

In [7]:
# Generate embeddings using Cohere
co = cohere.Client(api_key=cohere_api_key)

# Embed every text as a document to be searched
embed_response = co.embed(
    texts=texts,
    model="embed-english-v3.0",
    input_type="search_document",
    truncate="END",
)
embeddings = embed_response.embeddings

# Report how many vectors came back and how long each one is
vector_count = len(embeddings)
vector_length = len(embeddings[0])
print("Embeddings Shape: ", vector_count, vector_length)  # 100, 1024 - should match the index (pinecone) dimension


Embeddings Shape:  100 1024


# 7. Store Embeddings in Pinecone

In [None]:
# Store embeddings in Pinecone
index = pc.Index(index_name)  # handle to the index named by index_name

# Each record is keyed by its position in `texts` and carries the source text as
# metadata, so a query with include_metadata=True returns the text with the match.
vectors = [
    {"id": str(i), "values": embedding, "metadata": {"text": text}}
    for i, (text, embedding) in enumerate(zip(texts, embeddings))
]
index.upsert(vectors)

{'upserted_count': 100}

# 8. Perform Semantic Search

In [15]:
# Build the query and embed it
query = "What caused the 1929 Great Depression?"

# The query is embedded with input_type="search_query"; the stored texts were
# embedded with input_type="search_document".
query_response = co.embed(
    texts=[query],
    model="embed-english-v3.0",
    input_type="search_query",
    truncate="END",
)
query_embedding = query_response.embeddings[0]


In [17]:
# Search Pinecone for the stored vectors most similar to the query.
# `vector` takes the embedding itself (a flat list of floats); wrapping it in
# another list would send a nested list instead of a single query vector.
response = index.query(vector=query_embedding, top_k=5, include_metadata=True)


In [19]:
# Display each match with its original text and similarity score
separator = "--------------------------------------------------"
for hit in response["matches"]:
    matched_text = texts[int(hit.id)]  # ids are the positions in `texts`
    print(f"Matched text: {matched_text} - Similarity: {hit.score:.2f}")
    print(separator)
    

Matched text: What are bear and bull markets ? - Similarity: 0.23
--------------------------------------------------
Matched text: How did serfdom develop in and then leave Russia ? - Similarity: 0.23
--------------------------------------------------
Matched text: What is considered the costliest disaster the insurance industry has ever faced ? - Similarity: 0.23
--------------------------------------------------
Matched text: Who killed Gandhi ? - Similarity: 0.22
--------------------------------------------------
Matched text: What was cash-conscious Colonel Edwin L. Drake the first to drill ? - Similarity: 0.21
--------------------------------------------------


# 9. Clean-up
Delete the Pinecone index once you are finished with it.

In [23]:
# Delete the index, but only if it still exists, so this cell can be re-run safely
if pc.has_index(index_name):
    pc.delete_index(index_name)