### Generate Embeddings

In [1]:
import pandas as pd

# Load the sentence dataset; the file's unnamed first column shows up as "Unnamed: 0"
df = pd.read_csv('sentences.csv')
df.head()

Unnamed: 0,sentence
0,A little girl is smiling and running outside
1,A man is drawing on a digital dry erase board
2,A black bird is sitting on a dead tree
3,An elderly man is sitting on a bench
4,A man and a woman are sitting comfortably on t...


In [2]:
import os
from openai import OpenAI

# Read the OpenAI key from the environment (None when the variable is unset)
api_key = os.environ.get("OPENAI_API_KEY")

# Shared client used by generate_embeddings
client = OpenAI(api_key=api_key)

# Embed a single piece of text with the OpenAI embeddings endpoint
def generate_embeddings(sentence: str, model: str = "text-embedding-3-small"):
    """Return the embedding vector for one sentence.

    Args:
        sentence: Text to embed; it is sent as the sole input of one request.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` of the first (only) item in the response data.
    """
    response = client.embeddings.create(input=sentence, model=model)
    return response.data[0].embedding
    

In [3]:
import numpy as np

import ast

# Reuse the cached embeddings when present so the API is not called again
if os.path.exists("embedded_sentences.csv"):
    df = pd.read_csv("embedded_sentences.csv")

    # Each cell holds the text form of a list of floats. Parse it with
    # ast.literal_eval, which only accepts Python literals, instead of eval,
    # which would execute whatever code the file happens to contain.
    df["embedding"] = df.embedding.apply(ast.literal_eval).apply(np.array)

# Otherwise, generate the embeddings
else:
    df["embedding"] = df.sentence.apply(generate_embeddings)
    # Cache while the values are still plain lists: their text form is what
    # the branch above parses back.
    df.to_csv("embedded_sentences.csv", index=False)
    # Match the cached branch so later cells always see numpy arrays
    df["embedding"] = df.embedding.apply(np.array)


In [4]:
# Create a new column to store the id of each sentence in the dataframe.
# Ids are 1-based, so each id is one more than the row's positional index.
df['id'] = range(1, len(df) + 1)

df.head()

Unnamed: 0,sentence,embedding,id
0,A little girl is smiling and running outside,"[0.0436425618827343, 0.01375775970518589, 0.00...",1
1,A man is drawing on a digital dry erase board,"[-0.008048108778893948, 0.030766354873776436, ...",2
2,A black bird is sitting on a dead tree,"[0.027433251962065697, 1.8205369087809231e-06,...",3
3,An elderly man is sitting on a bench,"[-0.004122881218791008, -0.056238383054733276,...",4
4,A man and a woman are sitting comfortably on t...,"[0.021146269515156746, -0.032280709594488144, ...",5


In [5]:
# Inspect the embedding stored in the first row
df.embedding[0]

array([ 0.04364256,  0.01375776,  0.00951197, ..., -0.02644038,
        0.0155397 ,  0.02192083])

In [6]:
# Length of one embedding vector; later cells pass it as the index dimension
embedding_dimension = len(df.embedding[0])
embedding_dimension

1536

### **FAISS**

In [7]:
# Stack the per-row embeddings into one (n_sentences, dimension) matrix.
# FAISS operates on float32 data, so cast here once instead of relying on
# an implicit conversion inside every add/train call.
embeddings = np.array(df.embedding.tolist(), dtype="float32")

# Embed the search query; it stays a plain list so it can also be sent to Pinecone
query = "I love to play football"
xq = generate_embeddings(query)

In [8]:
# The query embedding should have the same length as embedding_dimension
len(xq)

1536

#### **IndexFlatL2** - Exact Search for L2 (Euclidean distance)

In [9]:
import faiss

# Create an index for the embeddings
index_l2 = faiss.IndexFlatL2(embedding_dimension)

# Check if the index is trained (the cell output shows True before any data is added)
index_l2.is_trained

True

In [10]:
# Add the embeddings to the index, then display the number of vectors it now holds.
# NOTE(review): re-running this cell presumably adds the same vectors again — confirm before re-executing.
index_l2.add(embeddings)
index_l2.ntotal

1000

In [11]:
# Search the flat index for the 4 nearest neighbours of the query.
# FAISS works on float32 matrices of shape (n_queries, dimension), so the
# query list is wrapped into a single-row float32 array explicitly.
query_matrix = np.asarray([xq], dtype="float32")
distances, doc_indices = index_l2.search(query_matrix, k=4)
df.iloc[doc_indices[0]]

Unnamed: 0,sentence,embedding,id
247,A shirtless man is playing football on a field,"[0.010925278067588806, 0.01937379315495491, -0...",248
246,A man with a shirt is holding a football,"[0.00947174895554781, 0.013827533461153507, -0...",247
172,Two men are playing table football,"[-0.03752468526363373, 0.05197532847523689, -0...",173
979,The crowd is watching a football game,"[-0.01140331570059061, 0.015461748465895653, -...",980


#### **IndexIVFFlat** - Inverted file with exact post-verification

In [12]:
n_centroids = 20

# Create a quantizer for the embeddings using the L2 distance metric
quantizer = faiss.IndexFlatL2(embedding_dimension)

# Create an index for the embeddings using the IVF algorithm with the quantizer and the number of centroids.
# Unlike the flat index, it reports is_trained == False until train() is called.
index_ivf = faiss.IndexIVFFlat(quantizer, embedding_dimension, n_centroids,)
index_ivf.is_trained

False

In [13]:
# Train the index with the embeddings; is_trained flips to True afterwards
index_ivf.train(embeddings)
index_ivf.is_trained

True

In [14]:
# Add the embeddings to the trained IVF index and display how many vectors it holds
index_ivf.add(embeddings)
index_ivf.ntotal

1000

In [15]:
# Search the IVF index before nprobe is raised, i.e. with its initial setting.
# The query list is wrapped into the single-row float32 matrix FAISS expects.
query_matrix = np.asarray([xq], dtype="float32")
distances, doc_indices = index_ivf.search(query_matrix, k=4)

# An IVF search only scans the probed cells and pads missing results with
# label -1; drop those so iloc does not silently return the last row.
found = doc_indices[0][doc_indices[0] >= 0]
df.iloc[found]

Unnamed: 0,sentence,embedding,id
172,Two men are playing table football,"[-0.03752468526363373, 0.05197532847523689, -0...",173
979,The crowd is watching a football game,"[-0.01140331570059061, 0.015461748465895653, -...",980
551,A football player is running past an official ...,"[0.03300335630774498, 0.017781982198357582, -0...",552
469,A football player in a red and white uniform i...,"[-0.01379761379212141, -0.04831472039222717, 0...",470


In [16]:
# Probe 5 cells per query instead of the initial setting, trading speed for recall
index_ivf.nprobe = 5

# The query list is wrapped into the single-row float32 matrix FAISS expects
query_matrix = np.asarray([xq], dtype="float32")
distances, doc_indices = index_ivf.search(query_matrix, k=4)

# Missing results are padded with label -1; drop them so iloc does not
# silently return the last row.
found = doc_indices[0][doc_indices[0] >= 0]
df.iloc[found]

Unnamed: 0,sentence,embedding,id
247,A shirtless man is playing football on a field,"[0.010925278067588806, 0.01937379315495491, -0...",248
246,A man with a shirt is holding a football,"[0.00947174895554781, 0.013827533461153507, -0...",247
172,Two men are playing table football,"[-0.03752468526363373, 0.05197532847523689, -0...",173
979,The crowd is watching a football game,"[-0.01140331570059061, 0.015461748465895653, -...",980


#### **IndexIVFPQ - IVF + Product Quantizer (PQ)**

In [17]:
code_size = 8  # passed as M: the number of PQ sub-quantizers each vector is split across
bits_per_centroid = 4  # passed as nbits: bits used to encode each sub-vector

# Give this index its own coarse quantizer. `quantizer` already belongs to
# index_ivf, which trained it and filled it with centroids; sharing one
# quantizer object between two indexes couples their state.
quantizer_pq = faiss.IndexFlatL2(embedding_dimension)

# IVF index whose stored vectors are compressed with a product quantizer
index_ivf_pq = faiss.IndexIVFPQ(
    quantizer_pq, embedding_dimension, n_centroids, code_size, bits_per_centroid
)
index_ivf_pq.is_trained

False

In [18]:
# Train the IVF-PQ index on the embeddings, add them, and display the stored vector count
index_ivf_pq.train(embeddings) 
index_ivf_pq.add(embeddings)
index_ivf_pq.ntotal

1000

In [19]:
# Probe 5 cells per query, matching the setting used for the IVF-Flat search
index_ivf_pq.nprobe = 5

# The query list is wrapped into the single-row float32 matrix FAISS expects
query_matrix = np.asarray([xq], dtype="float32")
distances, doc_indices = index_ivf_pq.search(query_matrix, k=4)

# Missing results are padded with label -1; drop them so iloc does not
# silently return the last row.
found = doc_indices[0][doc_indices[0] >= 0]
df.iloc[found]

Unnamed: 0,sentence,embedding,id
352,An opponent is tackling a soccer player,"[0.0006955061689950526, -0.02708514593541622, ...",353
551,A football player is running past an official ...,"[0.03300335630774498, 0.017781982198357582, -0...",552
469,A football player in a red and white uniform i...,"[-0.01379761379212141, -0.04831472039222717, 0...",470
684,A man is punching a soccer ball,"[-0.01688985712826252, 0.029744451865553856, 0...",685


### **Pinecone**

In [20]:
# Enable IPython's autoreload extension in mode 2
%load_ext autoreload
%autoreload 2

In [21]:
from pinecone import Pinecone

# Read the Pinecone key from the environment (None when the variable is unset)
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# Client object used by later cells to list, create, and open indexes
database = Pinecone(api_key=PINECONE_API_KEY)

In [24]:
from pinecone import ServerlessSpec

# Serverless deployment target (AWS, us-east-1) passed to create_index
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [26]:
import time

INDEX_NAME = "random-sentences"

# Create an index with the specified name, dimension, metric, and serverless spec if it does not exist
if INDEX_NAME not in database.list_indexes().names():
    database.create_index(
        name=INDEX_NAME,
        dimension=embedding_dimension,
        metric="cosine",
        spec=serverless_spec,
    )

    # Index creation is asynchronous: poll until Pinecone reports the index as
    # ready instead of assuming a fixed one-second pause is long enough.
    while not database.describe_index(INDEX_NAME).status["ready"]:
        time.sleep(1)

# Handle used for all subsequent stats, upsert, and query calls
pinecone_index = database.Index(INDEX_NAME)

In [28]:
# Show the index statistics before any vectors are upserted
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [29]:
# Add the embeddings to the Pinecone index with the corresponding ids
def iterator(df, size):
    """Yield consecutive row slices of ``df`` holding at most ``size`` rows each.

    Slices are taken positionally and in order; the last one may be shorter
    when ``len(df)`` is not a multiple of ``size``.
    """
    row_count = len(df)
    offsets = range(0, row_count, size)
    yield from (df.iloc[lo : lo + size] for lo in offsets)


def vector(batch):
    """Convert a dataframe batch into ``(id, values, metadata)`` tuples for upsert.

    Args:
        batch: DataFrame with ``id``, ``embedding`` and ``sentence`` columns.

    Returns:
        One tuple per row: the id as a string, the embedding as a plain list
        of numbers, and a metadata dict carrying the sentence text.
    """

    def _as_list(values):
        # Embeddings loaded from the cache are numpy arrays; turn them into
        # plain lists so the payload looks the same whichever way the
        # dataframe was built.
        return values.tolist() if hasattr(values, "tolist") else list(values)

    return [
        (str(row["id"]), _as_list(row["embedding"]), {"sentence": row["sentence"]})
        for row in batch.to_dict("records")
    ]


In [30]:
# Populate the index only when it holds no vectors yet, 100 rows per request
index_is_empty = pinecone_index.describe_index_stats()["total_vector_count"] == 0
if index_is_empty:
    for rows in iterator(df, 100):
        pinecone_index.upsert(vector(rows))
        

In [31]:
# Show the index statistics again to confirm the upsert (1000 vectors in the recorded output)
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [32]:
# Retrieve the four stored sentences closest to the football query from Pinecone
response = pinecone_index.query(vector=xq, top_k=4, include_metadata=True)
matched_sentences = [hit["metadata"]["sentence"] for hit in response["matches"]]
for sentence in matched_sentences:
    print(sentence)

A shirtless man is playing football on a field
A man with a shirt is holding a football
Two men are playing table football
The crowd is watching a football game


In [33]:
# Second example: embed a different query and print its four closest sentences
query_2 = "What likes to eat a lot"
xq_2 = generate_embeddings(query_2)
response_2 = pinecone_index.query(vector=xq_2, top_k=4, include_metadata=True)
matched_sentences_2 = [hit["metadata"]["sentence"] for hit in response_2["matches"]]
for sentence in matched_sentences_2:
    print(sentence)

The animal with big eyes is voraciously eating
A lemur is eating quickly
A man is eating
A man is eating
