## Implementing RAG with Pinecone

Reference notebook: https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/pinecone/Using_Pinecone_for_embeddings_search.ipynb

### Loading the data

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import ast
from typing import List, Iterator

# Function to locate the root of the repository (the caller changes the wd)
def find_repo_root(repo_name):
    """Walk up from the current working directory to a directory named `repo_name`.

    Parameters
    ----------
    repo_name : str
        Base name of the repository's root directory.

    Returns
    -------
    str
        Path of the closest ancestor (or the working directory itself)
        whose base name equals `repo_name`.

    Raises
    ------
    FileNotFoundError
        If the filesystem root is reached without finding such a directory.
    """
    current_dir = os.getcwd()
    while True:
        parent_dir = os.path.dirname(current_dir)
        # At a filesystem root, dirname() returns its argument unchanged.
        # Testing for that (instead of comparing with '/') also terminates
        # on platforms whose roots are not '/', e.g. 'C:\\'.
        if parent_dir == current_dir:
            break
        if os.path.basename(current_dir) == repo_name:
            return current_dir
        current_dir = parent_dir
    raise FileNotFoundError(f"Repository root '{repo_name}' not found.")

# Run the notebook from the repository root, located by directory name
repo_name = 'orare-model'
repo_root = find_repo_root(repo_name=repo_name)
os.chdir(path=repo_root)

In [3]:
# Ignore unclosed SSL socket warnings - optional in case you get these errors
#import warnings

#warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)
#warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
# Load the pipe-delimited file holding the interpreted bible passages and their embeddings
bible_data = pd.read_csv(
    'bible/data/bible_by_theme_int_embedding.txt',
    sep='|',
    encoding='utf-8',
)

# The embedding column is read as text; parse each string literal into a Python list
interpretation_vectors = bible_data['interpretacion_vector'].apply(ast.literal_eval)
bible_data['interpretacion_vector'] = interpretation_vectors

# Summarise the columns and non-null counts of bible_data
bible_data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     861 non-null    object
 1   pasaje                 861 non-null    object
 2   texto                  861 non-null    object
 3   interpretación         861 non-null    object
 4   temas                  861 non-null    object
 5   área_vida              861 non-null    object
 6   texto_vector           861 non-null    object
 7   interpretacion_vector  861 non-null    object
dtypes: object(8)
memory usage: 53.9+ KB


### Setting the connection to OpenAI API and Pinecone API

In [3]:
import openai
import os
# Pinecone's client library for Python
import pinecone
from pinecone import Pinecone, ServerlessSpec
from pinecone import ServerlessSpec

# Pinecone client, keyed from the PINECONE_API_KEY environment variable
pc_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pc_api_key)

# OpenAI client, with the module-level key taken from OPENAI_API_KEY
openai.api_key = os.environ.get('OPENAI_API_KEY')
client = openai.OpenAI()

  from tqdm.autonotebook import tqdm


### Helper function

In [4]:
# Models a simple batch generator that makes chunks out of an input DataFrame
class BatchGenerator:
    """Split a DataFrame into consecutive, near-equal chunks of rows.

    The number of chunks is the row count divided by ``batch_size``, rounded
    up, so no chunk ever holds more than ``batch_size`` rows.

    Parameters
    ----------
    batch_size : int, default 10
        Maximum number of rows per chunk; must be at least 1.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """

    def __init__(self, batch_size: int = 10) -> None:
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        """Yield `df` in row order as chunks of at most ``batch_size`` rows.

        A frame that fits in one batch (including an empty frame) is yielded
        unchanged as the only chunk.
        """
        total_rows = df.shape[0]
        splits = self.splits_num(total_rows)
        if splits <= 1:
            yield df
            return
        # Split the row *positions* rather than the frame itself: this keeps
        # np.array_split's balanced chunk sizes without handing it a
        # DataFrame, which goes through a deprecated pandas code path.
        for positions in np.array_split(np.arange(total_rows), splits):
            first, last = int(positions[0]), int(positions[-1])
            yield df.iloc[first:last + 1]

    # Determines how many chunks DataFrame contains
    def splits_num(self, elements: int) -> int:
        """Return the number of chunks needed for `elements` rows."""
        # Ceiling division in pure integers: rounding to nearest could round
        # down and produce chunks larger than batch_size.
        return -(-elements // self.batch_size)

    __call__ = to_batches

df_batcher = BatchGenerator(300)

### Creating an Index

In [7]:
index_name = 'bible-verses'

# Start from a clean index: if one with this name already exists, delete it.
# The check is made against the index *names*; a single check is enough, so
# the redundant second membership test has been dropped.
if index_name in pc.list_indexes().names():
    print(f"Index '{index_name}' already exists. Deleting it.")
    pc.delete_index(index_name)

# Create new index with the required 'spec' argument
spec = ServerlessSpec(
    cloud='aws',
    region='us-east-1'
)

# Getting the embedding vector length from the first row (by position, so it
# does not depend on the frame's index labels)
vector_length = len(bible_data['interpretacion_vector'].iloc[0])
# Creates new index
pc.create_index(name=index_name, dimension=vector_length, spec=spec)

Index 'bible-verses' already exists. Deleting it.


In [5]:
index_name = 'bible-verses'
# Look the index host up by name instead of hard-coding the URL copied from
# the Pinecone web console, so the cell keeps working if the index is
# recreated or the notebook is run against another project.
index = pc.Index(host=pc.describe_index(index_name).host)

# Confirm our index was created
pc.list_indexes()

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'orare-app-rsup9mo.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'orare-app',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'bible-verses-rsup9mo.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'bible-verses',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

### Uploading the vectors to the content namespace

In [9]:
# Send the (id, interpretation embedding) pairs to the 'content' namespace,
# one upsert request per batch - this can take a few minutes
print("Uploading vectors to content namespace..")
for chunk in df_batcher(bible_data):
    id_vector_pairs = zip(chunk['id'], chunk['interpretacion_vector'])
    index.upsert(vectors=id_vector_pairs, namespace='content')

Uploading vectors to content namespace..


  return bound(*args, **kwds)


In [6]:
# Check index size for each namespace to confirm all of our docs have loaded.
# The per-namespace vector count should equal the number of rows in bible_data.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'content': {'vector_count': 861}},
 'total_vector_count': 861}

In [7]:
# Lookup tables from vector id to passage reference and to interpretation text,
# used to turn the ids returned by a search back into readable results
titles_mapped = {
    verse_id: passage
    for verse_id, passage in zip(bible_data['id'], bible_data['pasaje'])
}
content_mapped = {
    verse_id: interpretation
    for verse_id, interpretation in zip(bible_data['id'], bible_data['interpretación'])
}

### Vector Search Query

In [8]:
# Setting the embedding model of OpenAI
EMBEDDING_MODEL = "text-embedding-3-small"


def _embed_text(text, model=EMBEDDING_MODEL):
    """Return the embedding of `text` produced by the OpenAI `model`."""
    # Newlines are flattened to spaces before the text is embedded
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding


# Function to query the vector search
def query_article(query, namespace, top_k=3):
    """Embed `query`, search the index, then print and return the closest matches.

    Parameters
    ----------
    query : str
        Free text to search with; it is embedded with EMBEDDING_MODEL.
    namespace : str
        Namespace of the index to search in.
    top_k : int, default 3
        Number of matches requested from the index.

    Returns
    -------
    pandas.DataFrame
        One row per match with columns 'id', 'score', 'title' and 'content'.
        The frame has no rows when the index returns no match.

    Raises
    ------
    KeyError
        If a returned id is missing from `titles_mapped` or `content_mapped`.
    """
    embedded_query = _embed_text(query, model=EMBEDDING_MODEL)

    # Query the namespace passed as parameter with the embedded query
    query_result = index.query(vector=embedded_query,
                               top_k=top_k,
                               namespace=namespace)
    matches = query_result.matches

    # Print query results
    print(f'\nMost similar results to {query} in "{namespace}" namespace:\n')
    if not matches:
        print('no query result')

    ids = [res.id for res in matches]
    scores = [res.score for res in matches]
    titles = [titles_mapped[_id] for _id in ids]
    contents = [content_mapped[_id] for _id in ids]

    for title, content, score in zip(titles, contents, scores):
        print(f'{title}: {content} (score = {score})')

    print('\n')

    return pd.DataFrame({'id': ids,
                         'score': scores,
                         'title': titles,
                         'content': contents,
                         })

In [9]:
query = '''Dios. Quiero que en este proceso de búsqueda seas mi fuerza y me acompañes en todo momento. 
Yo se que tienes para mi un plan. Yo se que tu siempre das en la medida correcta y que poco a poco me has mostrado el camino para mi crecimiento.
Hace un tiempo me faltaba confianza en mi mismo. Se que con la experiencia laboral anterior me diste la oportunidad de abrir los ojos y saber quien soy.
Te doy las gracias porque se que cada paso que me muestras es un paso hacia desarrollar mi máximo potencial.
'''
query_output = query_article(query=query,namespace='content')


Most similar results to Dios. Quiero que en este proceso de búsqueda seas mi fuerza y me acompañes en todo momento. 
Yo se que tienes para mi un plan. Yo se que tu siempre das en la medida correcta y que poco a poco me has mostrado el camino para mi crecimiento.
Hace un tiempo me faltaba confianza en mi mismo. Se que con la experiencia laboral anterior me diste la oportunidad de abrir los ojos y saber quien soy.
Te doy las gracias porque se que cada paso que me muestras es un paso hacia desarrollar mi máximo potencial.
 in "content" namespace:

Proverbios 3:5, 6: Este pasaje nos enseña a confiar plenamente en Dios y no depender únicamente de nuestra propia sabiduría o entendimiento. Al reconocer a Dios en todas nuestras acciones y decisiones, Él guiará y corregirá nuestro camino. (score = 0.551148355)
Job 11:18, 19: Este pasaje habla de la confianza en Dios y la esperanza que trae seguridad y paz. Se menciona que, al confiar en Dios, uno puede descansar sin temor y que otros buscarán 