# Inserting Data into Pinecone db

https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/pinecone/Using_Pinecone_for_embeddings_search.ipynb

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import ast
from typing import List, Iterator
import json

# Setting the working directory
def find_repo_root(repo_name='orare-model'):
    current_dir = os.getcwd()
    while current_dir != '/':
        if os.path.basename(current_dir) == repo_name:
            return current_dir
        current_dir = os.path.dirname(current_dir)
    raise FileNotFoundError(f"Repository root '{repo_name}' not found.")

# Setting directory
repo_root = find_repo_root()
os.chdir(repo_root)

In [3]:
import pinecone
from pinecone import Pinecone

# Pinecone API
pc_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pc_api_key)

  from tqdm.autonotebook import tqdm


In [4]:
# Models a simple batch generator that make chunks out of an input DataFrame
class BatchGenerator:
   
    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size
    
    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Determines how many chunks DataFrame contains
    def splits_num(self, elements: int) -> int:
        return int(np.ceil(elements / self.batch_size))
    
    __call__ = to_batches

df_batcher = BatchGenerator(200)

In [23]:
# Index for Bible By Theme Interpreted v1 and v2
index_name = 'bible-verses-openai-small'

# Setting the host of the index from the Pinecone web admin portal
index = pc.Index(index_name=index_name, host='https://bible-verses-openai-small-rsup9mo.svc.aped-4627-b74a.pinecone.io')

# Check index size of the namespace to confirm all of our docs have been loaded
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Bible By Theme Interpreted: v1

In [24]:
# Reading the bible interpreted with embeddings
bible_data = pd.read_csv('bible/data/bible_by_theme_int_emb_v1.txt', sep='|', encoding='utf-8')

# Tranforming vector string into list
bible_data['interpretacion_vector'] = bible_data['interpretacion_vector'].apply(ast.literal_eval)

# Showing the bible_data object
bible_data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     861 non-null    object
 1   pasaje                 861 non-null    object
 2   texto                  861 non-null    object
 3   interpretación         861 non-null    object
 4   temas                  861 non-null    object
 5   área_vida              861 non-null    object
 6   texto_vector           861 non-null    object
 7   interpretacion_vector  861 non-null    object
dtypes: object(8)
memory usage: 53.9+ KB


In [26]:
# Formatting the Bible dataframe
# Transforming index as numeric value
bible_data['id'] = bible_data['id'].str.replace('vec', '').astype(int)

# Craeting column "metadata"
bible_data['metadata'] = bible_data.apply(lambda row: {
    'pasaje': row['pasaje'],
    'texto': row['texto'],
    'interpretacion': row['interpretación'],
    'temas': row['temas'],
    'area_vida': row['área_vida']
}, axis=1)

# Dropping the columns inserted in metadata
bible_data = bible_data.drop(columns=['pasaje','texto','interpretación','temas','área_vida'])

# Reordering columns
columns = ['id','interpretacion_vector','metadata']
bible_data = bible_data[columns]
bible_data.head()

Unnamed: 0,id,interpretacion_vector,metadata
0,1,"[0.05086807906627655, 0.006340509746223688, 0....","{'pasaje': '1 Corintios 10:12', 'texto': 'Así ..."
1,2,"[0.04511460289359093, -0.013647807762026787, -...","{'pasaje': '1 Corintios 10:13', 'texto': 'No o..."
2,3,"[0.0449271984398365, 0.04290250688791275, -0.0...","{'pasaje': '1 Corintios 10:31', 'texto': 'Si p..."
3,4,"[0.06745994836091995, 0.06209307909011841, -0....","{'pasaje': '1 Corintios 11:9', 'texto': 'Porqu..."
4,5,"[0.030204113572835922, -0.004405524581670761, ...","{'pasaje': '1 Corintios 13:13', 'texto': 'Y ah..."


In [27]:
# Upsert content vectors in content namespace - this can take a few minutes
print("Uploading vectors to content namespace..")
for batch_df in df_batcher(bible_data):
    vectors = list(zip(batch_df.id.astype(str), batch_df.interpretacion_vector, batch_df.metadata))
    index.upsert(vectors=vectors, namespace='v1')

Uploading vectors to content namespace..


  return bound(*args, **kwds)


In [28]:
# Check index size of the namespace to confirm all of our docs have been loaded
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'v1': {'vector_count': 861}},
 'total_vector_count': 861}

## Bible By Theme Interpreted: v2

In [29]:
# Reading the bible interpreted with embeddings
bible_data = pd.read_csv('bible/data/bible_by_theme_int_emb_v2.txt', sep='|', encoding='utf-8')

# Tranforming vector string into list
bible_data['interpretacion_vector'] = bible_data['interpretacion_vector'].apply(ast.literal_eval)

# Showing the bible_data object
bible_data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     861 non-null    object
 1   pasaje                 861 non-null    object
 2   texto                  861 non-null    object
 3   interpretación         861 non-null    object
 4   temas                  861 non-null    object
 5   interpretacion_vector  861 non-null    object
dtypes: object(6)
memory usage: 40.5+ KB


In [31]:
# Formatting the Bible dataframe
# Transforming index as numeric value
bible_data['id'] = bible_data['id'].str.replace('vec', '').astype(int)

# Craeting column "metadata"
bible_data['metadata'] = bible_data.apply(lambda row: {
    'pasaje': row['pasaje'],
    'texto': row['texto'],
    'interpretacion': row['interpretación'],
    'temas': row['temas']
}, axis=1)

# Dropping the columns inserted in metadata
bible_data = bible_data.drop(columns=['pasaje','texto','interpretación','temas'])

# Reordering columns
columns = ['id','interpretacion_vector','metadata']
bible_data = bible_data[columns]
bible_data.head()

Unnamed: 0,id,interpretacion_vector,metadata
0,1,"[0.04122506454586983, -0.013365025632083416, 0...","{'pasaje': '1 Corintios 10:12', 'texto': 'Así ..."
1,2,"[0.04572800546884537, -0.02974953129887581, 0....","{'pasaje': '1 Corintios 10:13', 'texto': 'No o..."
2,3,"[0.04388764873147011, 0.016837339848279953, 0....","{'pasaje': '1 Corintios 10:31', 'texto': 'Si p..."
3,4,"[0.07951465994119644, 0.01010322105139494, 0.0...","{'pasaje': '1 Corintios 11:9', 'texto': 'Porqu..."
4,5,"[0.02176767587661743, -0.029667839407920837, 0...","{'pasaje': '1 Corintios 13:13', 'texto': 'Y ah..."


In [32]:
# Upsert content vectors in content namespace - this can take a few minutes
print("Uploading vectors to content namespace..")
for batch_df in df_batcher(bible_data):
    vectors = list(zip(batch_df.id.astype(str), batch_df.interpretacion_vector, batch_df.metadata))
    index.upsert(vectors=vectors, namespace='v2')

Uploading vectors to content namespace..


  return bound(*args, **kwds)


In [33]:
# Check index size of the namespace to confirm all of our docs have been loaded
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'v1': {'vector_count': 861}, 'v2': {'vector_count': 861}},
 'total_vector_count': 1722}

# Catholic Bible Interpreted: v3

In [5]:
# Index for Bible By Theme Interpreted v1 and v2
index_name = 'bible-verses-openai-large'

# Setting the host of the index from the Pinecone web admin portal
index = pc.Index(index_name=index_name, host='https://bible-verses-openai-large-rsup9mo.svc.aped-4627-b74a.pinecone.io')

# Check index size of the namespace to confirm all of our docs have been loaded
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [6]:
# Reading the bible interpreted with embeddings
bible_data = pd.read_csv('bible/data/cat_bible_int_openai_emb.txt', sep='|', encoding='utf-8')

# Tranforming vector string into list
bible_data['interpretacion_vector'] = bible_data['interpretacion_vector'].apply(ast.literal_eval)

# Showing the bible_data object
bible_data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     858 non-null    int64 
 1   pasaje                 858 non-null    object
 2   texto                  858 non-null    object
 3   interpretacion         858 non-null    object
 4   area_vida              858 non-null    object
 5   interpretacion_vector  858 non-null    object
dtypes: int64(1), object(5)
memory usage: 40.3+ KB


In [9]:
# Formatting the Bible dataframe

# Craeting column "metadata"
bible_data['metadata'] = bible_data.apply(lambda row: {
    'pasaje': row['pasaje'],
    'texto': row['texto'],
    'interpretacion': row['interpretacion'],
    'temas': row['area_vida']
}, axis=1)

# Dropping the columns inserted in metadata
bible_data = bible_data.drop(columns=['pasaje','texto','interpretacion','area_vida'])

# Reordering columns
columns = ['id','interpretacion_vector','metadata']
bible_data = bible_data[columns]
bible_data.head()

Unnamed: 0,id,interpretacion_vector,metadata
0,1,"[-0.016444619745016098, -0.0008524770382791758...","{'pasaje': '1 Corintios 10:12', 'texto': 'Por ..."
1,2,"[0.007297295145690441, -0.030060498043894768, ...","{'pasaje': '1 Corintios 10:13', 'texto': 'Hast..."
2,3,"[-0.023252546787261963, -0.01617797650396824, ...","{'pasaje': '1 Corintios 10:31', 'texto': 'En r..."
3,4,"[-0.040891166776418686, -0.0006036538979969919...","{'pasaje': '1 Corintios 11:9', 'texto': 'ni fu..."
4,5,"[-0.030409976840019226, 0.008826683275401592, ...","{'pasaje': '1 Corintios 13:13', 'texto': 'En u..."


In [10]:
# Upsert content vectors in content namespace - this can take a few minutes
print("Uploading vectors to content namespace..")
for batch_df in df_batcher(bible_data):
    vectors = list(zip(batch_df.id.astype(str), batch_df.interpretacion_vector, batch_df.metadata))
    index.upsert(vectors=vectors, namespace='v3')

Uploading vectors to content namespace..


  return bound(*args, **kwds)


In [15]:
# Check index size of the namespace to confirm all of our docs have been loaded
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'v3': {'vector_count': 858}},
 'total_vector_count': 858}