## Install dependencies for this data pipeline

Please review the dependencies before installing. The key libraries used for this data pipeline include:

* OpenAI
* Pinecone GRPC client
* Langchain
* Pickle

In [1]:
pip install -r './requirements.txt'


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_API_ENV = os.environ['PINECONE_API_ENV']
PINECONE_INDEX_NAME = os.environ['PINECONE_INDEX_NAME']
PINECONE_NAMESPACE = os.environ['PINECONE_NAMESPACE']
PDF_SOURCE_FILE = os.environ['PDF_SOURCE_FILE']
SOURCE_TEXT_FILE= f"./data/{os.environ['PINECONE_NAMESPACE']}-source_data.txt"
EMBEDDING_ARCHIVE_FILE = f"./data/{os.environ['PINECONE_NAMESPACE']}-embeddings.pickle"

## Set Pinecone and OpenAI API keys as environment variables

You must have a '.env' file that contains the following variables:

```
OPENAI_API_KEY = 'YOUR OPENAI KEY'
PINECONE_API_KEY = 'YOUR PINECONE KEY'
PINECONE_API_ENV = 'YOUR PINECONE API ENV'
PINECONE_INDEX_NAME = 'YOUR PINECONE INDEX NAME'
```

In [3]:
import pinecone

index_name = PINECONE_INDEX_NAME

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_API_ENV  # may be different, check at app.pinecone.io
)

if index_name not in pinecone.list_indexes():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        dimension=1536,
        metric='cosine',
        metadata_config={'indexed': ['unused']},
        pod_type='p1.x1'
    )
# connect to index
index = pinecone.Index(index_name)
# view index stats
index.describe_index_stats()

  from tqdm.autonotebook import tqdm


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'adp': {'vector_count': 301}},
 'total_vector_count': 301}

## Create pinecone index for the vector embeddings

If the index doesn't exist, then create it. Be sure to set:

* dimension = '1536' - OpenAI embeddings use 1536 dimension vectors
* metric = 'cosine' - Other metric options are available but this value usually is best
* pod_type = 'p1.x1' - This is a low latency pod type

## Define text chunking, text transformation and embedding creation

Define methods that use chunking with overlap to transform records into Pinecone vector records.

In [4]:
import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type
from typing import List
from uuid import uuid4
import textwrap
import pandas as pd
from pinecone_text.sparse import BM25Encoder
openai.api_key = OPENAI_API_KEY
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'

# let's make sure to not retry on an invalid request, because that is what we want to demonstrate
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(openai.InvalidRequestError))
def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):
    return openai.Embedding.create(input=text_or_tokens, model=model)

def chunk_text(text: str, max_chunk_size: int, overlap_size: int) -> List[str]:
    """Helper function to chunk a text into overlapping chunks of specified size."""
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_chunk_size, len(text))
        chunks.append(text[start:end])
        start += max_chunk_size - overlap_size
    return chunks

def transform_record(record: dict) -> List[dict]:
    """Transform a single record as described in the prompt."""
    max_chunk_size = 500
    overlap_size = 100
    chunks = chunk_text(record, max_chunk_size, overlap_size)
    transformed_records = []
    recordId = str(uuid4())
    for i, chunk in enumerate(chunks):
        chunk_id = f"{recordId}-{i+1}"
        
        transformed_records.append({
            'chunk_id': chunk_id,
            'chunk_parent_id': recordId,
            'chunk_text': chunk,
            'vector': get_embedding(chunk).get('data')[0]['embedding']
        })
        
    return transformed_records

## Create source data vector embeddings and save locally

Open the text file that represents our source data, read the data into chunks, get OpenAI embeddings for each chunk and create records that can be saved offline.

This operation takes awhile because each

In [5]:
import pickle
from tqdm.auto import tqdm
from pdfminer.high_level import extract_text

pdf_source_text = extract_text(PDF_SOURCE_FILE)
source_data = open(SOURCE_TEXT_FILE, 'w')
source_data.write(pdf_source_text)
print("PDF source extracted to text source file")

rf = open(SOURCE_TEXT_FILE, 'r', encoding='ISO-8859-1').read()

chunked_data = []
chunk_array = transform_record(rf)

for i in tqdm(range(0, len(chunk_array))):
    chunked_data.append(chunk_array[i])

wf = open(EMBEDDING_ARCHIVE_FILE, 'wb')

pickle.dump(chunked_data, wf)
print("Text source embeddings generated and saved to embedding archive")
wf.close()
#rf.close()


PDF source extracted to text source file


100%|██████████| 1929/1929 [00:00<00:00, 1433270.58it/s]

Text source embeddings generated and saved to embedding archive





## Upsert source data vector embeddings into Pinecone

In [6]:
from tqdm.auto import tqdm  # this is our progress bar
from pinecone_text.sparse import BM25Encoder
import pickle

def prepare_entries_for_pinecone(entries):
    """
    Prepares an array of entries for upsert to Pinecone.
    Each entry should have a 'vector' field containing a list of floats.
    """
    vectors = []
    for entry in entries:
        vector = entry['vector']
        id = entry.get('chunk_id', '')
        metadata = entry.get('metadata', {'chunk_id': entry.get('chunk_id', ''),'parent_id': entry.get('chunk_parent_id', ''), 'chunk_text': entry.get('chunk_text', '')})
        values = [v for v in vector]
        vectors.append({'id': id, 'metadata': metadata, 'values': values})
    return {'vectors': vectors, 'namespace': ''}

vector_data = pickle.load(open(EMBEDDING_ARCHIVE_FILE, 'rb'))
vectors = prepare_entries_for_pinecone(vector_data)

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(vectors['vectors']), batch_size)):
    ids_batch = [id['id'] for id in vectors['vectors'][i:i+batch_size]]
    embeds = [id['values'] for id in vectors['vectors'][i:i+batch_size]]
    meta = [id['metadata'] for id in vectors['vectors'][i:i+batch_size]]
    upserts = []
    # loop through the data and create dictionaries for uploading documents to pinecone index
    # for _id, sparse, dense, meta in zip(ids_batch, sparse_values, embeds, meta):
    for _id,dense, meta in zip(ids_batch, embeds, meta):
        upserts.append({
            'id': _id,
            'values': dense,
            'metadata': meta
        })
    # upload the documents to the new hybrid index
    index.upsert(upserts, namespace=PINECONE_NAMESPACE)

100%|██████████| 61/61 [00:30<00:00,  1.98it/s]


In [7]:
PINECONE_NAMESPACE
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'adp': {'vector_count': 301}, 'citi': {'vector_count': 1929}},
 'total_vector_count': 2230}

## Delete pinecone index

In [None]:
index.delete(deleteAll='true', namespace=PINECONE_NAMESPACE)