## Splitting and Embedding Text Using LangChain

In [4]:
# Load the OpenAI and Pinecone configuration from a .env file into the process environment.
# find_dotenv() locates the file; override=True is passed so that (per python-dotenv)
# values from the file take precedence over variables that are already set.
import os
from dotenv import load_dotenv, find_dotenv
# Kept as the cell's last expression, so the notebook displays the call's return value.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is spelled out because open()
# otherwise falls back to a locale-dependent default, which can decode the same
# file differently (or fail) on another machine.
with open('files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

# Splitter configuration: chunk_size and chunk_overlap are measured with
# length_function; with len on a str that means characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [3]:
# Split the speech into documents; each item exposes its text as `.page_content`
# (that attribute is what the later cells read).
chunks = text_splitter.create_documents([churchill_speech])

#### Embedding Cost

In [9]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for a set of documents.

    Args:
        texts: Iterable of objects that expose their text as ``page_content``.
        model: Model name used to select the tiktoken encoding.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default is the
            rate this notebook originally hard-coded; check current pricing
            before relying on the estimate.

    Returns:
        None. Two lines are printed to stdout: the total token count and the
        estimated cost formatted with six decimal places.
    """
    # Imported here so tiktoken is only required when the estimate is requested.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no intermediate list of per-document counts is built.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    cost_usd = total_tokens / 1000 * price_per_1k_tokens
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {cost_usd:.6f}')

# Report the token count and the estimated embedding cost for all chunks.
print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.001928


In [10]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client created with its default settings.
# NOTE(review): presumably picks up the OpenAI API key from the environment
# loaded in the first cell — confirm.
embedding = OpenAIEmbeddings()

In [15]:
# Embed the text of the first chunk as a sample.
# NOTE(review): the vector length presumably matches the dimension=1536 used when
# the Pinecone index is created — confirm with len(vector).
vector = embedding.embed_query(chunks[0].page_content)

#### Inserting the Embeddings into a Pinecone index

In [10]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Initialise the Pinecone client with credentials read from environment
# variables rather than hard-coded values.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENV'),
)


In [12]:
# Delete every index returned by pinecone.list_indexes().
# WARNING: destructive — this removes all listed indexes, not only the one
# used in this notebook.
indexes = list(pinecone.list_indexes())
if indexes:
    # Announce once (not once per index) and terminate the line when finished.
    print('Deleting all indexes...', end='')
    for i in indexes:
        pinecone.delete_index(i)
    print('Done')

In [14]:
# Name of the Pinecone index to create.
# NOTE(review): 'churchhill' looks like a misspelling of 'churchill'; left as is
# because it is a runtime identifier that other cells may rely on.
index_name = 'churchhill-speech'

# Create the index only when it is not already listed, so re-running the cell
# does not attempt a second creation.
if index_name not in pinecone.list_indexes():
    print(f'Creating index {index_name}...')
    pinecone.create_index(
        index_name,
        dimension=1536,
        metric='cosine',
    )
    print('Done!')