## Splitting and Embedding Text Using LangChain

In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# Locate a .env file with find_dotenv() and load its variables into the process environment.
# override=True asks load_dotenv to let values from the file replace variables that are already set.
# As the cell's last expression, the call's return value is displayed (True in the recorded run).
load_dotenv(find_dotenv(), override=True)

True

In [21]:
# RecursiveCharacterTextSplitter is LangChain's general-purpose text splitter.
# When no `separators` argument is given it tries, in order, paragraph breaks
# ("\n\n"), line breaks ("\n"), spaces and finally single characters until each
# piece fits within chunk_size.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) - confirm.
with open('./files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,       # upper bound on chunk length, as measured by length_function
    chunk_overlap=20,     # amount of text shared between neighbouring chunks
    length_function=len,  # len() on a str counts characters, so sizes are in characters
)

In [23]:
# Split the speech into documents; each element exposes its text as `.page_content`.
chunks = text_splitter.create_documents([churchill_speech])
# Printing a document object shows its repr (page_content='...' in the recorded output).
print(chunks[2])
# Printing only the text; the trailing "==" marks where the chunk ends.
# Indexing position 10 requires the split to have produced at least 11 chunks.
print(chunks[10].page_content, "==")
print(f'now we have {len(chunks)} chunks')

page_content='From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the'
penetration were realized and when a new French Generalissimo, General Weygand, assumed ==
now we have 300 chunks


#### Embedding cost

In [26]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of `texts` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a `page_content` string, such as
            the documents produced by a text splitter.
        model: Model name handed to `tiktoken.encoding_for_model` to pick the
            tokenizer used for counting.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps
            the rate this notebook has always used; check current pricing
            before relying on the figure.

    Returns:
        None. The totals are printed to standard output.
    """
    # Imported inside the function so tiktoken is only needed when a cost
    # estimate is actually requested.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator expression avoids materialising a temporary list of counts.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

# Estimate the embedding cost for every chunk before any text is sent to the embeddings API.
print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.001928


In [45]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client built with its default settings.
# NOTE(review): presumably picks up the OpenAI API key from the environment loaded by load_dotenv - confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Embed a single chunk as a smoke test of the embeddings client.
vector = embeddings.embed_query(chunks[0].page_content)
# Show the vector's length and a short preview instead of dumping every
# component into the notebook output. The length should equal the `dimension`
# used when the Pinecone index is created.
len(vector), vector[:5]

#### Inserting the embeddings into a Pinecone Index

In [47]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Configure the Pinecone client from environment variables. os.getenv yields
# None for a variable that is not set, exactly like os.environ.get.
pinecone.init(api_key=os.getenv('PINECONE_API_KEY'), environment=os.getenv('PINECONE_ENV'))

In [56]:
# Capture the existing Pinecone indexes; this cell only lists them.
# The following cell iterates over this list and deletes each entry.
indexes = pinecone.list_indexes()

In [57]:
# Destructive: drops every index that was captured in `indexes`.
# NOTE(review): both messages are printed once per index, not once overall.
for existing_index in indexes:
    print("Deleting all indexes")
    pinecone.delete_index(existing_index)
    print("Done")

Deleting all indexes
Done


In [58]:
index_name = 'churchill-speech'

# Create the index only when it is missing, so re-running the cell is harmless.
index_exists = index_name in pinecone.list_indexes()
if not index_exists:
    # NOTE(review): dimension must equal the length of the vectors produced by
    # `embeddings`; 1536 is presumably the OpenAI embedding size - confirm.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print("Done")

Done


In [59]:
# Embed every chunk with `embeddings` and store the results in the Pinecone
# index named by `index_name`; the returned store is queried in later cells.
vector_store = Pinecone.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
)

### Asking Questions - Similarity Search

In [61]:
# NOTE(review): "shoukd" is a typo for "should"; the text is kept as-is so the
# recorded results below still correspond to the query that produced them.
query = "Where shoukd they fight?"
# Fetch the stored chunks closest to the query (four documents in the recorded run).
result = vector_store.similarity_search(query=query)
result

[Document(page_content='front, now on that, fighting'),
 Document(page_content='fighting attendant upon them followed, these unfortunate people would be far better out of the way,'),
 Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'),
 Document(page_content='of their far more numerous Air Force, was thrown into the battle or else concentrated upon Dunkirk')]

In [62]:
# Print each hit's text followed by a 50-underscore rule on its own line.
separator = '_' * 50
for doc in result:
    print(doc.page_content, separator, sep='\n')

front, now on that, fighting
__________________________________________________
fighting attendant upon them followed, these unfortunate people would be far better out of the way,
__________________________________________________
shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
__________________________________________________
of their far more numerous Air Force, was thrown into the battle or else concentrated upon Dunkirk
__________________________________________________


In [78]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
# Chat model that writes the final answer. An alternative configuration tried
# for this notebook was model='gpt-4' with temperature=0.7.
llm = ChatOpenAI(temperature=1, model='gpt-3.5-turbo')

# Retriever over the Pinecone store: similarity search with k set to 3.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# Question-answering chain that combines the retriever with the chat model.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [79]:
# NOTE(review): "shoukd" is a typo for "should"; the string is left untouched
# because it is the text actually sent to the chain.
query = "Where shoukd they fight?"
# Run the retrieval QA chain on the question and display the returned answer string.
answer = chain.run(query)
answer

'They should fight on the beaches, on the landing grounds, and in the fields.'

In [80]:
# Second question for the same chain; `query` and `answer` are overwritten.
query = "Who was the king of Belgium at that time"
answer = chain.run(query)
answer

'The king of Belgium at that time was King Leopold.'

In [81]:
# Third question for the same chain. The chain is built without memory in this
# notebook, so the question is presumably answered on its own - confirm.
query = "What about the French armies?"
answer = chain.run(query)
answer

'The French armies were involved in the conflict with the British armies. They were holding the area in question and had plans to advance across the Somme with a strong force.'