### Splitting & Embedding text using LangChain

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
# Find the .env file first, then load it. override=True is passed so values
# from the file take precedence over variables that are already set.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

True

In [4]:
# Confirm the Pinecone key was loaded WITHOUT echoing the secret itself:
# cell outputs are saved with the notebook and would leak the key.
# (Any key that has ever been displayed in a shared notebook should be rotated.)
pinecone_key_loaded = bool(os.environ.get('PINECONE_API_KEY'))
pinecone_key_loaded

True

### Document Loaders in Langchain
- Document loaders let us load many different types of documents into LangChain so that their text can be split, embedded and passed to a model.
- [Here are examples](https://python.langchain.com/docs/integrations/document_loaders/)

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Read the whole speech into memory. The encoding is given explicitly so the
# decoded text does not depend on the platform's default codec (the text
# contains non-ASCII characters such as non-breaking spaces).
# NOTE(review): assumes speech.txt is UTF-8 encoded — confirm.
with open('./speech.txt', encoding='utf-8') as f:
    gandhi_speech = f.read()

# Splitter configured for chunks of up to 100 characters (length measured
# with len) and an overlap of 20 characters between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [11]:
# create_documents is given a list of texts, so the single speech string is
# wrapped in a one-element list. Each resulting chunk exposes .page_content.
speech_texts = [gandhi_speech]
chunks = text_splitter.create_documents(speech_texts)
print(len(chunks))

170


### Embedding Cost

In [13]:
def print_embedding_cost(texts, model='text-embedding-ada-002', usd_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (for example the chunks produced by the text splitter).
        model: Model name used to select the tiktoken encoding.
        usd_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps
            the rate this notebook originally hard-coded.
            NOTE(review): verify against the provider's current pricing.

    Returns:
        None. Two summary lines are written to stdout.
    """
    import tiktoken  # kept as a local import, as in the original cell

    enc = tiktoken.encoding_for_model(model)
    # A generator is enough here; no intermediate list is needed for sum().
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')

# Report the token count and estimated embedding cost for all chunks.
print_embedding_cost(texts=chunks)

Total Tokens: 3448
Embedding Cost in USD: 0.001379


In [21]:
from langchain.embeddings import OpenAIEmbeddings
# NOTE
# 1. The API key is not passed to OpenAIEmbeddings explicitly; the class
#    picks it up from the environment.
# 2. The resulting object is used to turn text into an embedding vector
#    (see the embed_query call that follows).
embeddings = OpenAIEmbeddings()

In [22]:
# Embed a single chunk as a sanity check. Only the vector's length and its
# first few components are shown: printing the entire vector floods the
# notebook output without adding information.
vector = embeddings.embed_query(chunks[0].page_content)
print(len(vector))
print(vector[:5])

1536
[-0.011723315765591042, -0.012774645811077717, -0.011663807044897046, -0.0018745404179298824, -0.034435999888955585]

### Inserting the Embeddings into a Pinecone Index

In [17]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Initialise the Pinecone client. Both settings are read from environment
# variables so that no credential appears in the notebook itself.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENV'),
)

  from tqdm.autonotebook import tqdm


In [18]:
# Delete all existing indexes.
# WARNING: destructive — every index returned by pinecone.list_indexes() is
# removed. Do not run this cell against an account whose indexes must be kept.
indexes = pinecone.list_indexes()
for existing_index in indexes:
    # Name the index being removed so the log shows what was actually deleted.
    print(f'Deleting index {existing_index} ... ', end='')
    pinecone.delete_index(existing_index)
    print('Done')

In [20]:
# Pinecone index naming rules:
#   - only lower-case alphanumeric characters or '-'
#   - the name must start and end with an alphanumeric character
#
# Example of an INVALID name (contains an upper-case letter): 'Gandhi-speech'

index_name = 'gandhi-speech'

# Create the index only when it does not exist yet, so the cell can be re-run.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    print(f'Creating index {index_name} ...')
    pinecone.create_index(
        index_name,
        # NOTE(review): presumably the length of the vectors produced by
        # `embeddings` — confirm.
        dimension=1536,
        metric='cosine',
    )
    print('Done!')

Creating index gandhi-speech ...
Done!


- This is what the creation of the index <ins>gandhi-speech</ins> looks like:
<img src='./images/index-creation-example.png' />

In [23]:
# Upload the vectors to Pinecone through LangChain.
#   chunks      - the list of text documents obtained from
#                 RecursiveCharacterTextSplitter
#   embeddings  - converts the text into embeddings using OpenAI's
#                 embedding model
#   index_name  - name of the Pinecone index to use
vector_store = Pinecone.from_documents(
    chunks,
    embeddings,
    index_name=index_name,
)

### Asking Questions (Similarity Search)
- [Remember this](./ChatGPT-PineCone-LLMs-POC.ipynb#How-Vector-DB-works)

In [25]:
# Ask the vector store for the chunks most similar to the question. No result
# count is passed, so the library default decides how many come back.
query = 'What is the name of speaker?'
result = vector_store.similarity_search(query=query)
print(result)

[Document(page_content='Table Conference Speech (30th of November 1931)The round table conference speech was given by', metadata={}), Document(page_content='speech\xa0Mahatma Gandhi Famous SpeechMahatma Gandhi or Mohandas Karamchand Gandhi was an Indian', metadata={}), Document(page_content='1916)Dandi March Speech (11th of March 1930)Round Table Conference Speech (30th of November', metadata={}), Document(page_content='he was giving the speech.\xa0\xa0Mahatma Gandhi was sharply criticizing the overuse of the English', metadata={})]


In [27]:
# Show only the text of each hit, followed by a 50-dash separator line.
separator = '-' * 50
for doc in result:
    print(doc.page_content, separator, sep='\n')

Table Conference Speech (30th of November 1931)The round table conference speech was given by
--------------------------------------------------
speech Mahatma Gandhi Famous SpeechMahatma Gandhi or Mohandas Karamchand Gandhi was an Indian
--------------------------------------------------
1916)Dandi March Speech (11th of March 1930)Round Table Conference Speech (30th of November
--------------------------------------------------
he was giving the speech.  Mahatma Gandhi was sharply criticizing the overuse of the English
--------------------------------------------------


In [29]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model that writes the final answer.
# NOTE(review): temperature=1 makes answers vary between runs — confirm this
# is intended for question answering.
llm = ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=1,
)

# Retriever backed by the Pinecone vector store: similarity search with k=3.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# Question-answering chain that ties the chat model and the retriever together.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)



In [35]:
# Ask the retrieval chain a question about the speech and print its answer.
# (The query previously misspelled "speaker"; the text is sent to the model,
# so the spelling is corrected. The stray trailing ';' is dropped as well.)
query = 'What is the speaker saying?'
answer = chain.run(query)
print(answer)

The speaker is sharply criticizing the overuse of the English language and emphasizing the importance of non-violence.
