# Environment Setup

In [1]:
import os
from dotenv import load_dotenv 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_cohere import CohereEmbeddings
from langchain_community.llms import Cohere
from langchain.chains import RetrievalQA
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_models import ChatCohere
from langchain_core.prompts import ChatPromptTemplate
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone


  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [2]:
load_dotenv()

True

## read file


# Document Loading

In [3]:
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [4]:
docs=read_doc('documents/')
docs

ImportError: `pypdf` package not found, please install it with `pip install pypdf`

In [None]:
def chunk_data(docs,chunk_size=100,chunk_overlap=20):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return doc

# Splitting Text into Chunks

In [None]:
documents = chunk_data(docs=docs)
documents

[Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-13T13:17:53+01:00', 'author': 'Anthony Orji', 'moddate': '2025-02-13T13:17:53+01:00', 'source': 'documents\\To the Love of my life.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='To the Love of my life,  \nthe one whose smile lightens up my mood, whose voice, so calm and tender,'),
 Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-13T13:17:53+01:00', 'author': 'Anthony Orji', 'moddate': '2025-02-13T13:17:53+01:00', 'source': 'documents\\To the Love of my life.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='makes me feel like a child in the embrace of her father. I can’t completely'),
 Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-13T13:17:53+01:00', 'author': 'Anthony Orji', 'moddate': '2025-02-13T

In [None]:
embeddings=CohereEmbeddings(model="embed-english-light-v3.0",
                            cohere_api_key=os.environ['COHERE_API_KEY'])
embeddings

CohereEmbeddings(client=<cohere.client.Client object at 0x000002066E1818C0>, async_client=<cohere.client.AsyncClient object at 0x000002067D002580>, model='embed-english-light-v3.0', truncate=None, cohere_api_key=SecretStr('**********'), embedding_types=['float'], max_retries=3, request_timeout=None, user_agent='langchain:partner', base_url=None)

In [None]:
vectors=embeddings.embed_query("what does the writer love about the recipient?")

# Embeddings and Vector Store

In [None]:
len(vectors)

384

In [None]:


pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

index_name = "pineconedemo"

if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"llama-text-embed-v2",
            "field_map":{"text": "chunk_text"}
        }
    )

### cosine similarity to retrieve result



# Querying the Vector Database

In [None]:
#from langchain.chains.question_answering import load_qa_chain


In [None]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
        documents,
        index_name=index_name,
        embedding=embeddings
    )
retriever = vectorstore_from_docs.as_retriever()

In [None]:


llm = ChatCohere(model="command-r", cohere_api_key=os.environ['COHERE_API_KEY'])

#llm = Cohere(model="gptd-instruct-tft", cohere_api_key=os.environ['COHERE_API_KEY'])

In [None]:
system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise.\n\n"
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)


In [None]:
query = "what does the writer love about the recipient?"
response = chain.invoke({"input": query})
print(response)

{'input': 'what does the writer love about the recipient?', 'context': [Document(id='0be45e05-f61e-4ff4-9e04-617213e17aac', metadata={'author': 'Anthony Orji', 'creationdate': '2025-02-13T13:17:53+01:00', 'creator': 'Microsoft® Word LTSC', 'moddate': '2025-02-13T13:17:53+01:00', 'page': 1.0, 'page_label': '2', 'producer': 'Microsoft® Word LTSC', 'source': 'documents\\To the Love of my life.pdf', 'total_pages': 4.0}, page_content='Now I know what it feels to have someone genuinely care so much about'), Document(id='0fcd0f0f-3373-45a4-83e0-b9f11fdc5e53', metadata={'author': 'Anthony Orji', 'creationdate': '2025-02-13T13:17:53+01:00', 'creator': 'Microsoft® Word LTSC', 'moddate': '2025-02-13T13:17:53+01:00', 'page': 1.0, 'page_label': '2', 'producer': 'Microsoft® Word LTSC', 'source': 'documents\\To the Love of my life.pdf', 'total_pages': 4.0}, page_content='Now I know what it feels to have someone genuinely care so much about'), Document(id='ae44a80d-807e-46ed-913c-5c280495bfd5', metada