# Vectorstores and Embeddings


# Installing required libraries

In [1]:
# Installing required libraries
!pip install openai
!pip install cohere
!pip install langchain
!pip install chromadb
!pip install tiktoken
!pip install langchain[sentence-transformers]
!pip install sentence-transformers
!pip install pypdf
!pip install huggingface_hub
!pip install langchain[HuggingFace]

# Importing necessary modules
import os
import sys


# Importing specific components from langchain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate







In [2]:
!tar zxvf l3vs.tar.gz
!ls


./
./03_vectorstores_and_embeddings.ipynb
./docs/
./docs/cs229_lectures/
./docs/cs229_lectures/MachineLearning-Lecture01.pdf
./docs/cs229_lectures/MachineLearning-Lecture02.pdf
./docs/cs229_lectures/MachineLearning-Lecture03.pdf
./docs/chroma/
./docs/chroma/index/
./docs/chroma/index/index_b9cafc6b-acdf-4e7c-bd74-6249b3d7ec04.bin
./docs/chroma/index/id_to_uuid_b9cafc6b-acdf-4e7c-bd74-6249b3d7ec04.pkl
./docs/chroma/index/uuid_to_id_b9cafc6b-acdf-4e7c-bd74-6249b3d7ec04.pkl
./docs/chroma/index/index_metadata_b9cafc6b-acdf-4e7c-bd74-6249b3d7ec04.pkl
./docs/chroma/chroma-embeddings.parquet
./docs/chroma/chroma-collections.parquet
./.ipynb_checkpoints/
./.ipynb_checkpoints/03_vectorstores_and_embeddings-checkpoint.ipynb
03_vectorstores_and_embeddings.ipynb  docs  l3vs.tar.gz  Langchain-Tester-Resume.pdf  sample_data


In [3]:

# One loader per source PDF; add more entries here to index additional files.
loaders = [PyPDFLoader("./Langchain-Tester-Resume.pdf")]

# Flatten everything each loader returns into a single list of documents.
docs = [document for loader in loaders for document in loader.load()]

In [4]:
# Splitter that produces chunks of size 150 with an overlap of 15 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=15)

In [5]:
# Apply the splitter to the loaded documents; `splits` holds the resulting chunks.
splits = text_splitter.split_documents(docs)

In [6]:
# Number of chunks produced by the split.
len(splits)

16

## Embeddings

Using the open-source `HuggingFaceEmbeddings` wrapper with `model_name="all-MiniLM-L6-v2"`.

In [7]:
# Embedding model handed to the vector store, both when it is built and when it is reopened.
embedding_model_name = "all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)


## Vectorstores
A Chroma vector store persisted to disk. The cells below build it from the document embeddings and store it under `docs/chroma/`.

In [8]:
# Directory (relative to the working directory) that the Chroma store is created with and reopened from.
persist_directory = 'docs/chroma/'

In [9]:
# Remove old database files, if any, so the store is rebuilt from scratch.
# Deleting via `persist_directory` keeps this cell in sync with the directory the
# store is created in; `ignore_errors=True` makes a missing directory a no-op,
# matching the previous `rm -rf` behaviour.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [10]:
# Build the Chroma store from the chunks, using `embedding` to vectorise them
# and `persist_directory` as its on-disk location.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embedding,
    documents=splits,
)

In [11]:
# Count the entries in the freshly built collection.
# NOTE(review): `_collection` is underscore-prefixed, i.e. not part of the public
# interface — confirm there is no public accessor before relying on it.
print(vectordb._collection.count())

16


### Similarity Search using a question

In [12]:
# Query used for the similarity-search demonstration below.
question = "What is Langchain Tester's email?"

In [13]:
# Fetch the 3 stored chunks most similar to the question.
# NOTE(review): this rebinds `docs`, which previously held the loaded PDF pages;
# re-running the split cell after this one would split these search results instead.
docs = vectordb.similarity_search(question,k=3)

In [14]:
# Number of documents returned by the search.
len(docs)

3

In [15]:
# Text of the first returned chunk.
docs[0].page_content

'Langchain\nTester\n123\nTesting\nLane\nTest\nCity,\nTS\n56789\ntestertester@email.com\n(123)\n456-7890\nObjective:\nHighly\nskilled\nand\ndetail-oriented\nLangchain'

# Saving the vector database for future use

In [16]:
# Persist the vector store so that it can be reopened later from `persist_directory`.
vectordb.persist()

# Retrieval

# Retrieving the vector database from the saved location

In [17]:

# Reopen the store from `persist_directory`, supplying the same embedding object
# that was used when the store was built.
vectordbretrieve = Chroma(embedding_function=embedding, persist_directory=persist_directory)

In [18]:
# Entry count of the reopened collection; the recorded output (16) matches the
# count printed before persisting.
# NOTE(review): `_collection` is underscore-prefixed, i.e. not part of the public interface.
print(vectordbretrieve._collection.count())

16


## Q&A chat using the Flan-T5 model


# Question Answering

# Question Answering using the open-source google/flan-t5-small model


In [19]:
# Configure access to the Hugging Face Hub and create the LLM client.
# The token is read from the environment (or prompted for without echo) instead of
# being hard-coded, so it never ends up in the saved notebook.
# NOTE(review): the token previously committed in this cell should be treated as
# leaked and revoked.
import os
from getpass import getpass

HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass(
    "Hugging Face Hub API token: "
)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

# Model repository on the Hub and the generation settings passed through to it.
repo_id = "google/flan-t5-small"
llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0, "max_length": 64}
)







In [20]:
# Pose a new question and pull the 3 most similar chunks from the reopened store.
question = "What month and year did LangChain Tester graduate?"
docs = vectordbretrieve.similarity_search(question, k=3)

# Show how many chunks came back.
len(docs)

3

### RetrievalQA chain

In [21]:
# Build a RetrievalQA chain that pairs the Flan-T5 LLM with a retriever over the reopened store.
retriever = vectordbretrieve.as_retriever()
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

In [22]:
# Run the chain on the current `question`; the input dict uses the key "query".
result = qa_chain({"query": question})

In [23]:
# The chain's answer text is stored under the "result" key.
result["result"]

'May 2022'

### RetrievalQA chain with Prompt Template and return_source_documents

In [24]:
# Building a prompt template for question answering
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [26]:
# Rebuild the QA chain with the custom prompt and ask it to return the
# retrieved source documents alongside the answer.
chain_options = {
    "retriever": vectordbretrieve.as_retriever(),
    "return_source_documents": True,
    "chain_type_kwargs": {"prompt": QA_CHAIN_PROMPT},
}
qa_chain = RetrievalQA.from_chain_type(llm, **chain_options)

In [27]:
# Question for the prompt-template chain, which also returns its source documents.
question = "What is LangChain Tester's email address?"

In [28]:
# Run the prompt-template chain; since it was built with `return_source_documents=True`,
# the result also carries the retrieved documents under "source_documents".
result = qa_chain({"query": question})

In [29]:
# Answer text produced by the chain.
result["result"]

'123 Testing Lane Test City, TS 56789 testertester@email.com'

In [30]:
# First source document returned alongside the answer (page content plus `page`/`source` metadata).
result["source_documents"][0]

Document(page_content='Langchain\nTester\n123\nTesting\nLane\nTest\nCity,\nTS\n56789\ntestertester@email.com\n(123)\n456-7890\nObjective:\nHighly\nskilled\nand\ndetail-oriented\nLangchain', metadata={'page': 0, 'source': './Langchain-Tester-Resume.pdf'})