FAISS: short for Facebook AI Similarity Search. It is another kind of vector database: the text data is converted into vectors (embeddings) and stored so that similar text can be found by vector similarity. FAISS runs locally, so the vector store lives on this machine rather than in a hosted service.

In [None]:
# Install the packages this notebook depends on (these are shell commands,
# not Python imports). sentence-transformers is pinned to version 2.2.2.
!pip -q install langchain
!pip install pypdf
!pip install sentence-transformers==2.2.2
!pip install openai
!pip install tiktoken
!pip install faiss-cpu

In [None]:
# creating a folder through this command
!mkdir pdfs

In [None]:
# Import PyPDFDirectoryLoader from LangChain's document_loaders module.
from langchain.document_loaders import PyPDFDirectoryLoader

In [None]:
# Step 01: load the dataset from the PDFs in the "pdfs" folder.
loader = PyPDFDirectoryLoader("pdfs")
data = loader.load()
# The folder starts out empty when it is first created, so stop here with a
# clear message instead of failing obscurely in a later cell.
if not data:
    raise ValueError(
        'No documents were loaded from "pdfs"; add at least one PDF to that '
        "folder and re-run this cell."
    )

In [None]:
# Display everything the loader returned (the full loaded dataset).
data

In [None]:
# Step 02: split the loaded documents into chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure the splitter with a chunk size of 500 and an overlap of 20
# between neighbouring chunks, then apply it to the loaded documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=500,
)
text_chunks = text_splitter.split_documents(data)

In [None]:
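# Disabled alternative: extract the text of a single PDF with pdfquery.
# Note that enabling it rebinds `data` to the extracted text.
# !pip install pdfquery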
# from pdfquery import PDFQuery
# pdf = PDFQuery("/content/pdfs/yolo.pdf")
# text = pdf.pq("LTTextLineHorizontal").text()
# data = text
# print(data)

In [None]:
# Number of chunks produced by the splitter.
len(text_chunks)

In [None]:
# Inspect the first chunk. A notebook cell only echoes its last bare
# expression, so each piece is printed explicitly to make all three visible.
first_chunk = text_chunks[0]
# The chunk object as a whole.
print(first_chunk)
# The chunk's text.
print(first_chunk.page_content)
# The chunk's metadata (where the chunk comes from).
print(first_chunk.metadata)

In [None]:
# Step 03: create the embedding model that turns text chunks into vectors.
# HuggingFaceEmbeddings comes from LangChain's embeddings module; here it is
# configured with the open-source sentence-transformers model
# "all-MiniLM-L6-v2" (presumably fetched from the Hugging Face Hub on first use).
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [None]:
# Build the FAISS vector store from the text chunks, using `embeddings`
# to turn them into vectors.
from langchain.vectorstores import FAISS
vectorstore=FAISS.from_documents(text_chunks, embeddings)


In [None]:
# The vector store is ready; define a first test question to search for.
query_01 = "What is Generative AI?"

In [None]:
# Run a similarity search on the vector store for the query, asking for k=3 results.
docs = vectorstore.similarity_search(query_01, k=3)

In [None]:
# Show the text of the third result (index 2); assumes the search above
# actually returned three chunks.
docs[2].page_content

In [None]:
# Number of results returned by the search.
len(docs)

In [None]:
# Print the text of every retrieved chunk, in the order the search returned them.
for doc in docs:
    print(doc.page_content)


Next, we use an OpenAI API key and call an OpenAI model through LangChain's large language model (LLM) wrappers. LangChain's core idea is chaining components together; here, those components are a retriever over the vector store and an LLM.

In [None]:
# Read the OpenAI API key stored under the name 'OPENAI_API_KEY' in Colab's
# userdata, so the key is not hard-coded in the notebook.
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [None]:
# Expose the key as the OPENAI_API_KEY environment variable
# (presumably where the OpenAI client created below looks for it — confirm).
import os
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Create the LLM through LangChain's OpenAI wrapper, using its default settings
# (no arguments are passed).
from langchain.llms import OpenAI
llm=OpenAI()

In [None]:
# RetrievalQA builds a question-answering chain: it is given the LLM and a
# retriever over the whole vector store (see how `qa` is constructed below).
from langchain.chains import RetrievalQA

In [None]:
# Disabled duplicate of the earlier cell that builds the FAISS vector store;
# the existing `vectorstore` is reused, so this does not need to run again.
# from langchain.vectorstores import FAISS
# vectorstore=FAISS.from_documents(text_chunks, embeddings)

In [None]:
# Build the question-answering chain: the FAISS store is exposed as a retriever,
# and the "stuff" chain type passes the retrieved chunks to the LLM in one prompt
# (per LangChain's description of "stuff" — confirm for the installed version).
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
    )


In [None]:
# Question to put to the retrieval QA chain.
query_02 = "What is Generative Ai?"

In [None]:
# Run the chain on the question and print the answer it returns.
print(qa.run(query_02))