### Pinecone

In [36]:
# importing warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# installing libraries through pip
! pip install langchain
! pip install pinecone-client==2.2.4
! pip install pypdf
! pip install sentence-transformers==2.2.2

# pip install langchain pinecone-client==2.2.4 pypdf sentence-transformers==2.2.2

In [2]:
# extract the text from the PDF files
# for that we first need to import the directory loader
from langchain.document_loaders import PyPDFDirectoryLoader

Step-01 : Loading the data

In [11]:
# Load the PDF files found in the local "pdfs" directory; `data` holds the
# documents returned by the loader.
loader = PyPDFDirectoryLoader("pdfs")
data = loader.load()

# Stop here if nothing was loaded (e.g. wrong working directory or an empty
# folder) instead of letting the later cells run on an empty dataset.
assert data, 'No documents were loaded from the "pdfs" directory'

In [None]:
# Report how many documents were loaded and preview only the first one;
# printing the whole list would dump the full text of every loaded document.
print(f"Loaded {len(data)} document(s)")
data[:1]

In [13]:
# importing RecursiveCharacterTextSplitter from langchain.text_splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


Step-02 : Splitting and Chunking the dataset

In [25]:
# Create the splitter object: chunk_size=500 with chunk_overlap=20, and the
# separator list going from paragraph breaks down to the empty string.
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    separators=["\n\n", "\n", " ", ""],
)

# Split the loaded documents into chunks with the splitter created above
text_chunks = recursive_text_splitter.split_documents(documents=data)

In [26]:
# Number of chunks produced by split_documents(); shown as the cell output
len(text_chunks)

29

In [None]:
# Inspect the first chunk: its text, its metadata, and the object itself.
# A notebook cell only renders its last bare expression, so the text and the
# metadata are printed explicitly; the chunk itself is left as the cell output.
print(text_chunks[0].page_content)
print(text_chunks[0].metadata)
text_chunks[0]

Step-03 : Embeddings of the dataset

In [34]:
# importing HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

In [37]:
# Create the embeddings object backed by the
# "sentence-transformers/all-MiniLM-L6-v2" model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [43]:
# Embed a sample sentence with embed_query() and report the size of the
# returned vector. The size (384 in the recorded output) comes from the
# all-MiniLM-L6-v2 model selected above, not from the HuggingFaceEmbeddings class.
# NOTE(review): the Pinecone index presumably has to be created with this same
# dimension; verify in the Pinecone console.
query_result = embeddings.embed_query('This is Mehedi')

# Despite the wording of the message, the number printed is the length of the
# embedding vector, not the length of the input text.
print('The length of our text is :',len(query_result))

The length of our text is : 384


In [71]:
# os is used in the next cell to read the API keys from the environment.
# load_dotenv() is called so that the keys stored in the .env file can be read;
# its return value is the cell output, so it must stay the last line.
import os
from dotenv import load_dotenv
load_dotenv()

True

In [73]:
# Pull the service credentials out of the process environment.
# A variable that is not set comes back as None.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

Step-04 : Creating the vector database using a Pinecone index and API key

In [49]:
# Import the Pinecone client
import pinecone

# Initialise the client with the credentials read from the environment
pinecone.init(
    environment=PINECONE_API_ENV,  # shown next to the API key in the console
    api_key=PINECONE_API_KEY,      # created at https://app.pinecone.io
)

# Name of the index that was created on the Pinecone website
index_name = "vectordb"


In [50]:
# importing Pinecone from langchain.vectorstores
from langchain.vectorstores import Pinecone

In [51]:
# Embed every chunk produced by RecursiveCharacterTextSplitter with the
# HuggingFaceEmbeddings object ('sentence-transformers/all-MiniLM-L6-v2') and
# store the vectors in the Pinecone index named by index_name (created on the
# Pinecone website).
# from_documents() is used instead of from_texts() on the bare page_content so
# that each chunk's metadata is stored with its vector and is returned with
# the search results.
# NOTE(review): re-running this cell presumably uploads the same chunks again;
# confirm before executing it more than once against the same index.
docsearch = Pinecone.from_documents(text_chunks, embeddings, index_name=index_name)

In [None]:
# When the index already exists, attach to it instead of uploading the chunks;
# the resulting vector store object is shown as the cell output.
docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
docsearch

Now our Pinecone vector database is ready to use. Let's query it with a similarity search.

In [53]:
# Question used for the similarity search against the vector database
query_01 = "What is yolo?"

In [65]:
# Run similarity_search() on docsearch, the vector store backed by Pinecone.
# k is the number of similar results requested (2 here); it can be set to any
# value we want.
docs_01 = docsearch.similarity_search(
    query_01,
    k=2,
)

In [None]:
# Print each document returned by the similarity search (requested with k=2)
for doc in docs_01:
    print(doc)

## Through OpenAI : 
Now we are going to query this vector database through an OpenAI LLM, using the OPENAI_API_KEY.

In [None]:
# installing openai
! pip install openai

In [74]:
# Export the OpenAI key into the process environment.
# OPENAI_API_KEY is None when the variable was not found; assigning None to
# os.environ raises an unhelpful TypeError, so fail with a clear message instead.
if not OPENAI_API_KEY:
    raise ValueError(
        "OPENAI_API_KEY is not set; add it to the .env file before running the OpenAI section"
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [75]:
# importing OpenAI library from langchain.llms
from langchain.llms import OpenAI

In [76]:
# Instantiate the OpenAI LLM wrapper with its default arguments
llm = OpenAI()

In [78]:
# importing RetrievalQA() class from langchain.chains
from langchain.chains import RetrievalQA

In [83]:
# Build the question-answering chain with RetrievalQA.from_chain_type().
# The retriever comes from calling as_retriever() on docsearch, which gives the
# chain access to the vector database created through Pinecone; the LLM is the
# OpenAI object created above and the chain type is "stuff".
qa = RetrievalQA.from_chain_type(
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [87]:
# The question we want the retrieval QA chain to answer
query_02 = "What is yolo?"

In [None]:
# Run the RetrievalQA chain on query_02; the returned value is shown as the cell output
qa.invoke(query_02)